You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
331 lines
9.1 KiB
331 lines
9.1 KiB
//
|
|
// Metaphone.cc
|
|
//
|
|
// Metaphone: A fuzzy matching algorithm used to match words that
|
|
// sound alike in the English language. Probably not so
|
|
// good for foreign languages.
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 1995-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: Metaphone.cc,v 1.12 2004/05/28 13:15:20 lha Exp $
|
|
//
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "htconfig.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
#include <fcntl.h>
|
|
|
|
#include "Metaphone.h"
|
|
#include "Dictionary.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
|
|
//*****************************************************************************
|
|
// Metaphone::Metaphone(const HtConfiguration& config_arg)
|
|
//
|
|
Metaphone::Metaphone(const HtConfiguration& config_arg) :
|
|
Fuzzy(config_arg)
|
|
{
|
|
name = "metaphone";
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// Metaphone::~Metaphone()
|
|
//
|
|
Metaphone::~Metaphone()
|
|
{
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void Metaphone::generateKey(char *word, String &key)
|
|
//
|
|
/*
|
|
* This code was copied from the slapd package developed at umich.
|
|
* It was debugged and cleaned up in February 1999 by Geoffrey Hutchison
|
|
* for the ht://Dig Project.
|
|
*/
|
|
/*
|
|
* Metaphone copied from C Gazette, June/July 1991, pp 56-57,
|
|
* author Gary A. Parker, with changes by Bernard Tiffany of the
|
|
* University of Michigan, and more changes by Tim Howes of the
|
|
* University of Michigan.
|
|
*/
|
|
|
|
/*
 * Character coding array: one bit mask per upper-case letter 'A'..'Z'.
 *
 *    1  vowel                 (A E I O U)
 *    2  copied through as-is  (F J L M N R)
 *    4  "varson"              (C G P S T)
 *    8  front vowel           (E I Y)
 *   16  "noghf"               (B D H)
 */
static const char vsvfn[26] = {
    /* A */  1, /* B */ 16, /* C */  4, /* D */ 16, /* E */  9,
    /* F */  2, /* G */  4, /* H */ 16, /* I */  9, /* J */  2,
    /* K */  0, /* L */  2, /* M */  2, /* N */  2, /* O */  1,
    /* P */  4, /* Q */  0, /* R */  2, /* S */  4, /* T */  4,
    /* U */  1, /* V */  0, /* W */  0, /* X */  0, /* Y */  8,
    /* Z */  0
};

/*
 * Macros to access the character coding array.  vscode() yields 0 for
 * anything outside 'A'..'Z' (including the NUL padding/terminator), so
 * every class test below is false for such characters.
 */
#define vscode(x)   (((x) < 'A' || (x) > 'Z') ? 0 : vsvfn[(x) - 'A'])
#define vowel(x)    ((vscode(x) &  1) != 0)     /* AEIOU */
#define same(x)     ((vscode(x) &  2) != 0)     /* FJLMNR */
#define varson(x)   ((vscode(x) &  4) != 0)     /* CGPST */
#define frontv(x)   ((vscode(x) &  8) != 0)     /* EIY */
#define noghf(x)    ((vscode(x) & 16) != 0)     /* BDH */

/* Key generation stops once the key has reached this many characters */
#define MAXPHONEMELEN   6
|
|
|
|
void
|
|
Metaphone::generateKey(char *word, String &key)
|
|
{
|
|
if (!word || !*word)
|
|
return;
|
|
|
|
char *n;
|
|
String ntrans;
|
|
|
|
/*
|
|
* Copy Word to internal buffer, dropping non-alphabetic characters
|
|
* and converting to upper case
|
|
*/
|
|
|
|
ntrans << "0000";
|
|
|
|
for (; *word; word++)
|
|
{
|
|
if (isalpha(*word))
|
|
ntrans << *word;
|
|
}
|
|
ntrans.uppercase();
|
|
|
|
/* ntrans[0] will always be == 0 */
|
|
n = ntrans.get();
|
|
*n++ = 0;
|
|
*n++ = 0;
|
|
*n++ = 0;
|
|
*n = 0; /* Pad with nulls */
|
|
n = ntrans.get() + 4; /* Assign pointer to start */
|
|
|
|
/* Check for PN, KN, GN, AE, WR, WH, and X at start */
|
|
switch (*n)
|
|
{
|
|
case 'P':
|
|
case 'K':
|
|
case 'G':
|
|
/* 'PN', 'KN', 'GN' becomes 'N' */
|
|
if (*(n + 1) == 'N')
|
|
*n++ = 0;
|
|
break;
|
|
case 'A':
|
|
/* 'AE' becomes 'E' */
|
|
if (*(n + 1) == 'E')
|
|
*n++ = 0;
|
|
break;
|
|
case 'W':
|
|
/* 'WR' becomes 'R', and 'WH' to 'W' */
|
|
if (*(n + 1) == 'R')
|
|
*n++ = 0;
|
|
else if (*(n + 1) == 'H') {
|
|
*(n + 1) = *n;
|
|
*n++ = 0;
|
|
}
|
|
break;
|
|
case 'X':
|
|
/* 'X' becomes 'S' */
|
|
*n = 'S';
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Now, loop step through string, stopping at end of string or when
|
|
* the computed 'metaph' is MAXPHONEMELEN characters long
|
|
*/
|
|
|
|
for (; *n && key.length() < MAXPHONEMELEN; n++)
|
|
{
|
|
/* Drop duplicates except for CC */
|
|
if (*(n - 1) == *n && *n != 'C')
|
|
continue;
|
|
/* Check for F J L M N R or first letter vowel */
|
|
if (same(*n) || *(n - 1) == '\0' && vowel(*n))
|
|
key << *n;
|
|
else
|
|
{
|
|
switch (*n)
|
|
{
|
|
case 'B':
|
|
/*
|
|
* B unless in -MB
|
|
*/
|
|
if (*(n + 1) || *(n - 1) != 'M')
|
|
key << *n;
|
|
break;
|
|
case 'C':
|
|
/*
|
|
* X if in -CIA-, -CH- else S if in
|
|
* -CI-, -CE-, -CY- else dropped if
|
|
* in -SCI-, -SCE-, -SCY- else K
|
|
*/
|
|
if (*(n - 1) != 'S' || !frontv(*(n + 1)))
|
|
{
|
|
if (*(n + 1) == 'I' && *(n + 2) == 'A')
|
|
key << 'X';
|
|
else if (frontv(*(n + 1)))
|
|
key << 'S';
|
|
else if (*(n + 1) == 'H')
|
|
key << (((*(n - 1) == '\0' && !vowel(*(n + 2)))
|
|
|| *(n - 1) == 'S')
|
|
? 'K' : 'X');
|
|
else
|
|
key << 'K';
|
|
}
|
|
break;
|
|
case 'D':
|
|
/*
|
|
* J if in DGE or DGI or DGY else T
|
|
*/
|
|
key << ((*(n + 1) == 'G' && frontv(*(n + 2)))
|
|
? (char) 'J' : (char) 'T');
|
|
break;
|
|
case 'G':
|
|
/*
|
|
* F if in -GH and not B--GH, D--GH,
|
|
* -H--GH, -H---GH else dropped if
|
|
* -GNED, -GN, -DGE-, -DGI-, -DGY-
|
|
* else J if in -GE-, -GI-, -GY- and
|
|
* not GG else K
|
|
*
|
|
*/
|
|
if ((*(n + 1) != 'G' || vowel(*(n + 2))) &&
|
|
(*(n + 1) != 'N' || (*(n + 1) &&
|
|
(*(n + 2) != 'E' ||
|
|
*(n + 3) != 'D'))) &&
|
|
(*(n - 1) != 'D' || !frontv(*(n + 1))))
|
|
if (frontv(*(n + 1)) && *(n + 2) != 'G')
|
|
key << 'J';
|
|
else
|
|
key << 'K';
|
|
else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
|
|
*(n - 4) != 'H')
|
|
key << 'F';
|
|
break;
|
|
case 'H':
|
|
/*
|
|
* H if before a vowel and not after
|
|
* C, G, P, S, T else dropped
|
|
*/
|
|
if (!varson(*(n - 1)) && (!vowel(*(n - 1
|
|
)) ||
|
|
vowel(*(n + 1))))
|
|
key << 'H';
|
|
break;
|
|
case 'K':
|
|
/*
|
|
* dropped if after C else K
|
|
*/
|
|
if (*(n - 1) != 'C')
|
|
key << 'K';
|
|
break;
|
|
case 'P':
|
|
/*
|
|
* F if before H, else P
|
|
*/
|
|
key << (*(n + 1) == 'H' ?
|
|
(char) 'F' : (char) 'P');
|
|
break;
|
|
case 'Q':
|
|
/*
|
|
* K
|
|
*/
|
|
key << 'K';
|
|
break;
|
|
case 'S':
|
|
/*
|
|
* X in -SH-, -SIO- or -SIA- else S
|
|
*/
|
|
key << ((*(n + 1) == 'H' ||
|
|
(*(n + 1) == 'I' && (*(n + 2) == 'O' ||
|
|
*(n + 2) == 'A')))
|
|
? (char) 'X' : (char) 'S');
|
|
break;
|
|
case 'T':
|
|
/*
|
|
* X in -TIA- or -TIO- else 0 (zero)
|
|
* before H else dropped if in -TCH-
|
|
* else T
|
|
*/
|
|
if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
|
|
*(n + 2) == 'A'))
|
|
key << 'X';
|
|
else if (*(n + 1) == 'H')
|
|
key << '0';
|
|
else if (*(n + 1) != 'C' || *(n + 2) != 'H')
|
|
key << 'T';
|
|
break;
|
|
case 'V':
|
|
/*
|
|
* F
|
|
*/
|
|
key << 'F';
|
|
break;
|
|
case 'W':
|
|
/*
|
|
* W after a vowel, else dropped
|
|
*/
|
|
case 'Y':
|
|
/*
|
|
* Y unless followed by a vowel
|
|
*/
|
|
if (vowel(*(n + 1)))
|
|
key << *n;
|
|
break;
|
|
case 'X':
|
|
/*
|
|
* KS
|
|
*/
|
|
if (*(n - 1) == '\0')
|
|
key << 'S';
|
|
else
|
|
key << "KS"; /* Insert K, then S */
|
|
break;
|
|
case 'Z':
|
|
/*
|
|
* S
|
|
*/
|
|
key << 'S';
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void Metaphone::addWord(char *word)
|
|
//
|
|
void
|
|
Metaphone::addWord(char *word)
|
|
{
|
|
if (!dict)
|
|
{
|
|
dict = new Dictionary;
|
|
}
|
|
|
|
String key;
|
|
generateKey(word, key);
|
|
|
|
if (key.length() == 0)
|
|
return;
|
|
String *s = (String *) dict->Find(key);
|
|
if (s)
|
|
{
|
|
// if (mystrcasestr(s->get(), word) != 0)
|
|
(*s) << ' ' << word;
|
|
}
|
|
else
|
|
{
|
|
dict->Add(key, new String(word));
|
|
}
|
|
}
|