//
// htfuzzy.cc
//
// htfuzzy: Create one or more ``fuzzy'' indexes into the main word database.
// These indexes can be used by htsearch to perform a search that uses
// other algorithms than exact word match.
//
// This program is meant to be run after htmerge has created the word
// database.
//
// For each fuzzy algorithm, there will be a separate database. Each
// database is simply a mapping from the fuzzy key to a list of words
// in the main word database.
//
// Part of the ht://Dig package
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
//
//
// $Id: htfuzzy.cc,v 1.20 2004/05/28 13:15:20 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include "htfuzzy.h"
#include "Fuzzy.h"
#include "Accents.h"
#include "Soundex.h"
#include "Endings.h"
#include "Metaphone.h"
#include "Synonym.h"
#include "htString.h"
#include "List.h"
#include "Dictionary.h"
#include "defaults.h"
#include "HtWordList.h"
#include "WordContext.h"
// If a getopt header is available (system or local copy), include it so that getopt() is declared.
#ifdef HAVE_GETOPT_H
#include
#elif HAVE_GETOPT_LOCAL
#include
#endif
// Verbosity level. Starts at 0 and is incremented once for every -v option
// seen on the command line; any non-zero value enables progress output.
int debug = 0;
// Forward declaration: prints the usage message and exits (defined after main).
void usage();
//*****************************************************************************
// int main(int ac, char **av)
//
int
main(int ac, char **av)
{
int c, i;
extern char *optarg;
extern int optind;
String configFile = DEFAULT_CONFIG_FILE;
//
// Parse command line arguments
//
while ((c = getopt(ac, av, "c:v")) != -1)
{
switch (c)
{
case 'c':
configFile = optarg;
break;
case 'v':
debug++;
break;
default:
usage();
}
}
HtConfiguration* config= HtConfiguration::config();
//
// Determine what algorithms to use
//
List wordAlgorithms;
List noWordAlgorithms;
for (i = optind; i < ac; i++)
{
if (mystrcasecmp(av[i], "soundex") == 0)
{
wordAlgorithms.Add(new Soundex(*config));
}
else if (mystrcasecmp(av[i], "metaphone") == 0)
{
wordAlgorithms.Add(new Metaphone(*config));
}
else if (mystrcasecmp(av[i], "accents") == 0)
{
wordAlgorithms.Add(new Accents(*config));
}
else if (mystrcasecmp(av[i], "endings") == 0)
{
noWordAlgorithms.Add(new Endings(*config));
}
else if (mystrcasecmp(av[i], "synonyms") == 0)
{
noWordAlgorithms.Add(new Synonym(*config));
}
else
{
reportError(form("'%s' is not a supported algorithm",
av[i]));
}
}
if (wordAlgorithms.Count() == 0 && noWordAlgorithms.Count() == 0)
{
cout << "htfuzzy: No algorithms specified\n";
usage();
}
//
// Find and parse the configuration file.
//
config->Defaults(&defaults[0]);
if (access((char*)configFile, R_OK) < 0)
{
reportError(form("Unable to find configuration file '%s'",
configFile.get()));
}
config->Read(configFile);
// Initialize htword library (key description + wordtype...)
WordContext::Initialize(*config);
Fuzzy *fuzzy;
if (wordAlgorithms.Count() > 0)
{
//
// Open the word database so that we can grab the words from it.
//
HtWordList worddb(*config);
if (worddb.Open(config->Find("word_db"), O_RDONLY) == OK)
{
//
// Go through all the words in the database
//
List *words = worddb.Words();
String *key;
Fuzzy *fuzzy = 0;
String word, fuzzyKey;
int count = 0;
words->Start_Get();
while ((key = (String *) words->Get_Next()))
{
word = *key;
wordAlgorithms.Start_Get();
while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next()))
{
fuzzy->addWord(word);
}
count++;
if ((count % 100) == 0 && debug)
{
cout << "htfuzzy: words: " << count << '\n';
cout.flush();
}
}
if (debug)
{
cout << "htfuzzy: total words: " << count << "\n";
cout << "htfuzzy: Writing index files...\n";
}
//
// All the information is now in memory.
// Write all of it out to the individual databases
//
wordAlgorithms.Start_Get();
while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next()))
{
fuzzy->writeDB();
}
worddb.Close();
words->Destroy();
delete words;
if (fuzzy)
delete fuzzy;
}
else
{
reportError(form("Unable to open word database %s", config->Find("word_db").get()));
}
}
if (noWordAlgorithms.Count() > 0)
{
noWordAlgorithms.Start_Get();
while ((fuzzy = (Fuzzy *) noWordAlgorithms.Get_Next()))
{
if (debug)
{
cout << "htfuzzy: Selected algorithm: " << fuzzy->getName()
<< endl;
}
if (fuzzy->createDB(*config) == NOTOK)
{
cout << "htfuzzy: Could not create database for algorithm: "
<< fuzzy->getName() << endl;
}
}
}
if (debug)
{
cout << "htfuzzy: Done.\n";
}
return 0;
}
//*****************************************************************************
// void usage()
//   Write the command synopsis, the ht://Dig version, the list of supported
//   algorithms and the option descriptions to standard output, then
//   terminate the program with exit status 0.  Never returns.
//
//   NOTE(review): main() calls this for an unknown option and when no
//   algorithm is given, so those error cases also exit with status 0.
//
void
usage()
{
    cout << "usage: htfuzzy [-c configfile][-v] algorithm ...\n"
         << "This program is part of ht://Dig " << VERSION << "\n\n"
         << "Supported algorithms:\n"
         << "\tsoundex\n"
         << "\tmetaphone\n"
         << "\taccents\n"
         << "\tendings\n"
         << "\tsynonyms\n"
         << "\n"
         << "Options:\n"
         << "\t-c configfile\n"
         << "\t\tUse the specified configuration file instead of the\n"
         << "\t\tdefault.\n\n"
         << "\t-v\tVerbose mode. This increases the verbosity of the\n"
         << "\t\tprogram. Using more than 2 is probably only useful\n"
         << "\t\tfor debugging purposes.\n\n";
    exit(0);
}
//*****************************************************************************
// void reportError(char *msg)
//   Print `msg' on standard output, prefixed with "htfuzzy: " and followed
//   by a blank line, then terminate the program with exit status 1.
//   Never returns.
//
void
reportError(char *msg)
{
    cout << "htfuzzy: ";
    cout << msg;
    cout << "\n\n";
    exit(1);
}