You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

266 lines
6.5 KiB

//----------------------------------------------------------------
//
// libhtdig_htfuzzy.cc
//
// 1/25/2002 created from htfuzzy.cc
//
// Neal Richter nealr@rightnow.com
//
// libhtdig_htfuzzy.cc
//
// htfuzzy: Create one or more ``fuzzy'' indexes into the main word database.
// These indexes can be used by htsearch to perform a search that uses
// other algorithms than exact word match.
//
// This program is meant to be run after htmerge has created the word
// database.
//
// For each fuzzy algorithm, there will be a separate database. Each
// database is simply a mapping from the fuzzy key to a list of words
// in the main word database.
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: libhtdig_htfuzzy.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
//
//----------------------------------------------------------------
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
extern "C"
{
#include "libhtdig_api.h"
}
#include "libhtdig_log.h"
//#include "htfuzzy.h" //NOT USED
#include "Fuzzy.h"
#include "Accents.h"
#include "Soundex.h"
#include "Endings.h"
#include "Metaphone.h"
#include "Synonym.h"
#include "htString.h"
#include "List.h"
#include "Dictionary.h"
#include "defaults.h"
#include "HtWordList.h"
#include "WordContext.h"
// If we have this, we probably want it.
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif
#include "HtConfiguration.h"
#include "HtWordList.h"
#include <stdlib.h>
#ifndef _WIN32
#include <unistd.h>
#endif
#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */
#include <stdio.h>
extern int debug;
static HtConfiguration * config = NULL;
//*****************************************************************************
// int htfuzzy_index(htfuzzy_parameters_struct * htfuzzy_parms)
//
//   Library entry point that replaces the main() of the stand-alone htfuzzy
//   program.  Builds the fuzzy index databases selected by the caller.
//
//   Fields of htfuzzy_parms that are read here:
//     debug           - copied into the global `debug'; when non-zero the
//                       log file is opened on entry and closed on exit
//     logFile         - name handed to logOpen()
//     configFile      - configuration file to load
//     algorithms_flag - bitwise OR of HTDIG_ALG_* values; every flag that
//                       is set adds the corresponding algorithm
//
//   Soundex, Metaphone and Accents are fed every word of the "word_db"
//   database and then written out with writeDB().  Endings and Synonym
//   do not use the word database; they are built through createDB().
//
//   The return statement always yields 0; problems are reported through
//   reportError(), logEntry() or stderr.
//
int htfuzzy_index(htfuzzy_parameters_struct * htfuzzy_parms)
{
    String configFile = DEFAULT_CONFIG_FILE;
    int ret = 0;

    //
    // Take the settings from the parameter block
    //
    debug = htfuzzy_parms->debug;
    if (debug != 0)
    {
        ret = logOpen(htfuzzy_parms->logFile);
        if (ret == FALSE)
        {
            fprintf(stderr, "htfuzzy: Error opening file [%s]. Error:[%d], %s\n",
                    htfuzzy_parms->logFile, errno, strerror(errno));
        }
    }
    configFile = htfuzzy_parms->configFile;
    config = HtConfiguration::config();

    //
    // Determine what algorithms to use.
    // algorithms_flag is a bit mask, so each bit is tested on its own;
    // an else-if chain would honour only the first flag that is set.
    //
    List wordAlgorithms;        // algorithms fed from the word database
    List noWordAlgorithms;      // algorithms built through createDB()

    if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_SOUNDEX)
    {
        wordAlgorithms.Add(new Soundex(*config));
    }
    if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_METAPHONE)
    {
        wordAlgorithms.Add(new Metaphone(*config));
    }
    if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_ACCENTS)
    {
        wordAlgorithms.Add(new Accents(*config));
    }
    if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_ENDINGS)
    {
        noWordAlgorithms.Add(new Endings(*config));
    }
    if (htfuzzy_parms->algorithms_flag & HTDIG_ALG_SYNONYMS)
    {
        noWordAlgorithms.Add(new Synonym(*config));
    }

    if (wordAlgorithms.Count() == 0 && noWordAlgorithms.Count() == 0)
    {
        logEntry(form("htfuzzy: No algorithms specified\n"));
    }

    //
    // Find and parse the configuration file.
    //
    config->Defaults(&defaults[0]);
    if (access((char *) configFile, R_OK) < 0)
    {
        reportError(form("[HTFUZZY] Unable to find configuration file '%s'", configFile.get()));
    }
    config->Read(configFile);

    // Initialize htword library (key description + wordtype...)
    WordContext::Initialize(*config);

    Fuzzy *fuzzy = 0;

    if (wordAlgorithms.Count() > 0)
    {
        //
        // Open the word database so that we can grab the words from it.
        //
        HtWordList worddb(*config);
        if (worddb.Open(config->Find("word_db"), O_RDONLY) == OK)
        {
            //
            // Go through all the words in the database and hand each one
            // to every selected word-based algorithm.
            //
            List *words = worddb.Words();
            String *key;
            String word;
            int count = 0;

            words->Start_Get();
            while ((key = (String *) words->Get_Next()))
            {
                word = *key;
                wordAlgorithms.Start_Get();
                while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next()))
                {
                    fuzzy->addWord(word);
                }
                count++;
            }

            if (debug)
            {
                logEntry(form("htfuzzy: total words: %d\n", count));
                logEntry(form("htfuzzy: Writing index files...\n"));
            }

            //
            // All the information is now in memory.
            // Write all of it out to the individual databases
            //
            wordAlgorithms.Start_Get();
            while ((fuzzy = (Fuzzy *) wordAlgorithms.Get_Next()))
            {
                fuzzy->writeDB();
            }

            worddb.Close();

            // The word list returned by Words() is released here.
            words->Destroy();
            delete words;
        }
        else
        {
            reportError(form("[htfuzzy] Unable to open word database %s", config->Find("word_db").get()));
        }
    }

    if (noWordAlgorithms.Count() > 0)
    {
        noWordAlgorithms.Start_Get();
        while ((fuzzy = (Fuzzy *) noWordAlgorithms.Get_Next()))
        {
            if (debug)
            {
                logEntry(form( "htfuzzy: Selected algorithm: %s\n", fuzzy->getName()));
            }
            if (fuzzy->createDB(*config) == NOTOK)
            {
                logEntry(form("htfuzzy: Could not create database for algorithm: %s\n", fuzzy->getName()));
            }
        }
    }

    if (debug)
    {
        logEntry("htfuzzy: Done.\n");
    }

    if (debug != 0)
    {
        ret = logClose();
        if (ret == FALSE)
        {
            fprintf(stderr, "htfuzzy: Error closing file [%s]. Error:[%d], %s\n",
                    htfuzzy_parms->logFile, errno, strerror(errno));
        }
    }

    // NOTE(review): config was obtained from HtConfiguration::config();
    // confirm that destroying it here is what that accessor expects,
    // especially if this entry point can be called more than once.
    delete config;

    return 0;
}