// // Synonym.cc // // Synonym: A fuzzy matching algorithm to create a database of related words // (or misspellings) that should be searched together. // // Part of the ht://Dig package // Copyright (c) 1995-2004 The ht://Dig Group // For copyright details, see the file COPYING in your distribution // or the GNU Library General Public License (LGPL) version 2 or later // // // $Id: Synonym.cc,v 1.16 2004/05/28 13:15:20 lha Exp $ // #ifdef HAVE_CONFIG_H #include "htconfig.h" #endif /* HAVE_CONFIG_H */ #include #include "Synonym.h" #include "htfuzzy.h" #include "List.h" #include "StringList.h" #include "HtConfiguration.h" #include "filecopy.h" #include #include #include #ifdef HAVE_STD #include #ifdef HAVE_NAMESPACES using namespace std; #endif #else #include #endif /* HAVE_STD */ //***************************************************************************** Synonym::Synonym(const HtConfiguration& config_arg) : Fuzzy(config_arg) { name = "synonyms"; db = 0; } //***************************************************************************** Synonym::~Synonym() { if (db) { db->Close(); delete db; db = 0; } } //***************************************************************************** int Synonym::createDB(const HtConfiguration &config) { String tmpdir = getenv("TMPDIR"); String dbFile; #if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 int ret = -1; char * source = NULL; char * dest = NULL; #endif if (tmpdir.length()) dbFile = tmpdir; else dbFile = "/tmp"; dbFile << "/synonyms.db"; char input[1000]; FILE *fl; const String sourceFile = config["synonym_dictionary"]; fl = fopen(sourceFile, "r"); if (fl == NULL) { cout << "htfuzzy/synonyms: unable to open " << sourceFile << endl; cout << "htfuzzy/synonyms: Use the 'synonym_dictionary' attribute\n"; cout << "htfuzzy/synonyms: to specify the file that contains the synonyms\n"; return NOTOK; } Database *db = Database::getDatabaseInstance(DB_HASH); if (db->OpenReadWrite(dbFile.get(), 0664) == NOTOK) { delete db; db = 0; return NOTOK; } String data; String word; int count = 0; while (fgets(input, sizeof(input), fl)) { StringList sl(input, " \t\r\n"); if (sl.Count() < 2) { // Avoid segfault caused by calling Database::Put() if (debug) // with negative length for data field { cout<<"htfuzzy/synonyms: Rejected line with less than 2 words: " << input << endl; cout.flush(); } continue; } for (int i = 0; i < sl.Count(); i++) { data = 0; for (int j = 0; j < sl.Count(); j++) { if (i != j) data << sl[j] << ' '; } word = sl[i]; word.lowercase(); data.lowercase(); db->Put(word, String(data.get(), data.length() - 1)); if (debug && (count % 10) == 0) { cout << "htfuzzy/synonyms: " << count << ' ' << word << "\n"; cout.flush(); } count++; } } fclose(fl); db->Close(); delete db; #if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 //Uses file_copy function - works on Unix/Linux & WinNT source = dbFile.get(); dest = (char *)config["synonym_db"].get(); //Attempt rename, if fail attempt copy & delete. ret = rename(source, dest); if (ret < 0) { ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON); if (ret == TRUE) unlink(source); else return NOTOK; } if (debug) { cout << "htfuzzy/synonyms: " << count << ' ' << word << "\n"; cout << "htfuzzy/synonyms: Done.\n"; } #else //This code uses a system call - Phase this out struct stat stat_buf; String mv("mv"); // assume it's in the PATH if predefined setting fails if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode)) mv = MV; system(form("%s %s %s", mv.get(), dbFile.get(), config["synonym_db"].get())); #endif return OK; } //***************************************************************************** int Synonym::openIndex() { const String dbFile = config["synonym_db"]; if (db) { db->Close(); delete db; db = 0; } db = Database::getDatabaseInstance(DB_HASH); if (db->OpenRead(dbFile) == NOTOK) { delete db; db = 0; return NOTOK; } return OK; } //***************************************************************************** void Synonym::getWords(char *originalWord, List &words) { String data; String stripped = originalWord; HtStripPunctuation(stripped); if (db && db->Get(stripped, data) == OK) { char *token = strtok(data.get(), " "); while (token) { words.Add(new String(token)); token = strtok(0, " "); } } }