extra-dependencies/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc

//
// EndingsDB.cc
//
// EndingsDB: Implementation of the private endings database
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: EndingsDB.cc,v 1.17 2004/05/28 13:15:20 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include <fcntl.h>

#include "Endings.h"
#include "htfuzzy.h"
#include "SuffixEntry.h"
#include "Dictionary.h"
#include "List.h"
#include "HtConfiguration.h"

#include "filecopy.h"

// This is an attempt to get around compatibility problems
// with the included regex
#ifdef _MSC_VER /* _WIN32 */
#include "regex_win32.h"
#else
# ifdef USE_RX
#  include <rxposix.h>
# else // Use regex
#  ifdef HAVE_BROKEN_REGEX
#   include <regex.h>
#  else // include regex code and header
#   include "gregex.h"
#  endif
# endif
#endif //_MSC_VER /* _WIN32 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>

#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */

//*****************************************************************************
//
int
Endings::createDB(const HtConfiguration &config)
{
    Dictionary	rules;
    String      tmpdir = getenv("TMPDIR");
    String      word2root, root2word;

#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
    int ret = -1;
    char * source = NULL;
    char * dest = NULL;
#endif

    if (tmpdir.length())
      {
	word2root = tmpdir;
	root2word = tmpdir;
      }
    else
      {
	word2root = "/tmp";
	root2word = "/tmp";
      }

    word2root << "/word2root.db";
    root2word << "/root2word.db";

    if (debug)
	cout << "htfuzzy/endings: Reading rules\n";

    if (readRules(rules, config["endings_affix_file"]) == NOTOK)
	return NOTOK;

    if (debug)
	cout << "htfuzzy/endings: Creating databases\n";

    if (createRoot(rules, word2root, root2word,
		   config["endings_dictionary"]) == NOTOK)
	return NOTOK;

    //
    // Since we used files in TMPDIR for our temporary databases, we need
    // to now move them to the correct location as defined in the config
    // database.
    //

#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32

    //Uses file_copy function - works on Unix/Linux & WinNT
    source = root2word.get();
    dest = (char *)config["endings_root2word_db"].get();

    //Attempt rename, if fail attempt copy & delete.
    ret = rename(source, dest);
    if (ret < 0)
    {
        ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
        if (ret == TRUE)
            unlink(source);
        else
            return NOTOK;
    }

    source = word2root.get();
    dest = (char *)config["endings_word2root_db"].get();

    //Attempt rename, if fail attempt copy & delete.
    ret = rename(source, dest);
    if (ret < 0)
    {
        ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
        if (ret == TRUE)
            unlink(source);
        else
            return NOTOK;
    }

#else //This code uses a system call - Phase this out

    struct stat stat_buf;
    String mv("mv");	// assume it's in the PATH if predefined setting fails
    if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode))
	mv = MV;
    system(form("%s %s %s;%s %s %s",
	mv.get(), root2word.get(), config["endings_root2word_db"].get(),
	mv.get(), word2root.get(), config["endings_word2root_db"].get()));

#endif

    return OK;

}


//*****************************************************************************
int
Endings::readRules(Dictionary &rules, const String& rulesFile)
{
    FILE	*fl = fopen(rulesFile, "r");

    if (fl == NULL)
	return NOTOK;

    int		inSuffixes = 0;
    char	currentSuffix[2] = " ";
    char	*p;
    char	input[1024];
    String	line;

    while (fgets(input, sizeof(input), fl))
    {
	if (input[0] == '\n' || input[0] == '#')
	    continue;

	if (mystrncasecmp(input, "suffixes", 8) == 0)
	{
	    inSuffixes = 1;
	    continue;
	}
	else if (mystrncasecmp(input, "prefixes", 8) == 0)
	{
	    inSuffixes = 0;
	    continue;
	}
	if (!inSuffixes)
	    continue;

	if (mystrncasecmp(input, "flag ", 5) == 0)
	{
	    p = input + 5;
	    while (*p == '*' || *p == ' ' || *p == '\t')
		p++;
	    currentSuffix[0] = *p;
	}
	else
	{
	    line << input;
	    line.chop("\r\n");
	    if (line.indexOf('>') > 0)
	    {
		List		*list;
		SuffixEntry	*se = new SuffixEntry(line);

		if (rules.Exists(currentSuffix))
		{
		    list = (List *) rules[currentSuffix];
		}
		else
		{
		    list = new List;
		    rules.Add(currentSuffix, list);
		}
		list->Add(se);
		line = 0;
	    }
	}
    }

    fclose(fl);
    return OK;
}


//*****************************************************************************
int
Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, const String& dictFile)
{
    FILE	*fl = fopen(dictFile, "r");
    if (fl == NULL)
	return NOTOK;

    Database	*w2r = Database::getDatabaseInstance(DB_BTREE);
    Database	*r2w = Database::getDatabaseInstance(DB_BTREE);

    w2r->OpenReadWrite(word2root, 0664);
    r2w->OpenReadWrite(root2word, 0664);

    char	input[1024];
    char	*p;
    String	words;
    String	word;
    List	wordList;
    int		count = 0;
    String	data;

    while (fgets(input, sizeof(input), fl))
    {
	if ((count % 100) == 0 && debug == 1)
	{
	    cout << "htfuzzy/endings: words: " << count << '\n';
	    cout.flush();
	}
	count++;

	p = strchr(input, '/');
	if (p == NULL)
	    continue;		// Only words that have legal endings are used

	*p++ = '\0';

	mungeWord(input, word);
	expandWord(words, wordList, rules, word, p);

	if (debug > 1)
	    cout << "htfuzzy/endings: " << word << " --> " << words << endl;

	//
	// Store the root mapped to the list of expanded words.
	//
	r2w->Put(word, words);

	//
	// For each of the expanded words, build a map to its root.
	//
	for (int i = 0; i < wordList.Count(); i++)
	{
	    //
	    // Append to existing record if there is one.
	    //
	    data = "";
	    if (w2r->Get(*(String *)wordList[i], data) == OK)
		data << ' ';
	    data << word;
	    w2r->Put(*(String *)wordList[i], data);
	}
    }

    if (debug == 1)
	cout << endl;

    fclose(fl);
    w2r->Close();
    r2w->Close();
    delete w2r;
    delete r2w;

    return OK;
}


//*****************************************************************************
// Convert a word from the dictionary format into something we can actually
// use.  This means that the word will be converted to lowercase and that
// any accents will be combined into single characters.
//
void
Endings::mungeWord(char *input, String &word)
{
    char	*p = input + 1;

    word = 0;
    while (*input)
    {
	p = input + 1;
	switch (*p)
	{
    	    case '"':	// The previous character needs to get an umlaut
		switch (*input)
		{
		    case 'a':
		    case 'A':
			word << char(228);
			input += 2;
			continue;
			break;
		    case 'e':
		    case 'E':
			word << char(235);
			input += 2;
			continue;
			break;
		    case 'i':
		    case 'I':
			word << char(239);
			input += 2;
			continue;
			break;
		    case 'o':
		    case 'O':
			word << char(246);
			input += 2;
			continue;
			break;
		    case 'u':
		    case 'U':
			word << char(252);
			input += 2;
			continue;
			break;
		}
		break;

	    case 'S':	// See if the previous character needs to be an sz
		if (*input == 's')
		{
		    word << char(223);
		    input += 2;
		    continue;
		}
		else
		{
		    word << *input;
		}
		break;

	    default:
		word << *input;
		break;
	}
	input++;
    }
    word.lowercase();
}


//*****************************************************************************
void
Endings::expandWord(String &words, List &wordList,
		    Dictionary &rules, char *word, char *suffixes)
{
    char	suffix[2] = " ";
    String	root;
    SuffixEntry	*entry;
    List	*suffixRules;
    char	*p;
    String	rule;

    words = 0;
    wordList.Destroy();

    while (*suffixes > ' ')
    {
	suffix[0] = *suffixes++;
	if (!rules.Exists(suffix))
	    continue;

	suffixRules = (List *) rules[suffix];
	for (int i = 0; i < suffixRules->Count(); i++)
	{
	    entry = (SuffixEntry *) (*suffixRules)[i];
	    root = word;
	    regex_t	reg;
	    rule = entry->rule;
	    if (strchr((char*)rule, '\''))
		continue;
	    if (debug > 2)
		cout << "Applying regex '" << entry->expression << "' to " << word << endl;
	    regcomp(&reg, (char*)entry->expression, REG_ICASE | REG_NOSUB | REG_EXTENDED);
	    if (regexec(&reg, word, 0, NULL, 0) == 0)
	    {
		//
		// Matched
		//
		if (rule[0] == '-')
		{
		    //
		    // We need to remove something...
		    //
		    p = strchr((char*)rule, ',');
		    if (p)
		    {
			*p++ = '\0';
			root.chop((int)strlen(rule.get()) - 1);
			root << p;
		    }
		}
		else
		{
		    root << rule;
		}
		root.lowercase();
		if (debug > 2)
		    cout << word << " with " << rule << " --> '" << root << "'\n";
		wordList.Add(new String(root));
		words << root << ' ';
	    }
	    regfree(&reg);
	}
    }
    words.chop(1);
}