You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
408 lines
11 KiB
408 lines
11 KiB
//----------------------------------------------------------------
|
|
//
|
|
// libhtdig_htmerge.cc
|
|
//
|
|
// 1/25/2002 created from htmerge.cc
|
|
//
|
|
// Neal Richter nealr@rightnow.com
|
|
//
|
|
// libhtdig_htmerge.cc
|
|
//
|
|
// htmerge: Merges two databases and/or updates databases to remove
|
|
// old documents and ensures the databases are consistent.
|
|
// Calls db.cc, docs.cc, and/or words.cc as necessary
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 1999-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: libhtdig_htmerge.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
|
|
//
|
|
//----------------------------------------------------------------
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "htconfig.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
extern "C" {
|
|
#include "libhtdig_api.h"
|
|
}
|
|
|
|
#include "libhtdig_log.h"
|
|
|
|
#include "WordContext.h"
|
|
#include "good_strtok.h"
|
|
#include "defaults.h"
|
|
#include "DocumentDB.h"
|
|
#include "HtURLCodec.h"
|
|
#include "HtWordList.h"
|
|
#include "HtWordReference.h"
|
|
#include "htString.h"
|
|
|
|
#ifdef HAVE_STD
|
|
#include <fstream>
|
|
#ifdef HAVE_NAMESPACES
|
|
using namespace std;
|
|
#endif
|
|
#else
|
|
#include <fstream.h>
|
|
#endif /* HAVE_STD */
|
|
|
|
#include <stdio.h>
|
|
|
|
#ifndef _WIN32
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <string.h>
|
|
|
|
// If we have this, we probably want it.
|
|
//#ifdef HAVE_GETOPT_H
|
|
//#include <getopt.h>
|
|
//#endif
|
|
|
|
|
|
|
|
|
|
|
|
//Global Variables for this file
|
|
|
|
// This hash is used to keep track of all the document IDs which have to be
|
|
// discarded.
|
|
// This is generated from the doc database and is used to prune words
|
|
// from the word db
|
|
static Dictionary discard_list;
|
|
|
|
// This config is used for merging multiple databses
|
|
static HtConfiguration merge_config;
|
|
static HtConfiguration *config = NULL;
|
|
|
|
static int verbose = 0;
|
|
//static int stats = 0;
|
|
static int alt_work_area = 0;
|
|
|
|
//static String configFile = DEFAULT_CONFIG_FILE;
|
|
extern String configFile;
|
|
|
|
static String merge_configFile = 0;
|
|
|
|
|
|
// Component procedures
|
|
static int mergeDB ();
|
|
|
|
int htmerge_index_merge(htmerge_parameters_struct *htmerge_parms)
|
|
{
|
|
int ret = -1;
|
|
int merge_ret = -1;
|
|
|
|
//load htmerge 'command-line parameters'
|
|
configFile = htmerge_parms->configFile;
|
|
merge_configFile = htmerge_parms->merge_configFile;
|
|
verbose = htmerge_parms->debug;
|
|
if(verbose != 0)
|
|
{
|
|
ret = logOpen(htmerge_parms->logFile);
|
|
|
|
if(ret == FALSE)
|
|
{
|
|
reportError (form ("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
|
|
htmerge_parms->logFile, errno, strerror(errno)) );
|
|
return(HTMERGE_ERROR_LOGFILE_OPEN);
|
|
}
|
|
}
|
|
|
|
alt_work_area = htmerge_parms->alt_work_area;
|
|
|
|
|
|
|
|
config = HtConfiguration::config ();
|
|
config->Defaults (&defaults[0]);
|
|
|
|
if (access ((char *) configFile, R_OK) < 0)
|
|
{
|
|
reportError (form ("[HTMERGE] Unable to find configuration file '%s'",
|
|
configFile.get ()));
|
|
return(HTMERGE_ERROR_CONFIG_READ);
|
|
}
|
|
|
|
config->Read (configFile);
|
|
|
|
//
|
|
// Check url_part_aliases and common_url_parts for
|
|
// errors.
|
|
String url_part_errors = HtURLCodec::instance ()->ErrMsg ();
|
|
|
|
if (url_part_errors.length () != 0)
|
|
{
|
|
reportError (form("[HTMERGE] Invalid url_part_aliases or common_url_parts: %s",
|
|
url_part_errors.get ()));
|
|
return(HTMERGE_ERROR_URL_PART);
|
|
}
|
|
|
|
if (merge_configFile.length ())
|
|
{
|
|
merge_config.Defaults (&defaults[0]);
|
|
if (access ((char *) merge_configFile, R_OK) < 0)
|
|
{
|
|
reportError (form ("[HTMERGE] Unable to find configuration file '%s'",
|
|
merge_configFile.get ()));
|
|
return(HTMERGE_ERROR_CONFIG_READ);
|
|
}
|
|
merge_config.Read (merge_configFile);
|
|
}
|
|
|
|
if (alt_work_area != 0)
|
|
{
|
|
String configValue;
|
|
|
|
configValue = config->Find ("word_db");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("word_db", configValue);
|
|
}
|
|
|
|
configValue = config->Find ("doc_db");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("doc_db", configValue);
|
|
}
|
|
|
|
configValue = config->Find ("doc_index");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("doc_index", configValue);
|
|
}
|
|
|
|
configValue = config->Find ("doc_excerpt");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("doc_excerpt", configValue);
|
|
}
|
|
}
|
|
|
|
WordContext::Initialize(*config);
|
|
|
|
if (merge_configFile.length())
|
|
{
|
|
// Merge the databases specified in merge_configFile into the current
|
|
// databases. Do this first then update the other databases as usual
|
|
// Note: We don't have to specify anything, it's all in the config vars
|
|
|
|
merge_ret = mergeDB();
|
|
}
|
|
|
|
//call destructors here
|
|
config->~HtConfiguration();
|
|
merge_config.~HtConfiguration();
|
|
|
|
if (verbose != 0)
|
|
{
|
|
ret = logClose();
|
|
|
|
if (ret == FALSE)
|
|
{
|
|
reportError (form("[HTMERGE]: Error closing file [%s]. Error:[%d], %s\n",
|
|
htmerge_parms->logFile, errno, strerror(errno)) );
|
|
return(HTMERGE_ERROR_LOGFILE_CLOSE);
|
|
}
|
|
}
|
|
|
|
return(TRUE);
|
|
}
|
|
|
|
//*****************************************************************************
|
|
// void mergeDB()
|
|
//
|
|
static int mergeDB ()
|
|
{
|
|
HtConfiguration *config = HtConfiguration::config ();
|
|
DocumentDB merge_db, db;
|
|
List *urls;
|
|
Dictionary merge_dup_ids, db_dup_ids; // Lists of DocIds to ignore
|
|
int docIDOffset;
|
|
|
|
const String doc_index = config->Find ("doc_index");
|
|
if (access (doc_index, R_OK) < 0)
|
|
{
|
|
reportError (form
|
|
("[HTMERGE] Unable to open document index '%s'",
|
|
(const char *) doc_index));
|
|
return(HTMERGE_ERROR_DOCINDEX_READ);
|
|
}
|
|
const String doc_excerpt = config->Find ("doc_excerpt");
|
|
if (access (doc_excerpt, R_OK) < 0)
|
|
{
|
|
reportError (form
|
|
("[HTMERGE] Unable to open document excerpts '%s'",
|
|
(const char *) doc_excerpt));
|
|
return(HTMERGE_ERROR_EXCERPTDB_READ);
|
|
}
|
|
const String doc_db = config->Find ("doc_db");
|
|
if (db.Open (doc_db, doc_index, doc_excerpt) < 0)
|
|
{
|
|
reportError (form ("[HTMERGE] Unable to open/create document database '%s'",
|
|
(const char *) doc_db));
|
|
return(HTMERGE_ERROR_DOCDB_READ);
|
|
}
|
|
|
|
|
|
const String merge_doc_index = merge_config["doc_index"];
|
|
if (access (merge_doc_index, R_OK) < 0)
|
|
{
|
|
reportError (form
|
|
("[HTMERGE] Unable to open document index '%s'",
|
|
(const char *) merge_doc_index));
|
|
return(HTMERGE_ERROR_DOCINDEX_READ);
|
|
}
|
|
const String merge_doc_excerpt = merge_config["doc_excerpt"];
|
|
if (access (merge_doc_excerpt, R_OK) < 0)
|
|
{
|
|
reportError (form
|
|
("[HTMERGE] Unable to open document excerpts '%s'",
|
|
(const char *) merge_doc_excerpt));
|
|
return(HTMERGE_ERROR_EXCERPTDB_READ);
|
|
}
|
|
const String merge_doc_db = merge_config["doc_db"];
|
|
if (merge_db.Open (merge_doc_db, merge_doc_index, merge_doc_excerpt) < 0)
|
|
{
|
|
reportError (form ("[HTMERGE] Unable to open document database '%s'",
|
|
(const char *) merge_doc_db));
|
|
return(HTMERGE_ERROR_DOCDB_READ);
|
|
}
|
|
|
|
// Start the merging by going through all the URLs that are in
|
|
// the database to be merged
|
|
|
|
urls = merge_db.URLs ();
|
|
// This ensures that every document added from merge_db has a unique ID
|
|
// in the new database
|
|
docIDOffset = db.NextDocID ();
|
|
|
|
urls->Start_Get ();
|
|
String *url;
|
|
String id;
|
|
while ((url = (String *) urls->Get_Next ()))
|
|
{
|
|
DocumentRef *ref = merge_db[url->get ()];
|
|
DocumentRef *old_ref = db[url->get ()];
|
|
if (!ref)
|
|
continue;
|
|
|
|
if (old_ref)
|
|
{
|
|
// Oh well, we knew this would happen. Let's get the duplicate
|
|
// And we'll only use the most recent date.
|
|
|
|
if (old_ref->DocTime () >= ref->DocTime ())
|
|
{
|
|
// Cool, the ref we're merging is too old, just ignore it
|
|
char str[20];
|
|
sprintf (str, "%d", ref->DocID ());
|
|
merge_dup_ids.Add (str, 0);
|
|
|
|
if (verbose > 1)
|
|
{
|
|
logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring & merging copy\n", url));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// The ref we're merging is newer, delete the old one and add
|
|
char str[20];
|
|
sprintf (str, "%d", old_ref->DocID ());
|
|
db_dup_ids.Add (str, 0);
|
|
db.Delete (old_ref->DocID ());
|
|
ref->DocID (ref->DocID () + docIDOffset);
|
|
db.Add (*ref);
|
|
if (verbose > 1)
|
|
{
|
|
logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring destination copy\n",url->get()));
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// It's a new URL, just add it, making sure to load the excerpt
|
|
merge_db.ReadExcerpt (*ref);
|
|
ref->DocID (ref->DocID () + docIDOffset);
|
|
db.Add (*ref);
|
|
if (verbose > 1)
|
|
{
|
|
logEntry(form("[HTMERGE] Merged URL: {%s} \n",url->get()));
|
|
}
|
|
}
|
|
delete ref;
|
|
delete old_ref;
|
|
}
|
|
delete urls;
|
|
|
|
// As reported by Roman Dimov, we must update db.NextDocID()
|
|
// because of all the added records...
|
|
db.IncNextDocID (merge_db.NextDocID ());
|
|
merge_db.Close ();
|
|
db.Close ();
|
|
|
|
// OK, after merging the doc DBs, we do the same for the words
|
|
HtWordList mergeWordDB (*config), wordDB (*config);
|
|
List *words;
|
|
String docIDKey;
|
|
|
|
if (wordDB.Open (config->Find ("word_db"), O_RDWR) < 0)
|
|
{
|
|
reportError (form ("[HTMERGE] Unable to open/create word database '%s'",
|
|
(const char *) config->Find ("word_db")));
|
|
return(HTMERGE_ERROR_WORDDB_READ);
|
|
}
|
|
|
|
if (mergeWordDB.Open (merge_config["word_db"], O_RDONLY) < 0)
|
|
{
|
|
reportError (form ("[HTMERGE] Unable to open word database '%s'",
|
|
(const char *) merge_config["word_db"]));
|
|
return(HTMERGE_ERROR_WORDDB_READ);
|
|
}
|
|
|
|
// Start the merging by going through all the URLs that are in
|
|
// the database to be merged
|
|
|
|
words = mergeWordDB.WordRefs ();
|
|
|
|
words->Start_Get ();
|
|
HtWordReference *word;
|
|
while ((word = (HtWordReference *) words->Get_Next ()))
|
|
{
|
|
docIDKey = word->DocID ();
|
|
if (merge_dup_ids.Exists (docIDKey))
|
|
continue;
|
|
|
|
word->DocID (word->DocID () + docIDOffset);
|
|
wordDB.Override (*word);
|
|
}
|
|
delete words;
|
|
|
|
words = wordDB.WordRefs ();
|
|
words->Start_Get ();
|
|
while ((word = (HtWordReference *) words->Get_Next ()))
|
|
{
|
|
docIDKey = word->DocID ();
|
|
if (db_dup_ids.Exists (docIDKey))
|
|
wordDB.Delete (*word);
|
|
}
|
|
delete words;
|
|
|
|
// Cleanup--just close the two word databases
|
|
mergeWordDB.Close ();
|
|
wordDB.Close ();
|
|
|
|
return(TRUE);
|
|
|
|
}
|
|
|