//--------------------------------------------------------------------
//
// TextCollector.cc
//
// 2/6/2002 created for libhtdig
//
// Neal Richter nealr@rightnow.com
//
// TextCollector:
// General Purpose Text Document Indexer.
// Calls appropriate parsers.
// The parser notifies the TextCollector object that it got something
// (got_* functions) and the TextCollector object feeds the databases
// and statistics accordingly.
//
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//--------------------------------------------------------------------
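//
// Typical use from a libhtdig caller (a minimal sketch; the TextCollectorLog
// enumerator and the way a BasicDocument gets populated are assumptions, not
// shown in this file):
//
//     TextCollector collector(TextCollector_NoLog);   // hypothetical flag value
//     BasicDocument doc;
//     // ... fill in doc: location, contents, content type, length ...
//     collector.IndexDoc(doc);     // parse the document and feed the databases
//     collector.FlushWordDB();     // flush and close the word database
//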
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include "TextCollector.h"
#include "htdig.h"
#include "HtWordList.h"
#include "WordRecord.h"
#include "URLRef.h"
#include "Server.h"
#include "Parsable.h"
#include "BasicDocument.h"
#include "StringList.h"
#include "WordType.h"
#include "md5.h"
#include "defaults.h"
#include <signal.h>
#include <stdio.h>
#include <sys/timeb.h>
//*****************************************************************************
// TextCollector::TextCollector()
//
TextCollector::TextCollector(TextCollectorLog flags):
    words(*(HtConfiguration::config()))
{
    HtConfiguration *config = HtConfiguration::config();
    //FILE *urls_parsed;
    currenthopcount = 0;
    //turn on word tracking!
    trackWords = 1;
    //
    // Initialize the flags for the various HTML factors
    //
    // text_factor
    factor[0] = FLAG_TEXT;
    // title_factor
    factor[1] = FLAG_TITLE;
    // heading factor (now generic)
    factor[2] = FLAG_HEADING;
    factor[3] = FLAG_HEADING;
    factor[4] = FLAG_HEADING;
    factor[5] = FLAG_HEADING;
    factor[6] = FLAG_HEADING;
    factor[7] = FLAG_HEADING;
    // img alt text
    //factor[8] = FLAG_KEYWORDS;
    factor[8] = FLAG_TEXT;      // treat alt text as plain text, until it has
                                // its own FLAG and factor.
    // keywords factor
    factor[9] = FLAG_KEYWORDS;
    // META description factor
    factor[10] = FLAG_DESCRIPTION;
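    // For example, a word reported by got_word() with heading == 10 (a META
    // description) is stored with FLAG_DESCRIPTION set, while heading == 0
    // stores it as plain FLAG_TEXT.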
    doc = NULL;
    minimumWordLength = config->Value("minimum_word_length", 3);
    //TODO put document-index log file stuff here via logs like Retriever
    check_unique_md5 = config->Boolean("check_unique_md5", 0);
    check_unique_date = config->Boolean("check_unique_date", 0);
    d_md5 = 0;
    if (check_unique_md5)
    {
        d_md5 = Database::getDatabaseInstance(DB_HASH);
        if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
        {
            cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";
        }
    }
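    // For reference, the attributes consulted above come from the ht://Dig
    // configuration file; the values below are illustrative only:
    //
    //     minimum_word_length: 3
    //     check_unique_md5: true
    //     check_unique_date: false
    //     md5_db: ${database_dir}/db.md5hash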
    temp_doc_count = 0;
}
//*****************************************************************************
// TextCollector::~TextCollector()
//
TextCollector::~TextCollector()
{
    if (d_md5)
        d_md5->Close();
    //delete doc;
    if(temp_doc_count != 0)
    {
        words.Flush();
        temp_doc_count = 0;
    }
    words.Flush();
    words.Close();
}
//*****************************************************************************
// int TextCollector::IndexDoc(BasicDocument &a_basicdoc)
//
//
int
TextCollector::IndexDoc(BasicDocument & a_basicdoc)
{
    DocumentRef *ref;
    time_t date;
    int old_document = 0;
    static int index = 0;
    //struct timeb tb;
    //HtConfiguration *config = HtConfiguration::config();
    doc = &a_basicdoc;
    ref = docs[doc->Location()];        // It might be nice to have just an Exists() here
    if (ref)
    {
        //
        // We already have an entry for this document in our database.
        // This means we can get the document ID and last modification
        // time from there.
        //
        current_id = ref->DocID();
        date = ref->DocTime();
        if (ref->DocAccessed())
            old_document = 1;
        else                    // we haven't retrieved it yet, so we only have the first link
            old_document = 0;
        ref->DocBackLinks(ref->DocBackLinks() + 1);     // we had a new link
        ref->DocAccessed(time(0));
        ref->DocState(Reference_normal);
        currenthopcount = ref->DocHopCount();
    }
    else
    {
        //
        // Never seen this document before. We need to create an
        // entry for it. This implies that it gets a new document ID.
        //
        date = 0;
        current_id = docs.NextDocID();
        ref = new DocumentRef;
        ref->DocID(current_id);
        ref->DocURL(doc->Location());
        ref->DocState(Reference_normal);
        ref->DocAccessed(time(0));
        ref->DocHopCount(0);
        ref->DocBackLinks(1);   // We had to have a link to get here!
        old_document = 0;
    }
    word_context.DocID(ref->DocID());
    if (debug > 0)
    {
        //
        // Display progress
        //
        cout << index++ << ':' << current_id << ':' << currenthopcount
             << ':' << doc->Location() << ": ";
        cout.flush();
    }
    //printf("New Doc\n");
    //ftime(&tb);
    //fprintf(stderr, "[1] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
    RetrievedDocument(ref);
    //ftime(&tb);
    //fprintf(stderr, "[2] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
    if(temp_doc_count > 250)
    {
        //words.Flush();
        temp_doc_count = 0;
    }
    else
    {
        temp_doc_count++;
    }
    //ftime(&tb);
    //fprintf(stderr, "[3] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
    docs.Add(*ref);
    //ftime(&tb);
    //fprintf(stderr, "[4] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);
    delete ref;
    words.Flush();
    //words.Close();
    if (urls_seen)
    {
        fprintf(urls_seen, "%s|%d|%s|%d|0|1\n",
                (const char *) doc->Location(), doc->Length(), doc->ContentType(),
                (int) doc->ModTime());
    }
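    // A line written to the urls_seen log therefore looks roughly like this
    // (values are illustrative only):
    //
    //     http://example.com/docs/readme.txt|2048|text/plain|1086012929|0|1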
    return(1);
}
int TextCollector::FlushWordDB()
{
    if(temp_doc_count != 0)
    {
        words.Flush();
        temp_doc_count = 0;
    }
    words.Flush();
    words.Close();
    return(1);
}
//*****************************************************************************
// void TextCollector::RetrievedDocument(DocumentRef *ref)
// We found a document that needs to be parsed. Since we don't know the
// document type, we'll let the Document itself return an appropriate
// Parsable object which we can call upon to parse the document contents.
//
void
TextCollector::RetrievedDocument(DocumentRef * ref)
{
    n_links = 0;
    current_ref = ref;
    current_title = 0;
    word_context.Anchor(0);
    current_time = 0;
    current_head = 0;
    current_meta_dsc = 0;
    time_t doc_time;
    //Check if the Document is self-parseable
    //We will pass ourselves as a callback object for all the got_*() routines
    if (doc->SelfParseable() == TRUE)
    {
        doc->internalParser(*this);
    }
    else
    {
        // Create a parser object and let it have a go at the document.
        // We will pass ourselves as a callback object for all the got_*()
        // routines.
        // This will generate the Parsable object as a specific parser
        /*
        Parsable *parsable = doc->getParsable();
        if (parsable)
            parsable->parse(*this, *base);
        else
        {   // If we didn't get a parser, then we should get rid of this!
            ref->DocState(Reference_noindex);
            return;
        }
        */
    }
    // We don't need to dispose of the parsable object since it will
    // automatically be reused.
    //
    // Update the document reference
    //
    ref->DocTitle((char *) current_title);
    ref->DocHead((char *) current_head);
    ref->DocMetaDsc((char *) current_meta_dsc);
    /* if (current_time == 0)
        ref->DocTime(doc->ModTime());
    else
        ref->DocTime(current_time); */
    doc_time = doc->ModTime();
    if(doc_time != 0)
        ref->DocTime(doc_time);
    else
        ref->DocTime(time(NULL));
    ref->DocSize(doc->Length());
    ref->DocAccessed(time(0));
    ref->DocLinks(n_links);
}
//*****************************************************************************
// void TextCollector::got_word(const char *word, int location, int heading)
// The location is normalized to be in the range 0 - 1000.
//
void
TextCollector::got_word(const char *word, int location, int heading)
{
    if (debug > 3)
        cout << "word: " << word << '@' << location << endl;
    if (heading >= 11 || heading < 0)  // Current limits for headings
        heading = 0;                   // Assume it's just normal text
    if ((trackWords) && (strlen(word) >= minimumWordLength))
    {
        String w = word;
        HtWordReference wordRef;
        wordRef.Location(location);
        wordRef.Flags(factor[heading]);
        wordRef.Word(w);
        words.Replace(WordReference::Merge(wordRef, word_context));
#ifdef DEBUG
        cout << "Adding: [" << w << "]" << endl;  //NEALR
#endif
        // Check for compound words...
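        // The loop below re-scans the token using punctuation as a separator:
        // each pass indexes runs of one more adjacent part than the previous
        // pass, stopping once a pass adds two or fewer entries. As a rough
        // illustration (assuming '-' is not a strict word character), for
        // "real-time-search" the first pass indexes "real", "time" and
        // "search", and a second pass indexes the two-part spans with their
        // punctuation stripped.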
        String parts = word;
        int added;
        int nparts = 1;
        do
        {
            added = 0;
            char *start = parts.get();
            char *punctp = 0, *nextp = 0, *p;
            char punct;
            int n;
            while (*start)
            {
                p = start;
                for (n = 0; n < nparts; n++)
                {
                    while (HtIsStrictWordChar((unsigned char) *p))
                        p++;
                    punctp = p;
                    if (!*punctp && n + 1 < nparts)
                        break;
                    while (*p && !HtIsStrictWordChar((unsigned char) *p))
                        p++;
                    if (n == 0)
                        nextp = p;
                }
                if (n < nparts)
                    break;
                punct = *punctp;
                *punctp = '\0';
                if (*start && (*p || start > parts.get()))
                {
                    w = start;
                    HtStripPunctuation(w);
                    if (w.length() >= minimumWordLength)
                    {
                        wordRef.Word(w);
                        words.Replace(WordReference::Merge(wordRef, word_context));
                        if (debug > 3)
                            cout << "word part: " << start << '@' << location << endl;
#ifdef DEBUG
                        cout << "Adding: [" << w << "]" << endl;  //NEALR
#endif
                    }
                    added++;
                }
                start = nextp;
                *punctp = punct;
            }
            nparts++;
        }
        while (added > 2);
    }
}
//*****************************************************************************
// void TextCollector::got_title(const char *title)
//
void
TextCollector::got_title(const char *title)
{
    if (debug > 1)
        cout << "\ntitle: " << title << endl;
    current_title = title;
}
//*****************************************************************************
// void TextCollector::got_time(const char *time)
//
void
TextCollector::got_time(const char *time)
{
    HtDateTime new_time(current_time);
    if (debug > 1)
        cout << "\ntime: " << time << endl;
    //
    // As defined by the Dublin Core, this should be YYYY-MM-DD
    // In the future, we'll need to deal with the scheme portion
    // in case someone picks a different format.
    //
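    // For example, a value of "2002-02-06" would be converted by the
    // SetFTime() call below into the corresponding time_t; a string in any
    // other format leaves current_time effectively unchanged.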
    new_time.SetFTime(time, "%Y-%m-%d");
    current_time = new_time.GetTime_t();
    // If we can't convert it, current_time stays the same and we get
    // the default--the date returned by the server...
}
//*****************************************************************************
// void TextCollector::got_head(const char *head)
//
void
TextCollector::got_head(const char *head)
{
    if (debug > 4)
        cout << "head: " << head << endl;
    current_head = head;
}
//*****************************************************************************
// void TextCollector::got_meta_dsc(const char *md)
//
void
TextCollector::got_meta_dsc(const char *md)
{
    if (debug > 4)
        cout << "meta description: " << md << endl;
    current_meta_dsc = md;
}
//*****************************************************************************
// void TextCollector::got_meta_email(const char *e)
//
void
TextCollector::got_meta_email(const char *e)
{
    if (debug > 1)
        cout << "\nmeta email: " << e << endl;
    current_ref->DocEmail(e);
}
//*****************************************************************************
// void TextCollector::got_meta_notification(const char *e)
//
void
TextCollector::got_meta_notification(const char *e)
{
    if (debug > 1)
        cout << "\nmeta notification date: " << e << endl;
    current_ref->DocNotification(e);
}
//*****************************************************************************
// void TextCollector::got_meta_subject(const char *e)
//
void
TextCollector::got_meta_subject(const char *e)
{
    if (debug > 1)
        cout << "\nmeta subject: " << e << endl;
    current_ref->DocSubject(e);
}
//*****************************************************************************
// void TextCollector::got_noindex()
//
void
TextCollector::got_noindex()
{
    if (debug > 1)
        cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
    current_ref->DocState(Reference_noindex);
}