//--------------------------------------------------------------------
//
// TextCollector.h
//
// 2/6/2002 created for libhtdig
//
// Neal Richter nealr@rightnow.com
//
// TextCollector:
// General Purpose Text Document Indexer.
// Calls appropriate parsers.
// The parser notifies the TextCollector object that it got something
// (got_* functions) and the TextCollector object feeds the databases
// and statistics accordingly.
//
// Part of the ht://Dig package
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
//
//
// $Id: TextCollector.h,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//--------------------------------------------------------------------
#ifndef _TextCollector_h_
#define _TextCollector_h_
#include "BasicDocument.h"
#include "DocumentRef.h"
#include "Dictionary.h"
#include "Queue.h"
#include "HtWordReference.h"
#include "List.h"
#include "StringList.h"
#include "DocumentDB.h"
//
// Forward declarations of classes that are defined in other headers.
// NOTE(review): HtWordList is also held by value as a TextCollector
// member, which requires the complete type -- presumably supplied by one
// of the headers included above; confirm.
//
class Document;
class HtWordList;
//
// Flag values accepted by the TextCollector constructor.
// TextCollector_noLog is the constructor's default argument.
// The numeric values are spelled out explicitly; they are the same
// values the enumerators receive implicitly (0, 1, 2).
// NOTE(review): the exact effect of each flag is decided by the
// implementation file -- confirm there before relying on the names.
//
enum TextCollectorLog {
    TextCollector_noLog   = 0,
    TextCollector_logUrl  = 1,
    TextCollector_Restart = 2
};
//
// TextCollector
//
// General purpose text document indexer. Documents are handed in through
// IndexDoc(); the got_*() member functions are the notification callbacks
// through which parsed content is reported back to this object.
//
// NOTE(review): the class holds several raw pointers (base, current_ref,
// doc, d_md5) and declares a destructor, but no copy constructor or
// assignment operator. Ownership of those pointers is not visible from
// this header -- confirm that instances are never copied.
//
class TextCollector
{
public:
    //
    // Construction/Destruction
    //
    // The constructor is 'explicit' so that a bare TextCollectorLog value
    // is never silently converted into a temporary TextCollector. Because
    // of the default argument it still acts as the default constructor.
    //
    explicit TextCollector(TextCollectorLog flags = TextCollector_noLog);
    virtual ~TextCollector();

    //
    // Indexing entry points. Both return an int status code whose
    // meaning is defined by the implementation file.
    //
    int IndexDoc(BasicDocument & adoc);
    int FlushWordDB();

    //
    // Report statistics about the parser
    //
    void ReportStatistics(const String& name);

    //
    // These are the callbacks that we need to write code for.
    // Each one reports a single item found while parsing a document.
    //
    void got_word(const char *word, int location, int heading);
    void got_href(URL &url, const char *description, int hops = 1);
    void got_title(const char *title);
    void got_time(const char *time);
    void got_head(const char *head);
    void got_meta_dsc(const char *md);
    void got_anchor(const char *anchor);
    void got_image(const char *src);
    void got_meta_email(const char *);
    void got_meta_notification(const char *);
    void got_meta_subject(const char *);
    void got_noindex();

private:
    //
    // A hash to keep track of what we've seen
    //
    Dictionary visited;

    //
    // State of the document currently being processed.
    // NOTE(review): grouping inferred from the current_* naming; verify
    // against the implementation.
    //
    URL *base;
    String current_title;
    String current_head;
    String current_meta_dsc;
    time_t current_time;
    int current_id;
    DocumentRef *current_ref;
    int current_anchor_number;
    int trackWords;
    int n_links;
    HtWordReference word_context;
    HtWordList words;

    int check_unique_md5;
    int check_unique_date;

    TextCollectorLog log;

    //
    // These are weights for the words. The index is the heading level.
    //
    long int factor[11];
    int currenthopcount;

    //
    // For efficiency reasons, we will only use one document object which
    // we reuse.
    //
    BasicDocument *doc;

    Database *d_md5;

    // Some useful constants
    int minimumWordLength;

    //
    // Helper routines
    //
    void RetrievedDocument(DocumentRef *ref);

    int temp_doc_count;
};
#endif