extra-dependencies/debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.cc

//--------------------------------------------------------------------
//
// TextCollector.cc
//
// 2/6/2002 created for libhtdig
//
// Neal Richter nealr@rightnow.com
//
// TextCollector:
//            General Purpose Text Document Indexer.
//            Calls appropriate parsers. 
//            The  parser notifies the TextCollector object that it got something
//            (got_* functions) and the TextCollector object feed the databases
//            and statistics accordingly.
//
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//--------------------------------------------------------------------


#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "TextCollector.h"
#include "htdig.h"
#include "HtWordList.h"
#include "WordRecord.h"
#include "URLRef.h"
#include "Server.h"
#include "Parsable.h"
#include "BasicDocument.h"
#include "StringList.h"
#include "WordType.h"
#include "md5.h"
#include "defaults.h"

#include <signal.h>
#include <stdio.h>

#include <sys/timeb.h>


//*****************************************************************************
// TextCollector::TextCollector(TextCollectorLog flags)
//   Builds the collector: the word list is constructed from the global
//   configuration, the heading-to-flag table is filled in, indexing
//   options are read from the configuration and, when check_unique_md5 is
//   enabled, the MD5 database is opened.
//   The `flags' argument is accepted but not used by this constructor.
//
TextCollector::TextCollector(TextCollectorLog flags):
words(*(HtConfiguration::config()))
{
	HtConfiguration *cfg = HtConfiguration::config();

	// Plain member defaults.
	doc = NULL;
	d_md5 = 0;
	currenthopcount = 0;
	temp_doc_count = 0;

	// Word tracking is always switched on for the text collector.
	trackWords = 1;

	//
	// Map every "heading" index handed to got_word() onto a word flag:
	//   0      plain text
	//   1      title
	//   2..7   headings (one generic flag for all levels)
	//   8      img alt text, treated as plain text until it gets its own
	//          flag and factor
	//   9      keywords
	//   10     META description
	//
	factor[0] = FLAG_TEXT;
	factor[1] = FLAG_TITLE;
	for (int slot = 2; slot <= 7; slot++)
		factor[slot] = FLAG_HEADING;
	factor[8] = FLAG_TEXT;
	factor[9] = FLAG_KEYWORDS;
	factor[10] = FLAG_DESCRIPTION;

	// Options taken from the configuration (with their fallbacks).
	minimumWordLength = cfg->Value("minimum_word_length", 3);
	check_unique_md5 = cfg->Boolean("check_unique_md5", 0);
	check_unique_date = cfg->Boolean("check_unique_date", 0);

	//TODO put document-index log file stuff here via logs like Retriever

	// The MD5 database is only created and opened on request.  A failed
	// open is reported on cerr but construction carries on regardless.
	// NOTE(review): the message prefix says "DocumentDB::Open" although it
	// is emitted from this constructor -- confirm whether that is intended.
	if (check_unique_md5)
	{
		d_md5 = Database::getDatabaseInstance(DB_HASH);

		if (d_md5->OpenReadWrite(cfg->Find("md5_db"), 0666) != OK)
			cerr << "DocumentDB::Open: " << cfg->Find("md5_db") << " " << strerror(errno) << "\n";
	}
}


//*****************************************************************************
// TextCollector::~TextCollector()
//   Closes the MD5 database if one was opened, then flushes and closes the
//   word list.
//
TextCollector::~TextCollector()
{
	// `doc' is only ever set to the address of the BasicDocument handed to
	// IndexDoc(), so it is not deleted here.
	// NOTE(review): d_md5 is closed but never deleted -- confirm who owns
	// the object returned by Database::getDatabaseInstance().
	if (d_md5 != 0)
	{
		d_md5->Close();
	}

	// One flush for documents counted since the counter was last reset ...
	if (temp_doc_count != 0)
	{
		words.Flush();
		temp_doc_count = 0;
	}

	// ... and one unconditional flush before the word list is closed.
	words.Flush();
	words.Close();
}


//*****************************************************************************
// int TextCollector::IndexDoc(BasicDocument & a_basicdoc)
//   Index one document.  The document's DocumentRef is looked up in (or
//   created for) the document database, the document is parsed through
//   RetrievedDocument(), the reference is written back and the word list
//   is flushed.  When the urls_seen stream is set, a summary line is
//   appended to it.
//
//   a_basicdoc  the document to index; its address is kept in `doc'.
//
//   Always returns 1.
//
int
TextCollector::IndexDoc(BasicDocument & a_basicdoc)
{
	static int	index = 0;	// numbers the progress lines printed when debug > 0
	DocumentRef	*ref;

	// RetrievedDocument() reads the document through this member.
	doc = &a_basicdoc;

	ref = docs[doc->Location()];	// It might be nice to have just an Exists() here
	if (ref)
	{
		//
		// We already have an entry for this document in our database,
		// so its document ID and hop count are taken from there.
		//
		current_id = ref->DocID();
		ref->DocBackLinks(ref->DocBackLinks() + 1);	// we had a new link
		ref->DocAccessed(time(0));
		ref->DocState(Reference_normal);
		currenthopcount = ref->DocHopCount();
	}
	else
	{
		//
		// Never seen this document before.  We need to create an
		// entry for it.  This implies that it gets a new document ID.
		//
		current_id = docs.NextDocID();
		ref = new DocumentRef;
		ref->DocID(current_id);
		ref->DocURL(doc->Location());
		ref->DocState(Reference_normal);
		ref->DocAccessed(time(0));
		ref->DocHopCount(0);
		ref->DocBackLinks(1); // We had to have a link to get here!

		// Keep the member in step with the hop count just stored in the
		// reference; otherwise the progress line below would show the
		// value left over from the previously indexed document.
		currenthopcount = 0;
	}

	word_context.DocID(ref->DocID());

	if (debug > 0)
	{
		//
		// Display progress
		//
		cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << doc->Location() <<
			": ";
		cout.flush();
	}

	// Parse the document and fill in the reference.
	RetrievedDocument(ref);

	// The batched flush that used to live in the first branch is disabled;
	// the word list is flushed after every document further down.  The
	// counter is still maintained because ~TextCollector() and
	// FlushWordDB() test it.
	if (temp_doc_count > 250)
	{
		temp_doc_count = 0;
	}
	else
	{
		temp_doc_count++;
	}

	// Store the reference, then release it (both the looked-up and the
	// freshly allocated reference are deleted here).
	docs.Add(*ref);
	delete ref;

	words.Flush();

	// One line per indexed document:
	//   location|length|content type|modification time|0|1
	if (urls_seen)
	{
		fprintf(urls_seen, "%s|%d|%s|%d|0|1\n",
				(const char *) doc->Location(), doc->Length(), doc->ContentType(),
				(int) doc->ModTime());
	}

	return(1);
}

//*****************************************************************************
// int TextCollector::FlushWordDB()
//   Flushes the word list and closes it.  Always returns 1.
//
int TextCollector::FlushWordDB()
{
    // Extra flush (and counter reset) when documents have been counted
    // since the counter was last reset.
    const bool counted_documents = (temp_doc_count != 0);
    if (counted_documents)
    {
        words.Flush();
        temp_doc_count = 0;
    }

    // Unconditional flush, then close the word list.
    words.Flush();
    words.Close();

    return 1;
}
        
//*****************************************************************************
// void TextCollector::RetrievedDocument(DocumentRef * ref)
//   Parses the current document (the `doc' member) and copies what the
//   parser reported into `ref'.  Only self-parseable documents are parsed;
//   the external Parsable path is disabled, so any other document just gets
//   its reference fields filled in from empty per-document state.
//
void
TextCollector::RetrievedDocument(DocumentRef * ref)
{
	// Start from clean per-document state; the got_*() callbacks fill
	// these in while the parser runs.
	current_ref = ref;
	n_links = 0;
	current_title = 0;
	word_context.Anchor(0);
	current_time = 0;
	current_head = 0;
	current_meta_dsc = 0;

	// A self-parseable document runs its own parser, with this object
	// passed in as the receiver of the got_*() callbacks.
	if (doc->SelfParseable() == TRUE)
	{
		doc->internalParser(*this);
	}
	// Otherwise nothing is parsed.  The code that asked the document for
	// a Parsable object (and marked the reference Reference_noindex when
	// none was available) is disabled.

	//
	// Update the document reference
	//
	ref->DocTitle((char *) current_title);
	ref->DocHead((char *) current_head);
	ref->DocMetaDsc((char *) current_meta_dsc);

	// The document's own modification time is used when it is non-zero,
	// the current time otherwise.  current_time (set by got_time()) is
	// not consulted here.
	const time_t modified = doc->ModTime();
	if (modified == 0)
	{
		ref->DocTime(time(NULL));
	}
	else
	{
		ref->DocTime(modified);
	}

	ref->DocSize(doc->Length());
	ref->DocAccessed(time(0));
	ref->DocLinks(n_links);
}


//*****************************************************************************
// void TextCollector::got_word(const char *word, int location, int heading)
//   Parser callback: record one word of the current document.
//
//   word      the word as delivered by the parser
//   location  position of the word; the location is normalized to be in
//             the range 0 - 1000.
//   heading   index into factor[]; values outside 0..10 are treated as 0
//
//   The word itself is stored first.  After that the word is scanned for
//   compound parts (runs of strict word characters separated by other
//   characters) and qualifying parts are stored with the same location
//   and flags.
//
void
TextCollector::got_word(const char *word, int location, int heading)
{
	if (debug > 3)
		cout << "word: " << word << '@' << location << endl;
	if (heading >= 11 || heading < 0)	// Current limits for headings
		heading = 0;		  // Assume it's just normal text

	// Nothing is stored when word tracking is off or the word is too short.
	// NOTE(review): strlen() yields an unsigned value; if minimumWordLength
	// is a signed int, a negative setting would make this test behave
	// differently from the w.length() test below -- confirm the member type.
	if ((trackWords) && (strlen(word) >= minimumWordLength))
	{
		String w = word;
		HtWordReference wordRef;

		wordRef.Location(location);
		wordRef.Flags(factor[heading]);

		// Store the complete word, merged with the per-document context.
		wordRef.Word(w);
		words.Replace(WordReference::Merge(wordRef, word_context));

#ifdef DEBUG
        cout << "Adding: [" << w <<  "]"<< endl;  //NEALR
#endif
            
		// Check for compound words...
		// `parts' is a private copy of the word: the scan below writes a
		// temporary terminator into it.
		String parts = word;
		int added;
		// Number of consecutive word-character segments joined into one
		// candidate; grows by one on every pass of the do-loop.
		int nparts = 1;
		do
		{
			added = 0;
			char *start = parts.get();
			char *punctp = 0, *nextp = 0, *p;
			char punct;
			int n;
			// Slide `start' over the buffer one segment at a time.
			while (*start)
			{
				p = start;
				for (n = 0; n < nparts; n++)
				{
					// Skip one segment of strict word characters ...
					while (HtIsStrictWordChar((unsigned char) *p))
						p++;
					// ... remember where it ended ...
					punctp = p;
					// ... and give up if the buffer ends before nparts
					// segments were collected.
					if (!*punctp && n + 1 < nparts)
						break;
					// Skip the separator characters that follow.
					while (*p && !HtIsStrictWordChar((unsigned char) *p))
						p++;
					// The next candidate starts right after the first
					// segment and its separators.
					if (n == 0)
						nextp = p;
				}
				// Fewer than nparts segments left: this pass is done.
				if (n < nparts)
					break;
				// Temporarily terminate the buffer at the end of the last
				// segment so that `start' reads as the candidate string.
				punct = *punctp;
				*punctp = '\0';
				// Skip empty candidates and the candidate that begins at
				// the start of the buffer with nothing following it.
				if (*start && (*p || start > parts.get()))
				{
					w = start;
					HtStripPunctuation(w);
					if (w.length() >= minimumWordLength)
					{
						wordRef.Word(w);
						words.Replace(WordReference::Merge(wordRef, word_context));
						// The debug line prints the unstripped candidate.
						if (debug > 3)
							cout << "word part: " << start << '@' << location << endl;

#ifdef DEBUG
                        cout << "Adding: [" << w <<  "]"<< endl;  //NEALR
#endif                            
					}
					// Counted even when the candidate was too short to store.
					added++;
				}
				start = nextp;
				// Restore the character overwritten above.
				*punctp = punct;
			}
			nparts++;
		}
		// Try longer runs only while the last pass found more than two
		// candidates.
		while (added > 2);
	}
}


//*****************************************************************************
// void TextCollector::got_title(const char *title)
//   Parser callback: remembers the document title in current_title and
//   echoes it when debug > 1.
//
void
TextCollector::got_title(const char *title)
{
	if (debug > 1)
	{
		cout << "\ntitle: " << title;
		cout << endl;
	}

	current_title = title;
}

//*****************************************************************************
// void TextCollector::got_time(const char *time)
//
void
TextCollector::got_time(const char *time)
{
	HtDateTime new_time(current_time);

	if (debug > 1)
		cout << "\ntime: " << time << endl;

	//
	// As defined by the Dublin Core, this should be YYYY-MM-DD
	// In the future, we'll need to deal with the scheme portion
	//  in case someone picks a different format.
	//
	new_time.SetFTime(time, "%Y-%m-%d");
	current_time = new_time.GetTime_t();

	// If we can't convert it, current_time stays the same and we get
	// the default--the date returned by the server...
}

//*****************************************************************************
// void TextCollector::got_head(const char *head)
//   Parser callback: remembers the document head text in current_head and
//   echoes it when debug > 4.
//
void
TextCollector::got_head(const char *head)
{
	if (debug > 4)
	{
		cout << "head: " << head;
		cout << endl;
	}

	current_head = head;
}

//*****************************************************************************
// void TextCollector::got_meta_dsc(const char *md)
//   Parser callback: remembers the META description in current_meta_dsc
//   and echoes it when debug > 4.
//
void
TextCollector::got_meta_dsc(const char *md)
{
	if (debug > 4)
	{
		cout << "meta description: " << md;
		cout << endl;
	}

	current_meta_dsc = md;
}


//*****************************************************************************
// void TextCollector::got_meta_email(const char *e)
//   Parser callback: stores the META email value on the current document
//   reference and echoes it when debug > 1.
//
void
TextCollector::got_meta_email(const char *e)
{
	if (debug > 1)
	{
		cout << "\nmeta email: " << e;
		cout << endl;
	}

	current_ref->DocEmail(e);
}


//*****************************************************************************
// void TextCollector::got_meta_notification(const char *e)
//   Parser callback: stores the META notification value on the current
//   document reference and echoes it when debug > 1.
//
void
TextCollector::got_meta_notification(const char *e)
{
	if (debug > 1)
	{
		cout << "\nmeta notification date: " << e;
		cout << endl;
	}

	current_ref->DocNotification(e);
}


//*****************************************************************************
// void TextCollector::got_meta_subject(const char *e)
//   Parser callback: stores the META subject value on the current document
//   reference and echoes it when debug > 1.
//
void
TextCollector::got_meta_subject(const char *e)
{
	// The label previously read "meta subect"; spelled correctly so the
	// debug output can be searched for alongside the other meta lines.
	if (debug > 1)
		cout << "\nmeta subject: " << e << endl;
	current_ref->DocSubject(e);
}


//*****************************************************************************
// void TextCollector::got_noindex()
//   Parser callback: marks the current document reference as
//   Reference_noindex, reporting its URL when debug > 1.
//
void
TextCollector::got_noindex()
{
	if (debug > 1)
	{
		cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL();
		cout << endl;
	}

	current_ref->DocState(Reference_noindex);
}
DEB htdig: Added to repository. Signed-off-by: Slávek Banko <slavek.banko@axis.cz> 3 years ago			`//--------------------------------------------------------------------`
			`//`
			`// TextCollector.cc`
			`//`
			`// 2/6/2002 created for libhtdig`
			`//`
			`// Neal Richter nealr@rightnow.com`
			`//`
			`// TextCollector:`
			`// General Purpose Text Document Indexer.`
			`// Calls appropriate parsers.`
			`// The parser notifies the TextCollector object that it got something`
			`// (got_* functions) and the TextCollector object feed the databases`
			`// and statistics accordingly.`
			`//`
			`//`
			`// Part of the ht://Dig package <http://www.htdig.org/>`
			`// Copyright (c) 1995-2004 The ht://Dig Group`
			`// For copyright details, see the file COPYING in your distribution`
			`// or the GNU Library General Public License (LGPL) version 2 or later or later`
			`// <http://www.gnu.org/copyleft/lgpl.html>`
			`//`
			`// $Id: TextCollector.cc,v 1.4 2004/05/28 13:15:29 lha Exp $`
			`//`
			`//--------------------------------------------------------------------`


			`#ifdef HAVE_CONFIG_H`
			`#include "htconfig.h"`
			`#endif /* HAVE_CONFIG_H */`

			`#include "TextCollector.h"`
			`#include "htdig.h"`
			`#include "HtWordList.h"`
			`#include "WordRecord.h"`
			`#include "URLRef.h"`
			`#include "Server.h"`
			`#include "Parsable.h"`
			`#include "BasicDocument.h"`
			`#include "StringList.h"`
			`#include "WordType.h"`
			`#include "md5.h"`
			`#include "defaults.h"`

			`#include <signal.h>`
			`#include <stdio.h>`

			`#include <sys/timeb.h>`


			`//*****************************************************************************`
			`// TextCollector::TextCollector()`
			`//`
			`TextCollector::TextCollector(TextCollectorLog flags):`
			`words(*(HtConfiguration::config()))`
			`{`
			`HtConfiguration *config = HtConfiguration::config();`
			`//FILE *urls_parsed;`

			`currenthopcount = 0;`

			`//turn on word tracking!`
			`trackWords = 1;`

			`//`
			`// Initialize the flags for the various HTML factors`
			`//`

			`// text_factor`
			`factor[0] = FLAG_TEXT;`
			`// title_factor`
			`factor[1] = FLAG_TITLE;`
			`// heading factor (now generic)`
			`factor[2] = FLAG_HEADING;`
			`factor[3] = FLAG_HEADING;`
			`factor[4] = FLAG_HEADING;`
			`factor[5] = FLAG_HEADING;`
			`factor[6] = FLAG_HEADING;`
			`factor[7] = FLAG_HEADING;`
			`// img alt text`
			`//factor[8] = FLAG_KEYWORDS;`
			`factor[8] = FLAG_TEXT; // treat alt text as plain text, until it has`
			`// its own FLAG and factor.`
			`// keywords factor`
			`factor[9] = FLAG_KEYWORDS;`
			`// META description factor`
			`factor[10] = FLAG_DESCRIPTION;`

			`doc = NULL;`
			`minimumWordLength = config->Value("minimum_word_length", 3);`


			`//TODO put document-index log file stuff here via logs like Retriever`

			`check_unique_md5 = config->Boolean("check_unique_md5", 0);`
			`check_unique_date = config->Boolean("check_unique_date", 0);`

			`d_md5 = 0;`
			`if (check_unique_md5)`
			`{`
			`d_md5 = Database::getDatabaseInstance(DB_HASH);`

			`if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)`
			`{`
			`cerr << "DocumentDB::Open: " << config->Find("md5_db") << " " << strerror(errno) << "\n";`
			`}`
			`}`

			`temp_doc_count = 0;`

			`}`


			`//*****************************************************************************`
			`// TextCollector::~TextCollector()`
			`//`
			`TextCollector::~TextCollector()`
			`{`
			`if (d_md5)`
			`d_md5->Close();`
			`//delete doc;`

			`if(temp_doc_count != 0)`
			`{`
			`words.Flush();`
			`temp_doc_count = 0;`
			`}`

			`words.Flush();`
			`words.Close();`

			`}`


			`//*****************************************************************************`
			`// void TextCollector::IndexDoc()`
			`//`
			`//`

			`int`
			`TextCollector::IndexDoc(BasicDocument & a_basicdoc)`
			`{`
			`DocumentRef *ref;`
			`time_t date;`
			`int old_document = 0;`
			`static int index = 0;`

			`//struct timeb tb;`

			`//HtConfiguration *config = HtConfiguration::config();`

			`doc = &a_basicdoc;`

			`ref = docs[doc->Location()]; // It might be nice to have just an Exists() here`
			`if (ref)`
			`{`
			`//`
			`// We already have an entry for this document in our database.`
			`// This means we can get the document ID and last modification`
			`// time from there.`
			`//`
			`current_id = ref->DocID();`
			`date = ref->DocTime();`
			`if (ref->DocAccessed())`
			`old_document = 1;`
			`else // we haven't retrieved it yet, so we only have the first link`
			`old_document = 0;`
			`ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link`
			`ref->DocAccessed(time(0));`
			`ref->DocState(Reference_normal);`
			`currenthopcount = ref->DocHopCount();`
			`}`
			`else`
			`{`
			`//`
			`// Never seen this document before. We need to create an`
			`// entry for it. This implies that it gets a new document ID.`
			`//`

			`date = 0;`

			`current_id = docs.NextDocID();`
			`ref = new DocumentRef;`
			`ref->DocID(current_id);`
			`ref->DocURL(doc->Location());`
			`ref->DocState(Reference_normal);`
			`ref->DocAccessed(time(0));`
			`ref->DocHopCount(0);`
			`ref->DocBackLinks(1); // We had to have a link to get here!`
			`old_document = 0;`
			`}`

			`word_context.DocID(ref->DocID());`

			`if (debug > 0)`
			`{`
			`//`
			`// Display progress`
			`//`
			`cout << index++ << ':' << current_id << ':' << currenthopcount << ':' << doc->Location() <<`
			`": ";`
			`cout.flush();`
			`}`

			`//printf("New Doc\n");`
			`//ftime(&tb);`
			`//fprintf(stderr, "[1] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);`

			`RetrievedDocument(ref);`

			`//ftime(&tb);`
			`//fprintf(stderr, "[2] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);`

			`if(temp_doc_count > 250)`
			`{`
			`//words.Flush();`
			`temp_doc_count = 0;`
			`}`
			`else`
			`{`
			`temp_doc_count++;`
			`}`

			`//ftime(&tb);`
			`//fprintf(stderr, "[3] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);`

			`docs.Add(*ref);`

			`//ftime(&tb);`
			`//fprintf(stderr, "[4] TIME: [%s] [%d]\n", ctime(&tb.time), tb.millitm);`

			`delete ref;`

			`words.Flush();`
			`//words.Close();`

			`if (urls_seen)`
			`{`
			`fprintf(urls_seen, "%s\|%d\|%s\|%d\|0\|1\n",`
			`(const char *) doc->Location(), doc->Length(), doc->ContentType(),`
			`(int) doc->ModTime());`
			`}`


			`return(1);`
			`}`

			`int TextCollector::FlushWordDB()`
			`{`
			`if(temp_doc_count != 0)`
			`{`
			`words.Flush();`
			`temp_doc_count = 0;`
			`}`

			`words.Flush();`
			`words.Close();`
			`return(1);`
			`}`

			`//*****************************************************************************`
			`// void TextCollector::RetrievedDocument(Document &doc, const String &url, DocumentRef *ref)`
			`// We found a document that needs to be parsed. Since we don't know the`
			`// document type, we'll let the Document itself return an appropriate`
			`// Parsable object which we can call upon to parse the document contents.`
			`//`
			`void`
			`TextCollector::RetrievedDocument(DocumentRef * ref)`
			`{`
			`n_links = 0;`
			`current_ref = ref;`
			`current_title = 0;`
			`word_context.Anchor(0);`
			`current_time = 0;`
			`current_head = 0;`
			`current_meta_dsc = 0;`
			`time_t doc_time;`

			`//Check if the Document is self-parseable`
			`//We will pass ourselves as a callback object for all the got_*() routines`
			`if (doc->SelfParseable() == TRUE)`
			`{`
			`doc->internalParser(*this);`
			`}`
			`else`
			`{`
			`// Create a parser object and let it have a go at the document.`
			`// We will pass ourselves as a callback object for all the got_*()`
			`// routines.`
			`// This will generate the Parsable object as a specific parser`
			`/*`
			`Parsable *parsable = doc->getParsable();`
			`if (parsable)`
			`parsable->parse(this, base);`
			`else`
			`{ // If we didn't get a parser, then we should get rid of this!`
			`ref->DocState(Reference_noindex);`
			`return;`
			`}`
			`*/`
			`}`

			`// We don't need to dispose of the parsable object since it will`
			`// automatically be reused.`


			`//`
			`// Update the document reference`
			`//`
			`ref->DocTitle((char *) current_title);`
			`ref->DocHead((char *) current_head);`
			`ref->DocMetaDsc((char *) current_meta_dsc);`

			`/* if (current_time == 0)`
			`ref->DocTime(doc->ModTime());`
			`else`
			`ref->DocTime(current_time); */`

			`doc_time = doc->ModTime();`
			`if(doc_time != 0)`
			`ref->DocTime(doc_time);`
			`else`
			`ref->DocTime(time(NULL));`

			`ref->DocSize(doc->Length());`
			`ref->DocAccessed(time(0));`
			`ref->DocLinks(n_links);`
			`}`


			`//*****************************************************************************`
			`// void TextCollector::got_word(char *word, int location, int heading)`
			`// The location is normalized to be in the range 0 - 1000.`
			`//`
			`void`
			`TextCollector::got_word(const char *word, int location, int heading)`
			`{`
			`if (debug > 3)`
			`cout << "word: " << word << '@' << location << endl;`
			`if (heading >= 11 \|\| heading < 0) // Current limits for headings`
			`heading = 0; // Assume it's just normal text`

			`if ((trackWords) && (strlen(word) >= minimumWordLength))`
			`{`
			`String w = word;`
			`HtWordReference wordRef;`

			`wordRef.Location(location);`
			`wordRef.Flags(factor[heading]);`

			`wordRef.Word(w);`
			`words.Replace(WordReference::Merge(wordRef, word_context));`

			`#ifdef DEBUG`
			`cout << "Adding: [" << w << "]"<< endl; //NEALR`
			`#endif`

			`// Check for compound words...`
			`String parts = word;`
			`int added;`
			`int nparts = 1;`
			`do`
			`{`
			`added = 0;`
			`char *start = parts.get();`
			`char punctp = 0, nextp = 0, *p;`
			`char punct;`
			`int n;`
			`while (*start)`
			`{`
			`p = start;`
			`for (n = 0; n < nparts; n++)`
			`{`
			`while (HtIsStrictWordChar((unsigned char) *p))`
			`p++;`
			`punctp = p;`
			`if (!*punctp && n + 1 < nparts)`
			`break;`
			`while (p && !HtIsStrictWordChar((unsigned char) p))`
			`p++;`
			`if (n == 0)`
			`nextp = p;`
			`}`
			`if (n < nparts)`
			`break;`
			`punct = *punctp;`
			`*punctp = '\0';`
			`if (start && (p \|\| start > parts.get()))`
			`{`
			`w = start;`
			`HtStripPunctuation(w);`
			`if (w.length() >= minimumWordLength)`
			`{`
			`wordRef.Word(w);`
			`words.Replace(WordReference::Merge(wordRef, word_context));`
			`if (debug > 3)`
			`cout << "word part: " << start << '@' << location << endl;`

			`#ifdef DEBUG`
			`cout << "Adding: [" << w << "]"<< endl; //NEALR`
			`#endif`
			`}`
			`added++;`
			`}`
			`start = nextp;`
			`*punctp = punct;`
			`}`
			`nparts++;`
			`}`
			`while (added > 2);`
			`}`
			`}`


			`//*****************************************************************************`
			`// void TextCollector::got_title(const char *title)`
			`//`
			`void`
			`TextCollector::got_title(const char *title)`
			`{`
			`if (debug > 1)`
			`cout << "\ntitle: " << title << endl;`
			`current_title = title;`
			`}`

			`//*****************************************************************************`
			`// void TextCollector::got_time(const char *time)`
			`//`
			`void`
			`TextCollector::got_time(const char *time)`
			`{`
			`HtDateTime new_time(current_time);`

			`if (debug > 1)`
			`cout << "\ntime: " << time << endl;`

			`//`
			`// As defined by the Dublin Core, this should be YYYY-MM-DD`
			`// In the future, we'll need to deal with the scheme portion`
			`// in case someone picks a different format.`
			`//`
			`new_time.SetFTime(time, "%Y-%m-%d");`
			`current_time = new_time.GetTime_t();`

			`// If we can't convert it, current_time stays the same and we get`
			`// the default--the date returned by the server...`
			`}`

			`//*****************************************************************************`
			`// void TextCollector::got_head(const char *head)`
			`//`
			`void`
			`TextCollector::got_head(const char *head)`
			`{`
			`if (debug > 4)`
			`cout << "head: " << head << endl;`
			`current_head = head;`
			`}`

			`//*****************************************************************************`
			`// void TextCollector::got_meta_dsc(const char *md)`
			`//`
			`void`
			`TextCollector::got_meta_dsc(const char *md)`
			`{`
			`if (debug > 4)`
			`cout << "meta description: " << md << endl;`
			`current_meta_dsc = md;`
			`}`


			`//*****************************************************************************`
			`// void TextCollector::got_meta_email(const char *e)`
			`//`
			`void`
			`TextCollector::got_meta_email(const char *e)`
			`{`
			`if (debug > 1)`
			`cout << "\nmeta email: " << e << endl;`
			`current_ref->DocEmail(e);`
			`}`


			`//*****************************************************************************`
			`// void TextCollector::got_meta_notification(const char *e)`
			`//`
			`void`
			`TextCollector::got_meta_notification(const char *e)`
			`{`
			`if (debug > 1)`
			`cout << "\nmeta notification date: " << e << endl;`
			`current_ref->DocNotification(e);`
			`}`


			`//*****************************************************************************`
			`// void TextCollector::got_meta_subject(const char *e)`
			`//`
			`void`
			`TextCollector::got_meta_subject(const char *e)`
			`{`
			`if (debug > 1)`
			`cout << "\nmeta subect: " << e << endl;`
			`current_ref->DocSubject(e);`
			`}`


			`//*****************************************************************************`
			`// void TextCollector::got_noindex()`
			`//`
			`void`
			`TextCollector::got_noindex()`
			`{`
			`if (debug > 1)`
			`cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;`
			`current_ref->DocState(Reference_noindex);`
			`}`