You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

656 lines
16 KiB
C++

//
// DocumentDB.cc
//
// DocumentDB: This class is the interface to the database of document
// references. This database is only used while digging.
// An extract of this database is used for searching.
// This is because digging requires a different index
// than searching.
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: DocumentDB.cc,v 1.34 2004/05/28 13:15:12 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include "DocumentDB.h"
#include "Database.h"
#include "HtURLCodec.h"
#include "IntObject.h"
#include "HtZlibCodec.h"
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#ifndef _MSC_VER /* _WIN32 */
#include <unistd.h>
#endif
#ifdef HAVE_STD
#include <iostream>
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <iostream.h>
#include <fstream.h>
#endif /* HAVE_STD */
#include <errno.h>
//*****************************************************************************
// DocumentDB::DocumentDB()
//
DocumentDB::DocumentDB()
{
isopen = 0;
isread = 0;
// The first document number (NEXT_DOC_ID_RECORD) is used to
// store the nextDocID number itself into. We avoid using
// an all-0 key for this, mostly for being superstitious
// about letting in bugs.
nextDocID = NEXT_DOC_ID_RECORD + 1;
}
//*****************************************************************************
// DocumentDB::~DocumentDB()
//
DocumentDB::~DocumentDB()
{
Close();
}
//*****************************************************************************
// int DocumentDB::Open(char *filename, char *indexname, char *headname)
// We will attempt to open up an existing document database. If it
// doesn't exist, we'll create a new one. If we are succesful in
// opening the database, we need to look for our special record
// which contains the next document ID to use.
// There may also be an URL -> DocID index database to take
// care of, as well as a DocID -> DocHead excerpt database.
//
int DocumentDB::Open(const String& filename, const String& indexfilename, const String& headname)
{
// If the database is already open, we'll close it
// We might be opening this object with a new filename, so we'll be safe
Close();
dbf = 0;
i_dbf = 0;
h_dbf = 0;
i_dbf = Database::getDatabaseInstance(DB_HASH);
if (i_dbf->OpenReadWrite(indexfilename, 0666) != OK) {
cerr << "DocumentDB::Open: " << indexfilename << " " << strerror(errno) << "\n";
return NOTOK;
}
h_dbf = Database::getDatabaseInstance(DB_HASH);
if (h_dbf->OpenReadWrite(headname, 0666) != OK) {
cerr << "DocumentDB::Open: " << headname << " " << strerror(errno) << "\n";
return NOTOK;
}
dbf = Database::getDatabaseInstance(DB_HASH);
if (dbf->OpenReadWrite(filename, 0666) == OK)
{
String data;
int specialRecordNumber = NEXT_DOC_ID_RECORD;
String key((char *) &specialRecordNumber,
sizeof specialRecordNumber);
if (dbf->Get(key, data) == OK)
{
memcpy(&nextDocID, data.get(), sizeof nextDocID);
}
isopen = 1;
return OK;
}
else {
cerr << "DocumentDB::Open: " << filename << " " << strerror(errno) << "\n";
return NOTOK;
}
}
//*****************************************************************************
// int DocumentDB::Read(char *filename, char *indexname, char *headname)
// We will attempt to open up an existing document database,
// and accompanying index database and excerpt database
//
int DocumentDB::Read(const String& filename, const String& indexfilename , const String& headfilename )
{
// If the database is already open, we'll close it
// We might be opening this object with a new filename, so we'll be safe
Close();
dbf = 0;
i_dbf = 0;
h_dbf = 0;
if (!indexfilename.empty())
{
i_dbf = Database::getDatabaseInstance(DB_HASH);
if (i_dbf->OpenRead(indexfilename) != OK)
return NOTOK;
}
if (!headfilename.empty())
{
h_dbf = Database::getDatabaseInstance(DB_HASH);
if (h_dbf->OpenRead(headfilename) != OK)
return NOTOK;
}
dbf = Database::getDatabaseInstance(DB_HASH);
if (dbf->OpenRead(filename) == OK)
{
isopen = 1;
isread = 1;
return OK;
}
else
return NOTOK;
}
//*****************************************************************************
// int DocumentDB::Close()
// Close the database. Before we close it, we first need to update
// the special record which keeps track our nextDocID variable.
//
int DocumentDB::Close()
{
if (!isopen) return OK;
if (!isread)
{
int specialRecordNumber = NEXT_DOC_ID_RECORD;
String key((char *) &specialRecordNumber,
sizeof specialRecordNumber);
String data((char *) &nextDocID, sizeof nextDocID);
dbf->Put(key, data);
}
if (i_dbf)
{
i_dbf->Close();
delete i_dbf;
i_dbf = 0;
}
if (h_dbf)
{
h_dbf->Close();
delete h_dbf;
h_dbf = 0;
}
dbf->Close();
delete dbf;
dbf = 0;
isopen = 0;
isread = 0;
return OK;
}
//*****************************************************************************
// int DocumentDB::Add(DocumentRef &doc)
//
int DocumentDB::Add(DocumentRef &doc)
{
int docID = doc.DocID();
String temp = 0;
doc.Serialize(temp);
String key((char *) &docID, sizeof docID);
dbf->Put(key, temp);
if (h_dbf)
{
if (doc.DocHeadIsSet())
{
temp = HtZlibCodec::instance()->encode(doc.DocHead());
h_dbf->Put(key, temp);
}
}
else
// If there was no excerpt index when we write, something is wrong.
return NOTOK;
if (i_dbf)
{
temp = doc.DocURL();
i_dbf->Put(HtURLCodec::instance()->encode(temp), key);
return OK;
}
else
// If there was no index when we write, something is wrong.
return NOTOK;
}
//*****************************************************************************
// int DocumentDB::ReadExcerpt(DocumentRef &ref)
// We will attempt to access the excerpt for this ref
//
int DocumentDB::ReadExcerpt(DocumentRef &ref)
{
String data;
int docID = ref.DocID();
String key((char *) &docID, sizeof docID);
if (!h_dbf)
return NOTOK;
if (h_dbf->Get(key, data) == NOTOK)
return NOTOK;
ref.DocHead((char*)HtZlibCodec::instance()->decode(data));
return OK;
}
//*****************************************************************************
// DocumentRef *DocumentDB::operator [] (int docID)
//
DocumentRef *DocumentDB::operator [] (int docID)
{
String data;
String key((char *) &docID, sizeof docID);
if (dbf->Get(key, data) == NOTOK)
return 0;
DocumentRef *ref = new DocumentRef;
ref->Deserialize(data);
return ref;
}
//*****************************************************************************
// DocumentRef *DocumentDB::operator [] (const String& u)
//
DocumentRef *DocumentDB::operator [] (const String& u)
{
String data;
String docIDstr;
// If there is no index db, then just give up
// (do *not* construct a list and traverse it).
if (i_dbf == 0)
return 0;
else
{
String url(u);
if (i_dbf->Get(HtURLCodec::instance()->encode(url), docIDstr) == NOTOK)
return 0;
}
if (dbf->Get(docIDstr, data) == NOTOK)
return 0;
DocumentRef *ref = new DocumentRef;
ref->Deserialize(data);
return ref;
}
//*****************************************************************************
// int DocumentDB::Exists(int docID)
//
int DocumentDB::Exists(int docID)
{
String key((char *) &docID, sizeof docID);
return dbf->Exists(key);
}
//*****************************************************************************
// int DocumentDB::Delete(int docID)
//
int DocumentDB::Delete(int docID)
{
String key((char*) &docID, sizeof docID);
String data;
if (i_dbf == 0 || dbf->Get(key, data) == NOTOK)
return NOTOK;
DocumentRef *ref = new DocumentRef;
ref->Deserialize(data);
String url = ref->DocURL();
delete ref;
// We have to be really careful about deleting by URL, we might
// have a newer "edition" with the same URL and different DocID
String docIDstr;
String encodedURL = HtURLCodec::instance()->encode(url);
if (i_dbf->Get(encodedURL, docIDstr) == NOTOK)
return NOTOK;
// Only delete if we have a match between what we want to delete
// and what's in the database
if (key == docIDstr && i_dbf->Delete(encodedURL) == NOTOK)
return NOTOK;
if (h_dbf == 0 || h_dbf->Delete(key) == NOTOK)
return NOTOK;
return dbf->Delete(key);
}
//*****************************************************************************
// int DocumentDB::DumpDB(char *filename, int verbose)
// Create an extract from our database which can be used by an
// external application. The extract will consist of lines with fields
// separated by tabs.
//
// The extract will likely not be sorted by anything in particular
//
int DocumentDB::DumpDB(const String& filename, int verbose)
{
DocumentRef *ref;
List *descriptions, *anchors;
char *strkey;
String data;
FILE *fl;
String docKey(sizeof(int));
if((fl = fopen(filename, "w")) == 0) {
perror(form("DocumentDB::DumpDB: opening %s for writing",
(const char*)filename));
return NOTOK;
}
dbf->Start_Get();
while ((strkey = dbf->Get_Next()))
{
int docID;
memcpy(&docID, strkey, sizeof docID);
docKey = 0;
docKey.append((char *) &docID, sizeof docID);
dbf->Get(docKey, data);
if (docID != NEXT_DOC_ID_RECORD)
{
ref = new DocumentRef;
ref->Deserialize(data);
if (h_dbf)
{
h_dbf->Get(docKey,data);
ref->DocHead((char*)HtZlibCodec::instance()->decode(data));
}
fprintf(fl, "%d", ref->DocID());
fprintf(fl, "\tu:%s", ref->DocURL());
fprintf(fl, "\tt:%s", ref->DocTitle());
fprintf(fl, "\ta:%d", ref->DocState());
fprintf(fl, "\tm:%d", (int) ref->DocTime());
fprintf(fl, "\ts:%d", ref->DocSize());
fprintf(fl, "\tH:%s", ref->DocHead());
fprintf(fl, "\th:%s", ref->DocMetaDsc());
fprintf(fl, "\tl:%d", (int) ref->DocAccessed());
fprintf(fl, "\tL:%d", ref->DocLinks());
fprintf(fl, "\tb:%d", ref->DocBackLinks());
fprintf(fl, "\tc:%d", ref->DocHopCount());
fprintf(fl, "\tg:%d", ref->DocSig());
fprintf(fl, "\te:%s", ref->DocEmail());
fprintf(fl, "\tn:%s", ref->DocNotification());
fprintf(fl, "\tS:%s", ref->DocSubject());
fprintf(fl, "\td:");
descriptions = ref->Descriptions();
String *description;
descriptions->Start_Get();
int first = 1;
while ((description = (String *) descriptions->Get_Next()))
{
if (!first)
fprintf(fl, "\001");
first = 0;
fprintf(fl, "%s", description->get());
}
fprintf(fl, "\tA:");
anchors = ref->DocAnchors();
String *anchor;
anchors->Start_Get();
first = 1;
while ((anchor = (String *) anchors->Get_Next()))
{
if (!first)
fprintf(fl, "\001");
first = 0;
fprintf(fl, "%s", anchor->get());
}
fprintf(fl, "\n");
delete ref;
}
}
fclose(fl);
return OK;
}
//*****************************************************************************
// int DocumentDB::LoadDB(const String &filename, int verbose)
// Load an extract to our database from an ASCII file
// The extract will consist of lines with fields separated by tabs.
// The lines need not be sorted in any fashion.
//
int DocumentDB::LoadDB(const String& filename, int verbose)
{
FILE *input;
String docKey(sizeof(int));
DocumentRef ref;
StringList descriptions, anchors;
char *token, field;
String data;
if((input = fopen(filename, "r")) == 0) {
perror(form("DocumentDB::LoadDB: opening %s for reading",
(const char*)filename));
return NOTOK;
}
while (data.readLine(input))
{
token = strtok(data, "\t");
if (token == NULL)
continue;
ref.DocID(atoi(token));
if (verbose)
cout << "\t loading document ID: " << ref.DocID() << endl;
while ( (token = strtok(0, "\t")) )
{
field = *token;
token += 2;
if (verbose > 2)
cout << "\t field: " << field;
switch(field)
{
case 'u': // URL
ref.DocURL(token);
break;
case 't': // Title
ref.DocTitle(token);
break;
case 'a': // State
ref.DocState(atoi(token));
break;
case 'm': // Modified
ref.DocTime(atoi(token));
break;
case 's': // Size
ref.DocSize(atoi(token));
break;
case 'H': // Head
ref.DocHead(token);
break;
case 'h': // Meta Description
ref.DocMetaDsc(token);
break;
case 'l': // Accessed
ref.DocAccessed(atoi(token));
break;
case 'L': // Links
ref.DocLinks(atoi(token));
break;
case 'b': // BackLinks
ref.DocBackLinks(atoi(token));
break;
case 'c': // HopCount
ref.DocHopCount(atoi(token));
break;
case 'g': // Signature
ref.DocSig(atoi(token));
break;
case 'e': // E-mail
ref.DocEmail(token);
break;
case 'n': // Notification
ref.DocNotification(token);
break;
case 'S': // Subject
ref.DocSubject(token);
break;
case 'd': // Descriptions
descriptions.Create(token, '\001');
ref.Descriptions(descriptions);
break;
case 'A': // Anchors
anchors.Create(token, '\001');
ref.DocAnchors(anchors);
break;
default:
break;
}
}
// We must be careful if the document already exists
// So we'll delete the old document and add the new one
if (Exists(ref.DocID()))
{
Delete(ref.DocID());
}
Add(ref);
// If we add a record with an ID past nextDocID, update it
if (ref.DocID() > nextDocID)
nextDocID = ref.DocID() + 1;
descriptions.Destroy();
anchors.Destroy();
}
fclose(input);
return OK;
}
//*****************************************************************************
// List *DocumentDB::URLs()
// Return a list of all the URLs in the database
// Only available when there's an URL -> DocID index db handy.
//
List *DocumentDB::URLs()
{
List *list = new List;
char *coded_key;
if (i_dbf == 0)
return 0;
i_dbf->Start_Get();
while ((coded_key = i_dbf->Get_Next()))
{
String *key = new String(HtURLCodec::instance()->decode(coded_key));
list->Add(key);
}
return list;
}
//*****************************************************************************
// List *DocumentDB::DocIDs()
// Return a list of all the DocIDs in the database
//
List *DocumentDB::DocIDs()
{
List *list = new List;
char *key;
dbf->Start_Get();
while ((key = dbf->Get_Next()))
{
int docID;
memcpy (&docID, key, sizeof docID);
if (docID != NEXT_DOC_ID_RECORD)
list->Add(new IntObject(docID));
}
return list;
}
//*****************************************************************************
// private
// int readLine(FILE *in, String &line)
//
int readLine(FILE *in, String &line)
{
char buffer[2048];
int length;
line = 0;
while (fgets(buffer, sizeof(buffer), in))
{
length = strlen(buffer);
if (buffer[length - 1] == '\n')
{
//
// A full line has been read. Return it.
//
line << buffer;
line.chop('\n');
return 1;
}
else
{
//
// Only a partial line was read. Append it to the line
// and read some more.
//
line << buffer;
}
}
return line.length() > 0;
}
// End of DocumentDB.cc