You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

400 lines
10 KiB
C++

//
// htpurge.cc
//
// htpurge: A utility to remove specified URLs and any documents
// marked for removal from the word and document databases.
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: htpurge.cc,v 1.6 2004/05/28 13:15:25 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include <fcntl.h>
#include "WordContext.h"
#include "HtWordReference.h"
#include "HtConfiguration.h"
#include "DocumentDB.h"
#include "DocumentRef.h"
#include "defaults.h"
#include "HtURLCodec.h"
#include <errno.h>
#ifndef _MSC_VER /* _WIN32 */
#include <unistd.h>
#endif
// If we have this, we probably want it.
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#elif HAVE_GETOPT_LOCAL
#include <getopt_local.h>
#endif
int verbose = 0;
Dictionary *purgeDocs(Dictionary *);
void purgeWords(Dictionary *);
void usage();
void reportError(char *msg);
//*****************************************************************************
// int main(int ac, char **av)
//
int main(int ac, char **av)
{
int alt_work_area = 0;
String configfile = DEFAULT_CONFIG_FILE;
int c;
extern char *optarg;
Dictionary *discard_ids = 0;
Dictionary *discard_urls = new Dictionary;
while ((c = getopt(ac, av, "vc:au:")) != -1)
{
switch (c)
{
case 'c':
configfile = optarg;
break;
case 'v':
verbose++;
break;
case 'a':
alt_work_area++;
break;
case 'u':
discard_urls->Add(optarg, NULL);
break;
case '?':
usage();
break;
}
}
HtConfiguration* config= HtConfiguration::config();
config->Defaults(&defaults[0]);
if (access((char*)configfile, R_OK) < 0)
{
reportError(form("Unable to find configuration file '%s'",
configfile.get()));
}
config->Read(configfile);
//
// Check url_part_aliases and common_url_parts for
// errors.
String url_part_errors = HtURLCodec::instance()->ErrMsg();
if (url_part_errors.length() != 0)
reportError(form("Invalid url_part_aliases or common_url_parts: %s",
url_part_errors.get()));
if (alt_work_area != 0)
{
String configValue;
configValue = config->Find("word_db");
if (configValue.length() != 0)
{
configValue << ".work";
config->Add("word_db", configValue);
}
configValue = config->Find("doc_db");
if (configValue.length() != 0)
{
configValue << ".work";
config->Add("doc_db", configValue);
}
configValue = config->Find("doc_index");
if (configValue.length() != 0)
{
configValue << ".work";
config->Add("doc_index", configValue);
}
configValue = config->Find("doc_excerpt");
if (configValue.length() != 0)
{
configValue << ".work";
config->Add("doc_excerpt", configValue);
}
}
if (optind < ac && strcmp(av[optind], "-") == 0)
{
String str;
while (!cin.eof())
{
cin >> str;
str.chop("\r\n");
if (str.length() > 0)
discard_urls->Add(str, NULL);
}
}
WordContext::Initialize(*config);
// We pass in our list of URLs (which may be empty)
// and we get back the list of IDs purged from the doc DB
// to make sure words with these IDs are purged
discard_ids = purgeDocs(discard_urls);
delete discard_urls;
discard_urls = 0;
purgeWords(discard_ids);
delete discard_ids;
discard_ids = 0;
return 0;
}
//*****************************************************************************
// Dictionary purgeDocs(Dictionary &purgeURLs)
// Pass in a hash of the URLs to delete (it could be empty)
// Return a hash of the IDs deleted from the doc DB
//
Dictionary *purgeDocs(Dictionary *purgeURLs)
{
HtConfiguration* config= HtConfiguration::config();
const String doc_db = config->Find("doc_db");
const String doc_index = config->Find("doc_index");
const String doc_excerpt = config->Find("doc_excerpt");
int remove_unused;
int remove_unretrieved;
DocumentDB db;
List *IDs;
int document_count = 0;
Dictionary *discard_list = new Dictionary;
//
// Start the conversion by going through all the URLs that are in
// the document database
//
if(db.Open(doc_db, doc_index, doc_excerpt) != OK)
return discard_list; // It's empty right now
IDs = db.DocIDs();
if (IDs->Count() == 0)
reportError("Database is empty!");
IDs->Start_Get();
IntObject *id;
String idStr;
String url;
URL u_url;
while ((id = (IntObject *) IDs->Get_Next()))
{
DocumentRef *ref = db[id->Value()];
if (!ref)
continue;
db.ReadExcerpt(*ref);
url = ref->DocURL();
u_url = URL((char *)url);
remove_unused = config->Boolean("server", u_url.host() ,"remove_bad_urls");
remove_unretrieved = config->Boolean("server", u_url.host(), "remove_unretrieved_urls");
idStr = 0;
idStr << id->Value();
if (ref->DocState() == Reference_noindex)
{
// This document either wasn't found or shouldn't be indexed.
db.Delete(ref->DocID());
if (verbose)
cout << "Deleted, noindex: ID: " << idStr << " URL: "
<< url << endl;
discard_list->Add(idStr.get(), NULL);
}
else if (ref->DocState() == Reference_obsolete)
{
// This document was replaced by a newer one
db.Delete(ref->DocID());
if (verbose)
cout << "Deleted, obsolete: ID: " << idStr << " URL: "
<< url << endl;
discard_list->Add(idStr.get(), NULL);
}
else if (remove_unused && ref->DocState() == Reference_not_found)
{
// This document wasn't actually found
db.Delete(ref->DocID());
if (verbose)
cout << "Deleted, not found: ID: " << idStr << " URL: "
<< url << endl;
discard_list->Add(idStr.get(), NULL);
}
else if (remove_unused && strlen(ref->DocHead()) == 0
&& ref->DocAccessed() != 0)
{
// For some reason, this document was retrieved, but doesn't
// have an excerpt (probably because of a noindex directive)
db.Delete(ref->DocID());
if (verbose)
cout << "Deleted, no excerpt: ID: " << idStr << " URL: "
<< url << endl;
discard_list->Add(idStr.get(), NULL);
}
else if (remove_unretrieved && ref->DocAccessed() == 0)
{
// This document has not been retrieved
db.Delete(ref->DocID());
if (verbose)
cout << "Deleted, never retrieved: ID: " << idStr << " URL: "
<< url << endl;
discard_list->Add(idStr.get(), NULL);
}
else if (purgeURLs->Exists(url))
{
// This document has been marked to be purged by the user
db.Delete(ref->DocID());
if (verbose)
cout << "Deleted, marked by user input: ID: " << idStr << " URL: "
<< url << endl;
discard_list->Add(idStr.get(), NULL);
}
else
{
// This is a valid document. Let's keep stats on it.
if (verbose > 1)
cout << "ID: " << idStr << " URL: " << url << endl;
document_count++;
if (verbose && document_count % 10 == 0)
{
cout << "htpurge: " << document_count << '\n';
cout.flush();
}
}
delete ref;
}
if (verbose)
cout << "\n";
delete IDs;
db.Close();
return discard_list;
}
//
// Callback data dedicated to Dump and dump_word communication
//
class DeleteWordData : public Object
{
public:
DeleteWordData(const Dictionary& discard_arg) : discard(discard_arg) { deleted = remains = 0; }
const Dictionary& discard;
int deleted;
int remains;
};
//*****************************************************************************
//
//
static int delete_word(WordList *words, WordDBCursor& cursor, const WordReference *word_arg, Object &data)
{
const HtWordReference *word = (const HtWordReference *)word_arg;
DeleteWordData& d = (DeleteWordData&)data;
static String docIDStr;
docIDStr = 0;
docIDStr << word->DocID();
if(d.discard.Exists(docIDStr)) {
if(words->Delete(cursor) != 0) {
cerr << "htpurge: deletion of " << (char*)word->Get() << " failed " << strerror(errno) << "\n";
return NOTOK;
}
if (verbose)
{
cout << "htpurge: Discarding ";
if(verbose > 2)
cout << (char*)word->Get();
else
cout << word->Word();
cout << "\n";
cout.flush();
}
d.deleted++;
} else {
d.remains++;
}
return OK;
}
//*****************************************************************************
// void purgeWords(Dictionary *discard_list)
//
void purgeWords(Dictionary *discard_list)
{
HtConfiguration* config= HtConfiguration::config();
HtWordList words(*config);
DeleteWordData data(*discard_list);
words.Open(config->Find("word_db"), O_RDWR);
WordCursor* search = words.Cursor(delete_word, &data);
search->Walk();
delete search;
words.Close();
if (verbose)
cout << "\n";
}
//*****************************************************************************
// void usage()
// Display program usage information
//
void usage()
{
cout << "usage: htpurge [-][-u url][-v][-a][-c configfile]\n";
cout << "This program is part of ht://Dig " << VERSION << "\n\n";
cout << "Options:\n";
cout << "\t-\tURL input. Read in a list of URLs to remove, one per line.\n\n";
cout << "\t-u\tURL input. Add this url to the list of URLs to remove.\n";
cout << "\t\t(can be specified multiple times)\n\n";
cout << "\t-v\tVerbose mode. This increases the verbosity of the\n";
cout << "\t\tprogram. Using more than 2 is probably only useful\n";
cout << "\t\tfor debugging purposes. The default verbose mode\n";
cout << "\t\tgives a progress on what it is doing and where it is.\n\n";
cout << "\t-a\tUse alternate work files.\n";
cout << "\t\tTells htpurge to append .work to the database files \n";
cout << "\t\tallowing it to operate on a second set of databases.\n\n";
cout << "\t-c configfile\n";
cout << "\t\tUse the specified configuration file instead on the\n";
cout << "\t\tdefault.\n\n";
exit(0);
}
//*****************************************************************************
// Report an error and die
//
void reportError(char *msg)
{
cout << "htpurge: " << msg << "\n\n";
exit(1);
}