You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1059 lines
28 KiB
1059 lines
28 KiB
//-------------------------------------------------------------
|
|
//
|
|
// libhtdig_htdig.cc
|
|
//
|
|
// 1/25/2002 created from htdig.cc
|
|
//
|
|
// Neal Richter nealr@rightnow.com
|
|
//
|
|
// libhtdig_htdig.cc
|
|
//
|
|
// htdig: Indexes the web sites specified in the config file
|
|
// generating several databases to be used by htmerge
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 1995-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: libhtdig_htdig.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
|
|
//
|
|
//-------------------------------------------------------------
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "htconfig.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
#ifdef HAVE_STD
|
|
#include <iostream>
|
|
#ifdef HAVE_NAMESPACES
|
|
using namespace std;
|
|
#endif
|
|
#else
|
|
#include <iostream.h>
|
|
#endif /* HAVE_STD */
|
|
|
|
extern "C" {
|
|
#include "libhtdig_api.h"
|
|
}
|
|
|
|
#include "libhtdig_log.h"
|
|
|
|
#include "BasicDocument.h"
|
|
#include "Document.h"
|
|
#include "TextCollector.h"
|
|
#include "Retriever.h"
|
|
#include "StringList.h"
|
|
#include "htdig.h"
|
|
#include "defaults.h"
|
|
#include "HtURLCodec.h"
|
|
#include "WordContext.h"
|
|
#include "HtDateTime.h"
|
|
#include "HtURLRewriter.h"
|
|
#include "URL.h"
|
|
#include "Server.h"
|
|
|
|
////////////////////////////
|
|
// For cookie jar
|
|
////////////////////////////
|
|
#include "HtCookieJar.h"
|
|
#include "HtCookieMemJar.h"
|
|
#include "HtHTTP.h"
|
|
////////////////////////////
|
|
|
|
// If we have this, we probably want it.
|
|
//#ifdef HAVE_GETOPT_H
|
|
//#include <getopt.h>
|
|
//#endif
|
|
|
|
|
|
|
|
//Global Variables for Library
|
|
|
|
int debug = 0;
|
|
HtRegexList limits;
|
|
HtRegexList limitsn;
|
|
String configFile = DEFAULT_CONFIG_FILE;
|
|
FILE *urls_seen = NULL;
|
|
FILE *images_seen = NULL;
|
|
DocumentDB docs;
|
|
|
|
|
|
//
|
|
// Global variables for this file
|
|
//
|
|
static int report_statistics = 0;
|
|
static String minimalFile = 0;
|
|
static HtDateTime StartTime;
|
|
static HtDateTime EndTime;
|
|
|
|
//static char *max_hops = NULL;
|
|
static String credentials;
|
|
static HtCookieJar *_cookie_jar = NULL;
|
|
static HtConfiguration * config = NULL;
|
|
static WordContext * wc = NULL;
|
|
|
|
static int create_text_database = 0;
|
|
static int alt_work_area = 0;
|
|
static int initial = 0;
|
|
|
|
int htdig_index_open_flag = FALSE;
|
|
|
|
|
|
//new. URLs from 'command-line'
|
|
#define URL_SEPCHARS " ,"
|
|
static char *myURL = NULL;
|
|
|
|
|
|
BasicDocument *a_basicdoc;
|
|
TextCollector *Indexer;
|
|
|
|
BasicDocument the_basicdoc;
|
|
//TextCollector the_Indexer;
|
|
|
|
/*******************************************************
|
|
*
|
|
* LIBHTDIG API FUNCTION
|
|
*
|
|
* int htdig_index_open(...)
|
|
*
|
|
*
|
|
* opens/creates document indexes and initializes variables
|
|
* for indexing.
|
|
*
|
|
*
|
|
* see libhtdig_api.h headerfile for definition of
|
|
* htdig_parameters_struct
|
|
*
|
|
*
|
|
* TODO Examine external function calls for error return
|
|
* codes
|
|
*
|
|
*******************************************************/
|
|
|
|
int htdig_index_open(htdig_parameters_struct * htdig_parms)
|
|
{
|
|
int ret = -1;
|
|
|
|
if(htdig_index_open_flag != FALSE)
|
|
return(FALSE);
|
|
|
|
//load 'comand-line' parameters
|
|
|
|
if (htdig_parms->configFile[0] != 0)
|
|
configFile = htdig_parms->configFile;
|
|
|
|
if (htdig_parms->URL[0] != 0)
|
|
{
|
|
myURL = strdup(htdig_parms->URL);
|
|
}
|
|
|
|
debug = htdig_parms->debug;
|
|
if(debug != 0)
|
|
{
|
|
ret = logOpen(htdig_parms->logFile);
|
|
|
|
if(ret == FALSE)
|
|
{
|
|
reportError (form ("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
|
|
htdig_parms->logFile, errno, strerror(errno)) );
|
|
return(HTDIG_ERROR_LOGFILE_OPEN);
|
|
}
|
|
}
|
|
|
|
initial = htdig_parms->initial;
|
|
create_text_database = htdig_parms->create_text_database;
|
|
//max_hops = strdup(htdig_parms->max_hops);
|
|
report_statistics = htdig_parms->report_statistics;
|
|
credentials = htdig_parms->credentials;
|
|
alt_work_area = htdig_parms->alt_work_area;
|
|
minimalFile = htdig_parms->minimalFile;
|
|
|
|
|
|
if(htdig_parms->use_cookies == TRUE)
|
|
{
|
|
// Cookie jar dynamic creation.
|
|
|
|
_cookie_jar = new HtCookieMemJar (); // new cookie jar
|
|
if (_cookie_jar)
|
|
HtHTTP::SetCookieJar (_cookie_jar);
|
|
}
|
|
|
|
//
|
|
// First set all the defaults and then read the specified config
|
|
// file to override the defaults.
|
|
//
|
|
|
|
config = HtConfiguration::config ();
|
|
|
|
config->Defaults (&defaults[0]);
|
|
if (access ((char *) configFile, R_OK) < 0)
|
|
{
|
|
reportError (form ("[HTDIG] Unable to find configuration file '%s'",
|
|
configFile.get ()));
|
|
return(HTDIG_ERROR_CONFIG_READ);
|
|
}
|
|
config->Read (configFile);
|
|
|
|
//------- Now override config settings ------------
|
|
|
|
//------- override database path ------------
|
|
if(strlen(htdig_parms->DBpath) > 0)
|
|
{
|
|
config->Add("database_dir", htdig_parms->DBpath);
|
|
}
|
|
|
|
//------- custom filters from htdig_parms ----------
|
|
|
|
if(strlen(htdig_parms->locale) > 0)
|
|
{
|
|
config->Add("locale", htdig_parms->locale);
|
|
}
|
|
|
|
if (config->Find ("locale").empty () && debug > 0)
|
|
logEntry("Warning: unknown locale!\n");
|
|
|
|
if (strlen(htdig_parms->max_hops) > 0)
|
|
{
|
|
config->Add ("max_hop_count", htdig_parms->max_hops);
|
|
}
|
|
|
|
if(strlen(htdig_parms->limit_urls_to) > 0)
|
|
{
|
|
config->Add("limit_urls_to", htdig_parms->limit_urls_to);
|
|
}
|
|
|
|
if(strlen(htdig_parms->limit_normalized) > 0)
|
|
{
|
|
config->Add("limit_normalized", htdig_parms->limit_normalized);
|
|
}
|
|
|
|
if(strlen(htdig_parms->exclude_urls) > 0)
|
|
{
|
|
config->Add("exclude_urls", htdig_parms->exclude_urls);
|
|
}
|
|
|
|
if(strlen(htdig_parms->url_rewrite_rules) > 0)
|
|
{
|
|
config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules);
|
|
}
|
|
|
|
if(strlen(htdig_parms->bad_querystr) > 0)
|
|
{
|
|
config->Add("bad_querystr", htdig_parms->bad_querystr);
|
|
}
|
|
|
|
if(strlen(htdig_parms->locale) > 0)
|
|
{
|
|
config->Add("locale", htdig_parms->locale);
|
|
}
|
|
|
|
if(strlen(htdig_parms->meta_description_factor) > 0)
|
|
{
|
|
config->Add("meta_description_factor", htdig_parms->meta_description_factor);
|
|
}
|
|
|
|
if(strlen(htdig_parms->title_factor) > 0)
|
|
{
|
|
config->Add("title_factor", htdig_parms->title_factor);
|
|
}
|
|
|
|
if(strlen(htdig_parms->text_factor) > 0)
|
|
{
|
|
config->Add("text_factor", htdig_parms->text_factor);
|
|
}
|
|
|
|
if(strlen(htdig_parms->URL) > 0)
|
|
{
|
|
config->Add("start_url", htdig_parms->URL);
|
|
free(myURL);
|
|
myURL=NULL;
|
|
}
|
|
|
|
//------- end custom filters from htdig_parms ----------
|
|
|
|
// Set up credentials for this run
|
|
if (credentials.length ())
|
|
config->Add ("authorization", credentials);
|
|
|
|
//
|
|
// Check url_part_aliases and common_url_parts for
|
|
// errors.
|
|
String url_part_errors = HtURLCodec::instance ()->ErrMsg ();
|
|
|
|
if (url_part_errors.length () != 0)
|
|
{
|
|
reportError (form("[HTDIG] Invalid url_part_aliases or common_url_parts: %s",
|
|
url_part_errors.get ()));
|
|
return(HTDIG_ERROR_URL_PART);
|
|
}
|
|
//
|
|
// Check url_rewrite_rules for errors.
|
|
String url_rewrite_rules = HtURLRewriter::instance ()->ErrMsg ();
|
|
|
|
if (url_rewrite_rules.length () != 0)
|
|
{
|
|
reportError (form ("[HTDIG] Invalid url_rewrite_rules: %s",
|
|
url_rewrite_rules.get ()));
|
|
return(HTDIG_ERROR_URL_REWRITE);
|
|
}
|
|
|
|
//
|
|
// If indicated, change the database file names to have the .work
|
|
// extension
|
|
//
|
|
if (alt_work_area != 0)
|
|
{
|
|
String configValue = config->Find ("doc_db");
|
|
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("doc_db", configValue);
|
|
}
|
|
|
|
configValue = config->Find ("word_db");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("word_db", configValue);
|
|
}
|
|
|
|
configValue = config->Find ("doc_index");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("doc_index", configValue);
|
|
}
|
|
|
|
configValue = config->Find ("doc_excerpt");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("doc_excerpt", configValue);
|
|
}
|
|
|
|
configValue = config->Find ("md5_db");
|
|
if (configValue.length () != 0)
|
|
{
|
|
configValue << ".work";
|
|
config->Add ("md5_db", configValue);
|
|
}
|
|
}
|
|
|
|
//
|
|
// If needed, we will create a list of every URL we come across.
|
|
//TODO put document-index log file stuff here
|
|
|
|
if (config->Boolean ("create_url_list"))
|
|
{
|
|
const String filename = config->Find ("url_list");
|
|
urls_seen = fopen (filename, initial ? "w" : "a");
|
|
if (urls_seen == 0)
|
|
{
|
|
reportError (form ("[HTDIG] Unable to create URL file '%s'",
|
|
filename.get ()));
|
|
return(HTDIG_ERROR_URL_CREATE_FILE);
|
|
}
|
|
}
|
|
|
|
//
|
|
// If needed, we will create a list of every image we come across.
|
|
//
|
|
if (config->Boolean ("create_image_list"))
|
|
{
|
|
const String filename = config->Find ("image_list");
|
|
images_seen = fopen (filename, initial ? "w" : "a");
|
|
if (images_seen == 0)
|
|
{
|
|
reportError (form ("[HTDIG] Unable to create images file '%s'",
|
|
filename.get ()));
|
|
return(HTDIG_ERROR_IMAGE_CREATE_FILE);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Set up the limits list
|
|
//
|
|
StringList l (config->Find ("limit_urls_to"), " \t");
|
|
limits.setEscaped (l, config->Boolean ("case_sensitive"));
|
|
l.Destroy ();
|
|
|
|
l.Create (config->Find ("limit_normalized"), " \t");
|
|
limitsn.setEscaped (l, config->Boolean ("case_sensitive"));
|
|
l.Destroy ();
|
|
|
|
//
|
|
// Open the document database
|
|
//
|
|
const String filename = config->Find ("doc_db");
|
|
if (initial)
|
|
unlink (filename);
|
|
|
|
const String index_filename = config->Find ("doc_index");
|
|
if (initial)
|
|
unlink (index_filename);
|
|
|
|
const String head_filename = config->Find ("doc_excerpt");
|
|
if (initial)
|
|
unlink (head_filename);
|
|
|
|
if (docs.Open (filename, index_filename, head_filename) < 0)
|
|
{
|
|
reportError (form ("[HTDIG] Unable to open/create document database '%s'",
|
|
filename.get ()));
|
|
return(HTDIG_ERROR_OPEN_CREATE_DOCDB);
|
|
}
|
|
|
|
const String word_filename = config->Find ("word_db");
|
|
if (initial)
|
|
unlink (word_filename);
|
|
|
|
// Initialize htword
|
|
wc = new WordContext;
|
|
wc->Initialize(*config);
|
|
|
|
|
|
//a_basicdoc = new BasicDocument;
|
|
Indexer = new TextCollector;
|
|
|
|
a_basicdoc = &the_basicdoc;
|
|
a_basicdoc->Reset();
|
|
|
|
//Indexer = &the_Indexer;
|
|
|
|
if ((a_basicdoc == NULL) || (Indexer == NULL))
|
|
return(FALSE);
|
|
|
|
|
|
htdig_index_open_flag = TRUE;
|
|
|
|
return(TRUE);
|
|
|
|
}
|
|
|
|
/*******************************************************
|
|
*
|
|
* LIBHTDIG API FUNCTION
|
|
*
|
|
* int htdig_index_simple_doc(...)
|
|
*
|
|
*
|
|
* indexes a simple document supplied by parameter
|
|
*
|
|
* see libhtdig_api.h headerfile for definition of
|
|
* htdig_simple_doc_struct
|
|
*
|
|
* TODO Examine external function calls for error return
|
|
* codes
|
|
*
|
|
*******************************************************/
|
|
int htdig_index_simple_doc(htdig_simple_doc_struct * a_simple_doc)
|
|
{
|
|
int index_error = 0;
|
|
//int ret = 0;
|
|
|
|
// Reset the document to clean out any old data
|
|
a_basicdoc->Reset();
|
|
|
|
a_basicdoc->ModTime(a_simple_doc->doc_time);
|
|
a_basicdoc->Location(a_simple_doc->location);
|
|
a_basicdoc->DocumentID(a_simple_doc->documentid);
|
|
a_basicdoc->Title(a_simple_doc->title);
|
|
a_basicdoc->MetaContent(a_simple_doc->meta);
|
|
a_basicdoc->Contents(a_simple_doc->contents); //MUST ALLOCATE & FREE!!!
|
|
a_basicdoc->ContentType(a_simple_doc->content_type); //MIME-ISH string
|
|
a_basicdoc->Length();
|
|
|
|
|
|
//TODO What is this error?
|
|
index_error = Indexer->IndexDoc(*a_basicdoc);
|
|
|
|
return(TRUE);
|
|
}
|
|
|
|
/*******************************************************
|
|
*
|
|
* LIBHTDIG API FUNCTION
|
|
*
|
|
* int htdig_index_urls(...)
|
|
*
|
|
* Starts fetch & index of URL supplied in config file
|
|
* OR supplied in htdig_index_open parameter
|
|
*
|
|
* TODO Examine external function calls for error return
|
|
* codes
|
|
* TODO Blank/empty URL error?
|
|
*******************************************************/
|
|
int htdig_index_urls(void)
|
|
{
|
|
|
|
char * temp_URL_list = NULL;
|
|
char * temp_url = NULL;
|
|
|
|
// Create the Retriever object which we will use to parse all the
|
|
// HTML files.
|
|
// In case this is just an update dig, we will add all existing
|
|
// URLs?
|
|
//
|
|
Retriever retriever (Retriever_logUrl);
|
|
if (minimalFile.length () == 0)
|
|
{
|
|
List *list = docs.URLs ();
|
|
retriever.Initial (*list);
|
|
delete list;
|
|
|
|
// Add start_url to the initial list of the retriever.
|
|
// Don't check a URL twice!
|
|
// Beware order is important, if this bugs you could change
|
|
// previous line retriever.Initial(*list, 0) to Initial(*list,1)
|
|
retriever.Initial (config->Find ("start_url"), 1);
|
|
}
|
|
|
|
// Handle list of URLs given on 'command-line'
|
|
if (myURL != NULL)
|
|
{
|
|
String str;
|
|
temp_URL_list = strdup(myURL);
|
|
temp_url = strtok(temp_URL_list, URL_SEPCHARS);
|
|
while (temp_url != NULL)
|
|
{
|
|
str = temp_url;
|
|
str.chop ("\r\n");
|
|
if (str.length () > 0)
|
|
retriever.Initial (str, 1);
|
|
|
|
temp_url = strtok(NULL, URL_SEPCHARS);
|
|
}
|
|
free(temp_URL_list);
|
|
}
|
|
else if (minimalFile.length () != 0)
|
|
{
|
|
FILE *input = fopen (minimalFile.get (), "r");
|
|
char buffer[1000];
|
|
|
|
if (input)
|
|
{
|
|
while (fgets (buffer, sizeof (buffer), input))
|
|
{
|
|
String str (buffer);
|
|
str.chop ("\r\n\t ");
|
|
if (str.length () > 0)
|
|
retriever.Initial (str, 1);
|
|
}
|
|
fclose (input);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Go do it!
|
|
//
|
|
retriever.Start ();
|
|
|
|
//
|
|
// All done with parsing.
|
|
//
|
|
|
|
//
|
|
// If the user so wants, create a text version of the document database.
|
|
//
|
|
|
|
if (create_text_database)
|
|
{
|
|
const String doc_list = config->Find ("doc_list");
|
|
if (initial)
|
|
unlink (doc_list);
|
|
docs.DumpDB (doc_list);
|
|
const String word_dump = config->Find ("word_dump");
|
|
if (initial)
|
|
unlink (word_dump);
|
|
HtWordList words (*config);
|
|
if (words.Open (config->Find ("word_db"), O_RDONLY) == OK)
|
|
{
|
|
words.Dump (word_dump);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Cleanup
|
|
//
|
|
if (images_seen)
|
|
fclose (images_seen);
|
|
|
|
//
|
|
// If needed, report some statistics
|
|
//
|
|
if (report_statistics)
|
|
{
|
|
retriever.ReportStatistics ("htdig");
|
|
}
|
|
|
|
return(TRUE);
|
|
}
|
|
|
|
|
|
/*******************************************************
|
|
*
|
|
* LIBHTDIG API FUNCTION
|
|
*
|
|
* int htdig_index_close(...)
|
|
*
|
|
* Closes the database and destroys various objects
|
|
*
|
|
* TODO Examine external function calls for error return
|
|
* codes
|
|
*
|
|
*******************************************************/
|
|
int htdig_index_close(void)
|
|
{
|
|
int ret = -1;
|
|
|
|
if(htdig_index_open_flag == TRUE)
|
|
{
|
|
//delete a_basicdoc;
|
|
//delete Indexer;
|
|
|
|
Indexer->FlushWordDB();
|
|
|
|
if (_cookie_jar)
|
|
delete _cookie_jar;
|
|
|
|
//if (max_hops != NULL)
|
|
// free(max_hops);
|
|
|
|
if (myURL != NULL)
|
|
free(myURL);
|
|
|
|
//call destructors here
|
|
docs.~DocumentDB();
|
|
//config->~HtConfiguration();
|
|
|
|
if (debug != 0)
|
|
{
|
|
ret = logClose();
|
|
|
|
if (ret == FALSE)
|
|
{
|
|
reportError (form ("[HTDIG] Error closing log file . Error:[%d], %s\n",
|
|
errno, strerror(errno)) );
|
|
return(HTDIG_ERROR_LOGFILE_CLOSE);
|
|
}
|
|
}
|
|
|
|
/*
|
|
if(config) {
|
|
WordContext::Finish();
|
|
}
|
|
*/
|
|
|
|
if (wc)
|
|
delete wc;
|
|
|
|
if (urls_seen)
|
|
fclose (urls_seen);
|
|
|
|
htdig_index_open_flag = FALSE;
|
|
}
|
|
|
|
return(TRUE);
|
|
}
|
|
|
|
/*******************************************************
|
|
*
|
|
* LIBHTDIG API FUNCTION
|
|
*
|
|
* int htdig_index_reset(...)
|
|
*
|
|
*
|
|
* TODO Examine external function calls for error return
|
|
* codes
|
|
*
|
|
*******************************************************/
|
|
|
|
int htdig_index_reset(void)
|
|
{
|
|
Indexer->FlushWordDB();
|
|
a_basicdoc->Reset();
|
|
|
|
return(TRUE);
|
|
}
|
|
|
|
/*******************************************************
|
|
*
|
|
* LIBHTDIG API FUNCTION
|
|
*
|
|
* int htdig_get_max_head_length(...)
|
|
*
|
|
*
|
|
* Returns size of maximum document storage length
|
|
* for db.excerpts [htdig.conf:max_head_length]
|
|
*
|
|
* This represents the maximum amount of the document
|
|
* That will be available for excerpting.
|
|
*
|
|
*
|
|
*******************************************************/
|
|
|
|
int htdig_get_max_head_length()
|
|
{
|
|
int ret = -1;
|
|
|
|
if(config != NULL)
|
|
ret = config->Value("max_head_length");
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/*******************************************************
|
|
*
|
|
* LIBHTDIG API FUNCTION
|
|
*
|
|
* int htdig_index_test_url(...)
|
|
*
|
|
*
|
|
* Test a URL for filter Pass/Fail
|
|
*
|
|
* Pass = return(TRUE)
|
|
* Fail = return(XXX) [Negative Value]
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*******************************************************/
|
|
|
|
|
|
//int htdig_index_test_url(htdig_parameters_struct *htdig_parms)
|
|
int htdig_index_test_url(htdig_parameters_struct *htdig_parms)
|
|
{
|
|
//int ret = FALSE;
|
|
String the_URL(htdig_parms->URL);
|
|
HtConfiguration* config= HtConfiguration::config();
|
|
Dictionary invalids;
|
|
Dictionary valids;
|
|
URL aUrl(the_URL);
|
|
String rewritten_url(the_URL);
|
|
StringList tmpList;
|
|
HtRegex limitTo;
|
|
HtRegex excludeFrom;
|
|
|
|
//initalize outgoing-parameter rewritten_URL
|
|
htdig_parms->rewritten_URL[0] = 0;
|
|
|
|
#ifdef DEBUG
|
|
//output relevant config variables
|
|
cout << " bad_extensions = " << config->Find("bad_extensions") << endl;
|
|
cout << " valid_extensions = " << config->Find("valid_extensions") << endl;
|
|
cout << " exclude_urls = " << config->Find("exclude_urls") << endl;
|
|
cout << " bad_querystr = " << config->Find("bad_querystr") << endl;
|
|
cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl;
|
|
cout << " limit_normalized = " << config->Find("limit_normalized") << endl;
|
|
cout << " restrict = " << config->Find("restrict") << endl;
|
|
cout << " exclude = " << config->Find("exclude") << endl;
|
|
#endif
|
|
|
|
//------------ read the config file if it is given ---------------
|
|
if (htdig_parms->configFile[0] != 0)
|
|
configFile = htdig_parms->configFile;
|
|
|
|
config = HtConfiguration::config ();
|
|
|
|
config->Defaults (&defaults[0]);
|
|
if (access ((char *) configFile, R_OK) < 0)
|
|
{
|
|
reportError (form ("[HTDIG] Unable to find configuration file '%s'",
|
|
configFile.get ()));
|
|
return(HTDIG_ERROR_CONFIG_READ);
|
|
}
|
|
config->Read (configFile);
|
|
|
|
//---------- Now override config settings -----------------
|
|
|
|
//------- override database path ------------
|
|
if(strlen(htdig_parms->DBpath) > 0)
|
|
{
|
|
config->Add("database_dir", htdig_parms->DBpath);
|
|
}
|
|
|
|
//------- custom filters from htdig_parms ----------
|
|
|
|
if(strlen(htdig_parms->locale) > 0)
|
|
{
|
|
config->Add("locale", htdig_parms->locale);
|
|
}
|
|
|
|
if (config->Find ("locale").empty () && debug > 0)
|
|
logEntry("Warning: unknown locale!\n");
|
|
|
|
if (strlen(htdig_parms->max_hops) > 0)
|
|
{
|
|
config->Add ("max_hop_count", htdig_parms->max_hops);
|
|
}
|
|
|
|
if(strlen(htdig_parms->limit_urls_to) > 0)
|
|
{
|
|
config->Add("limit_urls_to", htdig_parms->limit_urls_to);
|
|
}
|
|
|
|
if(strlen(htdig_parms->limit_normalized) > 0)
|
|
{
|
|
config->Add("limit_normalized", htdig_parms->limit_normalized);
|
|
}
|
|
|
|
if(strlen(htdig_parms->exclude_urls) > 0)
|
|
{
|
|
config->Add("exclude_urls", htdig_parms->exclude_urls);
|
|
}
|
|
|
|
if(strlen(htdig_parms->url_rewrite_rules) > 0)
|
|
{
|
|
config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules);
|
|
}
|
|
|
|
if(strlen(htdig_parms->bad_querystr) > 0)
|
|
{
|
|
config->Add("bad_querystr", htdig_parms->bad_querystr);
|
|
}
|
|
|
|
if(strlen(htdig_parms->locale) > 0)
|
|
{
|
|
config->Add("locale", htdig_parms->locale);
|
|
}
|
|
|
|
if(strlen(htdig_parms->meta_description_factor) > 0)
|
|
{
|
|
config->Add("meta_description_factor", htdig_parms->meta_description_factor);
|
|
}
|
|
|
|
if(strlen(htdig_parms->title_factor) > 0)
|
|
{
|
|
config->Add("title_factor", htdig_parms->title_factor);
|
|
}
|
|
|
|
if(strlen(htdig_parms->text_factor) > 0)
|
|
{
|
|
config->Add("text_factor", htdig_parms->text_factor);
|
|
}
|
|
|
|
//-------------------------------------------------------------------
|
|
|
|
#ifdef DEBUG
|
|
//output relevant config variables
|
|
cout << " bad_extensions = " << config->Find("bad_extensions") << endl;
|
|
cout << " valid_extensions = " << config->Find("valid_extensions") << endl;
|
|
cout << " exclude_urls = " << config->Find("exclude_urls") << endl;
|
|
cout << " bad_querystr = " << config->Find("bad_querystr") << endl;
|
|
cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl;
|
|
cout << " limit_normalized = " << config->Find("limit_normalized") << endl;
|
|
cout << " restrict = " << config->Find("restrict") << endl;
|
|
cout << " exclude = " << config->Find("exclude") << endl;
|
|
#endif
|
|
|
|
|
|
//------ bad_extensions -----------------------------------------------
|
|
//A list of bad extensions, separated by spaces or tabs
|
|
|
|
String t = config->Find("bad_extensions");
|
|
String lowerp;
|
|
char *p = strtok(t, " \t");
|
|
while (p)
|
|
{
|
|
// Extensions are case insensitive
|
|
lowerp = p;
|
|
lowerp.lowercase();
|
|
invalids.Add(lowerp, 0);
|
|
p = strtok(0, " \t");
|
|
}
|
|
|
|
|
|
//------ valid_extensions ------------------------------------------------
|
|
// Valid extensions are performed similarly
|
|
// A list of valid extensions, separated by spaces or tabs
|
|
|
|
t = config->Find("valid_extensions");
|
|
p = strtok(t, " \t");
|
|
while (p)
|
|
{
|
|
// Extensions are case insensitive
|
|
lowerp = p;
|
|
lowerp.lowercase();
|
|
valids.Add(lowerp, 0);
|
|
p = strtok(0, " \t");
|
|
}
|
|
|
|
//----- rewrite the URL------------------------------------------
|
|
aUrl.rewrite();
|
|
rewritten_url = aUrl.get();
|
|
|
|
if(rewritten_url.length() <= 0)
|
|
{
|
|
//Rejected: empty rewritten URL
|
|
String temp = config->Find("url_rewrite_rules");
|
|
strcpy(htdig_parms->rewritten_URL, temp.get());
|
|
system(form("echo \"%s\" > /tmp/neal", temp.get()));
|
|
|
|
return(HTDIG_ERROR_TESTURL_REWRITE_EMPTY);
|
|
}
|
|
|
|
//cout << form("TestURL: org=[%s]\n", the_URL.get());
|
|
//cout << form(" rewritten[%s]\n", rewritten_url.get());
|
|
|
|
//copy the rewritten URL for outgoing parm pass
|
|
strcpy(htdig_parms->rewritten_URL, rewritten_url.get());
|
|
|
|
//---- exclude_urls ---------------------------------------------
|
|
// If the URL contains any of the patterns in the exclude list,
|
|
// mark it as invalid
|
|
|
|
/*if(strlen(htdig_parms->exclude_urls) > 0)
|
|
tmpList.Create(htdig_parms->exclude_urls," \t");
|
|
else*/
|
|
tmpList.Create(config->Find("exclude_urls")," \t");
|
|
|
|
HtRegexList excludes;
|
|
excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
|
|
if (excludes.match(rewritten_url, 0, 0) != 0)
|
|
{
|
|
//Rejected: item in exclude list
|
|
return(HTDIG_ERROR_TESTURL_EXCLUDE);
|
|
}
|
|
|
|
//---- bad_querystr -------------------------------------------
|
|
// If the URL has a query string and it is in the bad query list
|
|
// mark it as invalid
|
|
|
|
tmpList.Destroy();
|
|
|
|
/*if(strlen(htdig_parms->bad_querystr) > 0)
|
|
tmpList.Create(htdig_parms->bad_querystr, " \t");
|
|
else*/
|
|
tmpList.Create(config->Find("bad_querystr"), " \t");
|
|
|
|
HtRegexList badquerystr;
|
|
badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
|
|
char *ext = strrchr((char*)rewritten_url, '?');
|
|
if (ext && badquerystr.match(ext, 0, 0) != 0)
|
|
{
|
|
//if (debug > 2)
|
|
// cout << endl << " Rejected: item in bad query list ";
|
|
return(HTDIG_ERROR_TESTURL_BADQUERY);
|
|
}
|
|
|
|
//------ invalid_extensions #2 ------
|
|
// See if the file extension is in the list of invalid ones
|
|
|
|
ext = strrchr((char*)rewritten_url, '.');
|
|
String lowerext;
|
|
if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the
|
|
ext = NULL; // final component of the path.
|
|
if(ext)
|
|
{
|
|
lowerext.set(ext);
|
|
int parm = lowerext.indexOf('?'); // chop off URL parameter
|
|
if (parm >= 0)
|
|
lowerext.chop(lowerext.length() - parm);
|
|
lowerext.lowercase();
|
|
if (invalids.Exists(lowerext))
|
|
{
|
|
//Rejected: Extension is invalid!
|
|
return(HTDIG_ERROR_TESTURL_EXTENSION);
|
|
}
|
|
}
|
|
|
|
//------ valid_extensions #2 ------
|
|
// Or NOT in the list of valid ones
|
|
|
|
if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
|
|
{
|
|
//Rejected: Extension is not valid!
|
|
return(HTDIG_ERROR_TESTURL_EXTENSION2);
|
|
}
|
|
|
|
//----- limit_urls_to & limit_normalized ------------------------------
|
|
// Set up the limits list
|
|
|
|
StringList l;
|
|
/*if(strlen(htdig_parms->limit_urls_to) > 0)
|
|
l.Create(htdig_parms->limit_urls_to, " \t");
|
|
else*/
|
|
l.Create(config->Find ("limit_urls_to"), " \t");
|
|
|
|
limits.setEscaped (l, config->Boolean ("case_sensitive"));
|
|
|
|
l.Destroy ();
|
|
|
|
/*if(strlen(htdig_parms->limit_normalized) > 0)
|
|
l.Create (htdig_parms->limit_normalized, " \t");
|
|
else*/
|
|
l.Create (config->Find ("limit_normalized"), " \t");
|
|
|
|
limitsn.setEscaped (l, config->Boolean ("case_sensitive"));
|
|
l.Destroy ();
|
|
|
|
// If any of the limits are met, we allow the URL
|
|
if (limits.match(rewritten_url, 1, 0) == 0)
|
|
{
|
|
//Rejected: URL not in the limits!;
|
|
return(HTDIG_ERROR_TESTURL_LIMITS);
|
|
}
|
|
|
|
|
|
// or not in list of normalized urls
|
|
// Warning! should be last in checks because of aUrl normalization
|
|
aUrl.normalize();
|
|
if (limitsn.match(rewritten_url.get(), 1, 0) == 0)
|
|
{
|
|
//Rejected: not in "limit_normalized" list!
|
|
return(HTDIG_ERROR_TESTURL_LIMITSNORM);
|
|
}
|
|
|
|
//----- restrict & exclude ----------------------------------
|
|
//Search-Time Filters
|
|
|
|
String temp;
|
|
|
|
/*if(strlen(htdig_parms->search_restrict) > 0)
|
|
temp = htdig_parms->search_restrict;
|
|
else*/
|
|
temp = config->Find("restrict");
|
|
|
|
if (temp.length())
|
|
{
|
|
// Create a temporary list from either the configuration
|
|
// file or the input parameter
|
|
StringList l(temp, " \t\r\n\001|");
|
|
limitTo.setEscaped(l);
|
|
}
|
|
|
|
/*if(strlen(htdig_parms->search_exclude) > 0)
|
|
temp = htdig_parms->search_exclude;
|
|
else*/
|
|
temp = config->Find("exclude");
|
|
|
|
if (temp.length())
|
|
{
|
|
// Create a temporary list from either the configuration
|
|
// file or the input parameter
|
|
StringList l(temp, " \t\r\n\001|");
|
|
excludeFrom.setEscaped(l);
|
|
}
|
|
|
|
//Restrict Test
|
|
if (limitTo.match(rewritten_url, 1, 0) == 0)
|
|
{
|
|
//Rejected URL Not in SearchTime Restrict List
|
|
return(HTDIG_ERROR_TESTURL_SRCH_RESTRICT);
|
|
}
|
|
//Exclude Test
|
|
if (excludeFrom.match(rewritten_url, 0, 0) != 0)
|
|
{
|
|
//Rejected URL in SearchTime Exclude List
|
|
return(HTDIG_ERROR_TESTURL_SRCH_EXCLUDE);
|
|
}
|
|
|
|
|
|
//Success!
|
|
return TRUE;
|
|
}
|