You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3544 lines
89 KiB

//
// search.cc
//
// search: Sample implementation of search algorithms using
// a mifluz inverted index.
//
// Each class is documented in the class definition. Before
// each method declaration a comment explains the semantic of
// the method. In the method definition comments in the code
// may contain additional information.
//
// Each virtual function is documented in the base class, not
// in the derived classes except for semantic differences.
//
// The class tree is:
//
// WordKeySemantic
//
// WordExclude
// WordExcludeMask
// WordPermute
//
// WordSearch
//
// WordMatch
//
// WordTree
// WordTreeOperand
// WordTreeOptional
// WordTreeOr
// WordTreeAnd
// WordTreeNear
// WordTreeMandatory
// WordTreeNot
// WordTreeLiteral
//
// WordParser
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: search.cc,v 1.9 2004/05/28 13:15:29 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include <htconfig.h>
#endif /* HAVE_CONFIG_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
// If we have this, we probably want it.
#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif /* HAVE_GETOPT_H */
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif /* HAVE_MALLOC_H */
#include <stdlib.h>
#include <htString.h>
#include <WordList.h>
#include <WordContext.h>
#include <WordCursor.h>
//
// Verbosity level set with -v (++).
// Higher values enable more diagnostic traces on stderr: in the code
// below, > 2 dumps every permutation mask produced by WordExclude::Next
// and > 3 traces each document reached by WordTreeLiteral::WalkNext.
//
static int verbose = 0;
// ************************* Document definition implementation ***********
//
// Field numbers used by this sample to describe documents and locations.
// NOTE(review): presumably these are WordKey field positions matching the
// index description (see the WordKeySemantic synopsis below) - confirm.
//
#define TAG 1
#define SERVER 2
#define URL 3
#define LOCATION 4
// *********************** WordKeySemantic implementation ********************
//
// NAME
//
// encapsulate WordKey semantic for document and location
//
// SYNOPSIS
//
// #include <WordKeySemantic.h>
//
// #define SERVER 1
// #define URL 2
// #define LOCATION 3
//
// static int document[] = {
// SERVER,
// URL
// };
//
// WordKeySemantic semantic;
// semantic.Initialize(document, sizeof(document)/sizeof(int), LOCATION);
//
// DESCRIPTION
//
// Encapsulate the semantic of a WordKey object fields. It defines
// what a document and a location are. It implements the set of
// operation that a search needs to perform given the fact that it
// implements a search whose purpose is to retrieve a document and
// wants to implement proximity search based on a word location.
//
//
// END
//
// A document is a set of fields in a given order.
// A location is a field.
// The actual fields used to implement WordKeySemantic methods are
// set with the Initialize method.
//
class WordKeySemantic {
public:
// The constructor allocates the <i>document</i> array (one slot per
// WordKey field); the destructor releases it.
WordKeySemantic();
~WordKeySemantic();
//-
// Set the actual field numbers that define what a document is and
// what a location is. The <b>document_arg</b> is a list of WordKey field
// positions of length <b>document_length_arg</b> that must be adjacent.
// The <b>location_arg</b> is the WordKey field position of the word
// location within a document.
// Return OK on success, NOTOK on failure.
//
int Initialize(int* document_arg, int document_length_arg, int location_arg);
//
// These functions and only these know what a document is.
// This should really be a class containing function pointers and be
// given as argument to the search algorithm.
//
//-
// Clear <b>to</b>, then copy the document fields of <b>from</b> into
// <b>to.</b>
//
void DocumentSet(const WordKey& from, WordKey& to);
//-
// Increment the document in <b>key</b> using the <i>SetToFollowing</i>
// method of WordKey. <b>uniq</b> is the WordKey position at which the
// increment starts; if <b>uniq</b> is 0 the last document field is
// used instead.
//
void DocumentNext(WordKey& key, int uniq);
//-
// Compare the document fields defined in both <b>a</b> and <b>b</b>
// and return the difference a - b, as in strcmp. If no document field
// is defined in both <b>a</b> and <b>b</b>, return 1.
//
int DocumentCompare(const WordKey& a, const WordKey& b);
//-
// Set all document fields to 0. Always returns 0.
//
int DocumentClear(WordKey& key);
//
// These functions and only these know what a location is.
// This should really be a class containing function pointers and be
// given as argument to the search algorithm.
//
//-
// Copy the document and location in <b>from</b> into <b>to.</b>
//
void LocationSet(const WordKey& from, WordKey& to);
//-
// Move <b>key</b> to the following location by calling the
// <i>SetToFollowing</i> method of WordKey on the location field.
//
void LocationNext(WordKey& key);
//-
// Compare <b>expected</b> location to <b>actual</b> location. Compares equal
// as long as expected location is at a maximum distance of <b>proximity</b>
// of actual. If <b>actual</b> only has undefined fields, return > 0.
// <b>expected</b> must always be the lowest possible bound.
// <b>actual</b> is tolerated if it is greater than <b>expected</b> but not
// by more than <b>proximity</b> if <b>proximity</b> > 0 or
// abs(<b>proximity</b>) * 2 if <b>proximity</b> < 0.
// Return the difference expected - actual (0 when tolerated).
//
int LocationCompare(const WordKey& expected, const WordKey& actual, int proximity = 0);
//-
// <b>key</b> is the expected location of a searched key.
// LocationNearLowest modifies <b>key</b> to add tolerance according to
// <b>proximity</b>.
//
// The idea is that <b>key</b> will be the lowest possible match
// for the <b>proximity</b> range. If <b>proximity</b> is positive, <b>key</b>
// is already the lowest possible match since we accept [0 proximity].
// If <b>proximity</b> is negative, subtract its absolute value (clamping
// at 0) since we accept [-proximity proximity].
//
// For better understanding see the functions in which it is used.
//
void LocationNearLowest(WordKey& key, int proximity);
//-
// Mark the location field of <b>key</b> as undefined, leaving only the
// document part.
//
void Location2Document(WordKey& key);
protected:
// WordKey field positions that make up a document (owned array).
int* document;
// Number of valid entries in <i>document</i>.
int document_length;
// WordKey field position of the location, -1 until Initialize is called.
int location;
};
//
// Start with an empty document definition and no location field.
// The document array is sized to hold every WordKey field.
//
WordKeySemantic::WordKeySemantic()
  : document(0), document_length(0), location(-1)
{
  const int field_count = WordKey::NFields();
  document = new int[field_count];
}
//
// Release the document field array.
//
WordKeySemantic::~WordKeySemantic()
{
  // delete [] on a null pointer is a no-op, no guard needed.
  delete [] document;
}
int WordKeySemantic::Initialize(int* document_arg, int document_length_arg, int location_arg)
{
memcpy((char*)document, (char*)document_arg, document_length_arg * sizeof(int));
document_length = document_length_arg;
location = location_arg;
return OK;
}
//
// Reset <to> and copy each document field of <from> into it.
//
void WordKeySemantic::DocumentSet(const WordKey& from, WordKey& to)
{
  to.Clear();
  int n = 0;
  while(n < document_length) {
    const int field = document[n++];
    to.Set(field, from.Get(field));
  }
}
//
// Compare the document fields of <a> and <b> in order. Fields that are
// not defined in both keys are skipped. The first non-zero difference
// is returned; 0 is returned if every compared field is equal and 1 if
// no field could be compared at all.
//
int WordKeySemantic::DocumentCompare(const WordKey& a, const WordKey& b)
{
  int diff = 1;
  for(int n = 0; n < document_length; n++) {
    const int field = document[n];
    if(!a.IsDefined(field) || !b.IsDefined(field))
      continue;
    diff = a.Get(field) - b.Get(field);
    if(diff != 0)
      return diff;
  }
  return diff;
}
//
// Set every document field of <key> to 0. Always returns 0.
//
int WordKeySemantic::DocumentClear(WordKey& key)
{
  int n = 0;
  while(n < document_length) {
    key.Set(document[n], 0);
    ++n;
  }
  return 0;
}
//
// Move <key> to the following document. The increment is applied at
// field <uniq> when it is non-zero, otherwise at the last document
// field.
//
void WordKeySemantic::DocumentNext(WordKey& key, int uniq)
{
  const int field = uniq ? uniq : document[document_length - 1];
  key.SetToFollowing(field);
}
void WordKeySemantic::LocationSet(const WordKey& from, WordKey& to)
{
DocumentSet(from, to);
to.Set(location, from.Get(location));
}
//
// Compare <expected> and <actual>, first on the document fields and,
// if those match, on the location field (only when defined in both).
//
// A negative location difference (actual after expected) is tolerated,
// i.e. reported as 0, when it does not exceed:
//   proximity       if proximity >= 0  (range [0 +N])
//   2 * |proximity| if proximity <  0  (range [-N +N], expected being
//                                       the lowest bound)
//
// Returns expected - actual, or 0 when the keys match within tolerance.
//
int WordKeySemantic::LocationCompare(const WordKey& expected, const WordKey& actual, int proximity)
{
  int delta = DocumentCompare(expected, actual);
  if(delta != 0)
    return delta;

  // Same document: the location only matters if both keys define it.
  if(!expected.IsDefined(location) || !actual.IsDefined(location))
    return delta;

  delta = expected.Get(location) - actual.Get(location);
  if(delta >= 0)
    return delta;

  // actual is after expected: check whether it is within tolerance.
  const int lowest = proximity < 0 ? proximity * 2 : -proximity;
  if(delta >= lowest)
    return 0;
  return delta;
}
//
// Move <key> to the following location.
//
void WordKeySemantic::LocationNext(WordKey& key)
{
  const int field = location;
  key.SetToFollowing(field);
}
//
// Lower the location of <key> to the smallest value accepted for the
// given <proximity>. Nothing to do for proximity >= 0: the key is
// already the lowest bound. For a negative proximity the location is
// moved back by |proximity|, clamped at 0 on underflow.
//
void WordKeySemantic::LocationNearLowest(WordKey& key, int proximity)
{
  if(proximity >= 0)
    return;

  if(key.Underflow(location, proximity)) {
    key.Get(location) = 0;
    return;
  }
  key.Get(location) += proximity;
}
//
// Turn a location key into a document key by marking its location
// field as undefined.
//
void WordKeySemantic::Location2Document(WordKey& key)
{
  const int field = location;
  key.Undefined(field);
}
// ************************* WordExclude implementation ********************
//
// NAME
//
// permute bits in bit field
//
// SYNOPSIS
//
// #include <WordExclude.h>
//
// #define BITS 5
//
// WordExclude permute;
// permute.Initialize(BITS);
// while(permute.Next() == WORD_EXCLUDE_OK)
// ...
//
// DESCRIPTION
//
// Count from 1 to the specified maximum. A variable++ loop does the same.
// The <b>WordExclude</b> class counts in a specific order.
// It first steps through all the permutations containing only 1 bit set, in
// increasing order. Then through all the permutations containing 2 bits set,
// in increasing order. And so forth until the maximum number is reached.
// See the <b>Permute</b> method for more information.
//
//
// END
//
// Helper that displays an unsigned int in binary/hexa/decimal
// on stderr. Only the 10 least significant bits are shown in binary,
// least significant bit first.
//
static inline void show_bits(unsigned int result)
{
  for(unsigned int i = 0; i < 10; i++) {
    fprintf(stderr, "%c", (result & (1U << i)) ? '1' : '0');
  }
  // result is unsigned: print it with %u (was %d, a type mismatch).
  fprintf(stderr, " (0x%08x - %15u)\n", result, result);
}
//
// WordExclude methods return values:
// WORD_EXCLUDE_OK  a new permutation is available,
// WORD_EXCLUDE_END all permutations have been generated.
//
#define WORD_EXCLUDE_OK 1
#define WORD_EXCLUDE_END 2
//
// Maximum number of bits (the width of the unsigned int mask,
// assuming 8 bits per byte).
//
#define WORD_EXCLUDE_MAX (sizeof(unsigned int) * 8)
//
// Convert a position <p> in a <l> bits mask into a bit offset (from 0).
// Position 0 maps to the most significant of the <l> bits.
//
#define WORD_EXCLUDE_POSITION2BIT(l,p) ((l) - (p) - 1)
class WordExclude {
public:
  //
  // The class is used polymorphically (WordExcludeMask, WordPermute):
  // make destruction through a base pointer well defined.
  //
  virtual ~WordExclude() {}
  //-
  // Reset the generator and prepare it for <b>length</b> bits generation.
  // The <b>length</b> cannot be greater than <i>WORD_EXCLUDE_MAX.</i>
  // Returns OK if no error occurs, NOTOK otherwise.
  //
  virtual int Initialize(unsigned int length);
  //-
  // Move to next exclude mask. Returns WORD_EXCLUDE_OK if successful,
  // WORD_EXCLUDE_END if at the end of the permutations. It starts by
  // calling <i>Permute</i> with one bit set, then two and up to
  // <i>Maxi()</i> included. The last permutation only generates one
  // possibility since all the bits are set.
  //
  virtual int Next();
  //-
  // Exclude bit for <b>position</b> starts at most significant bit. That is
  // position 0 exclude bit is most significant bit of the current mask.
  // Returns true if position is excluded, false otherwise.
  //
  // The shift is done on an unsigned constant so that the top bit of a
  // WORD_EXCLUDE_MAX bits wide mask can be tested safely.
  //
  virtual inline unsigned int Excluded(int position) { return mask & (1U << WORD_EXCLUDE_POSITION2BIT(maxi, position)); }
  //-
  // Returns how many bits are not excluded with current mask.
  //
  virtual inline int NotExcludedCount() const { return maxi - bits; }
  //-
  // Returns how many bits are excluded with current mask.
  //
  virtual inline int ExcludedCount() const { return bits; }
  //
  // Save and restore in string
  //
  //-
  // Write an ascii representation of the WordExclude object in <b>buffer.</b>
  // Each bit is represented by the character 0 or 1. The most significant
  // bit is the last character in the string. For instance
  // 1000 is the string representation of a WordExclude object initialized
  // with length = 4 after the first <i>Next</i> operation.
  //
  virtual void Get(String& buffer) const;
  //-
  // Initialize the object from the string representation in <b>buffer.</b>
  // Returns OK on success, NOTOK on failure.
  //
  virtual int Set(const String& buffer);
  //-
  // Generate all the permutations
  // containing <i>n</i> bits in a <b>bits</b> bit word in increasing order.
  // The <b>mask</b> argument is originally filled by the caller
  // with the <i>n</i> least significant bits set. A call to Permute
  // generates the next permutation immediately greater (numerically)
  // than the one contained in <b>mask</b>.
  //
  // Permute returns the next permutation or 0 if it reached the
  // maximum.
  //
  // To understand the algorithm, imagine 1 is a ball and 0 a space.
  //
  // When playing the game you start with a rack of <b>bits</b> slots filled
  // with <i>n</i> balls all on the left side. You end the game when all
  // the balls are on the right side.
  //
  // Starting from the left, search for the first ball that has an empty
  // space to the right. While searching remove all the balls you find.
  // Place a ball in the empty space you found, at the right of the last
  // ball removed. Starting from the left, fill all empty spaces with
  // the removed balls. Repeat until all balls are to the right.
  //
  // Here is a sample generated by repeated calls to WordExclude::Permute:
  // (left most bit is least significant)
  // <pre>
  // mask = 1111100000
  // while(mask = WordExclude::Permute(mask, 7))
  //   show_bits(mask)
  //
  // 1111100000 (0x0000001f - 31)
  // 1111010000 (0x0000002f - 47)
  // 1110110000 (0x00000037 - 55)
  // 1101110000 (0x0000003b - 59)
  // 1011110000 (0x0000003d - 61)
  // 0111110000 (0x0000003e - 62)
  // 1111001000 (0x0000004f - 79)
  // 1110101000 (0x00000057 - 87)
  // 1101101000 (0x0000005b - 91)
  // 1011101000 (0x0000005d - 93)
  // 0111101000 (0x0000005e - 94)
  // 1110011000 (0x00000067 - 103)
  // 1101011000 (0x0000006b - 107)
  // 1011011000 (0x0000006d - 109)
  // 0111011000 (0x0000006e - 110)
  // 1100111000 (0x00000073 - 115)
  // 1010111000 (0x00000075 - 117)
  // 0110111000 (0x00000076 - 118)
  // 1001111000 (0x00000079 - 121)
  // 0101111000 (0x0000007a - 122)
  // 0011111000 (0x0000007c - 124)
  // </pre>
  // A recursive implementation would be:
  // <pre>
  // /* Recursive */
  // void permute(unsigned int result, int bits_count, int bits_toset)
  // {
  //   if(bits_toset <= 0 || bits_count <= 0) {
  //     if(bits_toset <= 0)
  //       do_something(result);
  //   } else {
  //     permute(result, bits_count - 1, bits_toset);
  //     permute(result | (1 << (bits_count - 1)), bits_count - 1, bits_toset - 1);
  //   }
  // }
  // </pre>
  // Which is more elegant but not practical at all in our case.
  //
  inline unsigned int Permute(unsigned int mask, unsigned int bits);
  //-
  // Accessors. <i>Mask</i> is the current bit field value, <i>Maxi</i>
  // the number of bits being permuted and <i>Bits</i> the number of
  // bits currently set in the mask. The non const versions return a
  // reference that may be assigned to.
  //
  virtual inline unsigned int& Mask() { return mask; }
  virtual inline unsigned int Mask() const { return mask; }
  virtual inline unsigned int& Maxi() { return maxi; }
  virtual inline unsigned int Maxi() const { return maxi; }
  virtual inline unsigned int& Bits() { return bits; }
  virtual inline unsigned int Bits() const { return bits; }
private:
  // Current permutation, one bit per position.
  unsigned int mask;
  // Number of bits handled by the generator.
  unsigned int maxi;
  // Number of bits set in the current permutation.
  unsigned int bits;
};
//
// Reset the generator for <length> bits. <length> may be at most
// WORD_EXCLUDE_MAX (inclusive). Returns OK on success, NOTOK (state
// untouched) if <length> is too large.
//
int WordExclude::Initialize(unsigned int length)
{
  if(length > WORD_EXCLUDE_MAX) {
    // The bound is inclusive: say "<=" so the message matches the test.
    fprintf(stderr, "WordExclude::Initialize: length must be <= %d\n", (int)WORD_EXCLUDE_MAX);
    return NOTOK;
  }
  mask = 0;
  bits = 0;
  maxi = length;
  return OK;
}
//
// Return the permutation of <mask> (within its <bits> low order bits)
// that immediately follows it numerically while keeping the same number
// of bits set, or 0 if there is none.
//
// All shifts are done on unsigned constants so that bit 31 can be
// reached without shifting into the sign bit of an int.
//
inline unsigned int WordExclude::Permute(unsigned int mask, unsigned int bits)
{
  unsigned int bits_cleared = 0;
  unsigned int j;
  //
  // Starting from the least significant bit, remove every set bit
  // until an empty slot preceded by at least one removed bit is found,
  // and put one of the removed bits there.
  //
  for(j = 0; j < bits; j++) {
    const unsigned int bit = 1U << j;
    if(mask & bit) {
      bits_cleared++;
      mask &= ~bit;
    } else if(bits_cleared) {
      bits_cleared--;
      mask |= bit;
      break;
    }
  }
  // No slot found: <mask> was the last permutation.
  if(j >= bits)
    return 0;
  // Pack the remaining removed bits at the least significant end.
  for(j = 0; j < bits_cleared; j++)
    mask |= (1U << j);
  return mask;
}
//
// Move to the next exclude mask. When the permutations with the current
// number of set bits are exhausted, restart with one more bit set (the
// lowest ones). Returns WORD_EXCLUDE_OK if a new mask is available,
// WORD_EXCLUDE_END once more than Maxi() bits would be needed.
//
int WordExclude::Next()
{
  mask = Permute(mask, maxi);
  int ret = WORD_EXCLUDE_OK;
  if(mask == 0) {
    bits++;
    if(bits > maxi) {
      ret = WORD_EXCLUDE_END;
    } else {
      // Unsigned shift: bits may legitimately reach WORD_EXCLUDE_MAX.
      for(unsigned int i = 0; i < bits; i++)
        mask |= (1U << i);
    }
  }
  if(verbose > 2) show_bits(mask);
  return ret;
}
//
// Replace the content of <buffer> with one '0' or '1' character per
// bit of the mask, least significant bit first.
//
void WordExclude::Get(String& buffer) const
{
  buffer.trunc();
  for(unsigned int i = 0; i < maxi; i++) {
    // Unsigned shift so that bit 31 can be tested safely.
    buffer << ((mask & (1U << i)) ? '1' : '0');
  }
}
//
// Rebuild the object from the string written by Get: the length of
// <buffer> gives the number of bits and each '1' character sets the
// bit at the same index (any other character leaves it cleared).
// Returns OK on success, NOTOK if <buffer> is too long.
//
int WordExclude::Set(const String& buffer)
{
  if(Initialize(buffer.length()) == NOTOK)
    return NOTOK;
  for(unsigned int i = 0; i < maxi; i++) {
    if(buffer[i] == '1') {
      // Unsigned shift so that bit 31 can be set safely.
      mask |= (1U << i);
      bits++;
    }
  }
  return OK;
}
// ************************* WordExcludeMask implementation *******************
//
// NAME
//
// WordExclude specialization that ignores some bits
//
// SYNOPSIS
//
// #include <WordExcludeMask.h>
//
// #define BITS 9
// #define IGNORE 0x0f0
// #define IGNORE_MASK 0x050
//
// WordExcludeMask permute;
// permute.Initialize(BITS, IGNORE, IGNORE_MASK);
// while(permute.Next() == WORD_EXCLUDE_OK)
// ...
//
// DESCRIPTION
//
// Only perform WordExclude operations on the bits that are not set in
// <i>ignore.</i> The bits of <i>ignore_mask</i> that are set in
// <i>ignore</i> are untouched. In the synopsis section, for instance,
// bits 1,2,3,4 and 9 will be permuted and the bits 5,6,7,8 will be
// left untouched.
//
//
// END
//
//
// Marker stored in the WordExcludeMask bit2bit table for a bit that is
// ignored, i.e. not permuted.
//
#define WORD_EXCLUDE_IGNORED (-1)
class WordExcludeMask : public WordExclude {
public:
//-
// <b>ignore</b> gives the mask of bits to ignore. The actual WordExclude
// operations are made on a number of bits that is <b>length</b> - (the number
// of bits set in <b>ignore).</b>
// The <b>ignore_mask_arg</b> contains the actual values of the bits ignored by
// the <b>ignore</b> argument.
//
virtual inline int Initialize(unsigned int length, unsigned int ignore, unsigned int ignore_mask_arg) {
ignore_mask = ignore_mask_arg;
ignore_maxi = length;
unsigned int maxi = 0;
unsigned int i;
for(i = 0, ignore_bits = 0; i < length; i++) {
if(ignore & (1 << i)) {
bit2bit[i] = WORD_EXCLUDE_IGNORED;
if(ignore_mask & (1 << i)) ignore_bits++;
} else {
bit2bit[i] = maxi++;
}
}
return WordExclude::Initialize(maxi);
}
virtual inline unsigned int Excluded(int position) {
position = WORD_EXCLUDE_POSITION2BIT(ignore_maxi, position);
if(bit2bit[position] == WORD_EXCLUDE_IGNORED)
return ignore_mask & (1 << position);
else
return WordExclude::Mask() & (1 << bit2bit[position]);
}
virtual inline int NotExcludedCount() const {
return ignore_maxi - ignore_bits - WordExclude::Bits();
}
virtual inline int ExcludedCount() const {
return ignore_bits - WordExclude::Bits();
}
//-
// The semantic is the same as the Get method of Wordexclude
// except that ignored bits are assigned 3 and 2 instead of 1 and 0
// respectively.
//
virtual void Get(String& buffer) const;
//-
// The semantic is the same as the Get method of Wordexclude
// except that ignored bits are assigned 3 and 2 instead of 1 and 0
// respectively.
//
virtual int Set(const String& buffer);
virtual inline unsigned int Mask() const {
unsigned int ret = ignore_mask;
unsigned int i;
for(i = 0; i < ignore_maxi; i++) {
if(bit2bit[i] != WORD_EXCLUDE_IGNORED) {
if(WordExclude::Mask() & (1 << bit2bit[i]))
ret |= (1 << i);
}
}
return ret;
}
virtual inline unsigned int Maxi() const { return ignore_maxi; }
virtual inline unsigned int Bits() const { return ignore_bits + WordExclude::Bits(); }
private:
unsigned int ignore_mask;
unsigned int ignore_maxi;
unsigned int ignore_bits;
int bit2bit[WORD_EXCLUDE_MAX];
};
//
// Replace the content of <buffer> with one character per bit, least
// significant bit first: '0'/'1' for a permuted bit, '2'/'3' for an
// ignored bit that is respectively cleared/set in the ignore mask.
//
void WordExcludeMask::Get(String& buffer) const
{
  buffer.trunc();
  for(unsigned int i = 0; i < ignore_maxi; i++) {
    // Unsigned shifts so that bit 31 can be tested safely.
    if(bit2bit[i] == WORD_EXCLUDE_IGNORED)
      buffer << ((ignore_mask & (1U << i)) ? '3' : '2');
    else
      buffer << ((WordExclude::Mask() & (1U << bit2bit[i])) ? '1' : '0');
  }
}
int WordExcludeMask::Set(const String& buffer)
{
WordExclude::Initialize(0);
unsigned int& maxi = WordExclude::Maxi();
unsigned int& mask = WordExclude::Mask();
unsigned int& bits = WordExclude::Bits();
ignore_mask = 0;
ignore_bits = 0;
ignore_maxi = buffer.length();
unsigned int i;
for(i = 0; i < ignore_maxi; i++) {
if(buffer[i] == '1' || buffer[i] == '0') {
if(buffer[i] == '1') {
mask |= (1 << maxi);
bits++;
}
bit2bit[i] = maxi;
maxi++;
} else if(buffer[i] == '3' || buffer[i] == '2') {
if(buffer[i] == '3') {
ignore_mask |= (1 << i);
ignore_bits++;
}
bit2bit[i] = WORD_EXCLUDE_IGNORED;
}
}
return OK;
}
// ************************* WordPermute implementation ********************
//
// NAME
//
// WordExclude specialization with proximity toggle
//
// SYNOPSIS
//
// #include <WordPermute.h>
//
// #define BITS 5
//
// WordPermute permute;
// permute.Initialize(BITS);
// while(permute.Next() == WORD_EXCLUDE_OK)
// if(permute.UseProximity()) ...
//
// DESCRIPTION
//
// Each WordExclude permutation is used twice by Next. Once with
// the proximity flag set and once with the proximity flag cleared.
// If the length of the bit field (length argument of Initialize) is
// lower or equal to 1, then the proximity flag is always false.
//
//
// END
//
// WordPermute methods return values (same values as the WordExclude
// ones so that both can be compared directly).
//
#define WORD_PERMUTE_OK WORD_EXCLUDE_OK
#define WORD_PERMUTE_END WORD_EXCLUDE_END
//
// Use or don't use proximity flag:
// NO     never use proximity,
// TOGGLE issue each permutation with and without proximity,
// ONLY   always use proximity.
//
#define WORD_PERMUTE_PROXIMITY_NO 0
#define WORD_PERMUTE_PROXIMITY_TOGGLE 1
#define WORD_PERMUTE_PROXIMITY_ONLY 2
//
// Deals with word exclusion and proximity permutations for
// the implementation of the Optional retrieval model.
//
class WordPermute : public WordExcludeMask {
public:
  //-
  // The <b>nuse_proximity</b> may be set to the following:
  //
  // WORD_PERMUTE_PROXIMITY_NO so that the object behaves as
  // WordExcludeMask and Proximity() always return false.
  //
  // WORD_PERMUTE_PROXIMITY_TOGGLE so that each permutation is issued twice:
  // once with the proximity flag set (Proximity() method) and once with
  // the proximity flag cleared.
  //
  // WORD_PERMUTE_PROXIMITY_ONLY so that the object behaves as
  // WordExcludeMask and Proximity() always return true.
  //
  // Returns OK on success, NOTOK if <b>nuse_proximity</b> is not one of
  // the values above or if WordExcludeMask::Initialize fails.
  //
  virtual inline int Initialize(unsigned int length, unsigned int ignore, unsigned int ignore_mask_arg, int nuse_proximity) {
    use_proximity = nuse_proximity;
    switch(use_proximity) {
    case WORD_PERMUTE_PROXIMITY_NO:
      proximity = 0;
      break;
    case WORD_PERMUTE_PROXIMITY_TOGGLE:
      //
      // Don't bother to try proximity search if only one word
      // is involved.
      //
      proximity = length > 1;
      break;
    case WORD_PERMUTE_PROXIMITY_ONLY:
      proximity = 1;
      break;
    default:
      fprintf(stderr, "WordPermute::Initialize: unexpected use_proximity = %d\n", use_proximity);
      //
      // Report the failure with the status code used by the other
      // Initialize methods instead of a literal 0.
      //
      return NOTOK;
    }
    return WordExcludeMask::Initialize(length, ignore, ignore_mask_arg);
  }
  //-
  // Return true if the proximity flag is set, false if it is
  // cleared.
  //
  inline int Proximity() {
    switch(use_proximity) {
    case WORD_PERMUTE_PROXIMITY_NO:
      return 0;
    case WORD_PERMUTE_PROXIMITY_TOGGLE:
      return proximity;
    case WORD_PERMUTE_PROXIMITY_ONLY:
      return 1;
    default:
      fprintf(stderr, "WordPermute::Proximity: unexpected use_proximity = %d\n", use_proximity);
      return 0;
    }
  }
  //-
  // Return WORD_PERMUTE_PROXIMITY_NO, WORD_PERMUTE_PROXIMITY_TOGGLE or
  // WORD_PERMUTE_PROXIMITY_ONLY.
  //
  inline int UseProximity() { return use_proximity; }
  //-
  // Find the next permutation. If <b>WORD_PERMUTE_PROXIMITY_TOGGLE</b> was
  // specified in Initialize each permutation is issued twice (see
  // Proximity() to differentiate them), except when the mask
  // only contains one non excluded bit (NotExcludedCount() <= 1).
  // In both cases the last permutation with all bits excluded
  // (i.e. when NotExcludedCount() <= 0) is never returned because
  // it is useless.
  // Returns WORD_PERMUTE_OK, WORD_PERMUTE_END, or NOTOK if the
  // generator is found in an unexpected state.
  //
  virtual int Next() {
    if(Maxi() <= 0)
      return WORD_PERMUTE_END;
    int ret = WORD_PERMUTE_OK;
    int check_useless = 0;
    if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE) {
      //
      // Move to next permutation as follows:
      // exclude mask 1 + use proximity
      // exclude mask 1 + don't use proximity
      // exclude mask 2 + use proximity
      // exclude mask 2 + don't use proximity
      // and so on.
      // If only one word is involved never use proximity.
      //
      if(proximity) {
        proximity = 0;
      } else {
        proximity = 1;
        if((ret = WordExcludeMask::Next()) == WORD_PERMUTE_OK) {
          //
          // Do not toggle proximity for only one non excluded word
          //
          if(NotExcludedCount() <= 1)
            proximity = 0;
          check_useless = 1;
        } else if(ret == WORD_PERMUTE_END)
          proximity = 0;
      }
    } else {
      ret = WordExcludeMask::Next();
      check_useless = 1;
    }
    if(check_useless && ret == WORD_PERMUTE_OK) {
      //
      // If no bits are ignored or all ignore_mask bits are set to
      // one, the last permutation has all exclude bits set, which
      // is useless. Just skip it and expect to be at the end of
      // all permutations.
      //
      if(NotExcludedCount() <= 0) {
        ret = WordExcludeMask::Next();
        if(ret != WORD_PERMUTE_END) {
          fprintf(stderr, "WordPermute::Next: expected WORD_PERMUTE_END\n");
          ret = NOTOK;
        }
      }
    }
    return ret;
  }
  //-
  // The semantic is the same as the Get method of WordExcludeMask
  // but, in WORD_PERMUTE_PROXIMITY_TOGGLE mode only, a letter T is
  // appended to the string if the proximity flag is set, or F if the
  // proximity flag is clear.
  //
  virtual inline void Get(String& buffer) const {
    WordExcludeMask::Get(buffer);
    if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE)
      buffer << (proximity ? 'T' : 'F');
  }
  //-
  // The semantic is the same as the Set method of WordExcludeMask
  // but, in WORD_PERMUTE_PROXIMITY_TOGGLE mode only, the last character
  // is the proximity flag: set if it is a T, cleared otherwise.
  // Returns OK on success, NOTOK on failure (including an empty buffer).
  //
  virtual inline int Set(const String& buffer) {
    if(buffer.length() < 1) {
      fprintf(stderr, "WordPermute::Set: buffer length < 1\n");
      return NOTOK;
    }
    int ret = OK;
    if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE) {
      if((ret = WordExcludeMask::Set(buffer.sub(0, buffer.length() - 1))) == OK)
        proximity = buffer.last() == 'T';
    } else {
      ret = WordExcludeMask::Set(buffer);
    }
    return ret;
  }
protected:
  // One of the WORD_PERMUTE_PROXIMITY_* values.
  int use_proximity;
  // Current state of the proximity flag (used in TOGGLE mode).
  int proximity;
};
// ************************* WordTree implementation ********************
//
// NAME
//
// Base class for query resolution nodes
//
// SYNOPSIS
//
// #include <WordTree.h>
//
// class WordTreeMethod : public WordTree {
// ...
// };
//
// DESCRIPTION
//
// The WordTree class is derived from the WordCursor class and implements
// the basic operations and data structures needed for query resolution.
// It is the common base class of all the classes that actually implement
// a query resolution. The derived classes must be implemented to follow
// the WordCursor semantic for Walk* operations.
//
//
// END
//
//
// Additional walk status codes (ret2str below names them together with
// OK, NOTOK and WORD_WALK_ATEND).
// NOTE(review): presumably chosen not to collide with the WordCursor
// status values - confirm against WordCursor.h.
//
#define WORD_WALK_REDO 0x1000
#define WORD_WALK_RESTART 0x2000
#define WORD_WALK_NEXT 0x4000
//
// Return values of CursorsObeyProximity method
//
#define WORD_SEARCH_NOPROXIMITY 1
//
// operand values. They are also used as indexes in the operator_name
// table, which has WORD_TREE_OP_SIZE entries.
//
#define WORD_TREE_OR 1
#define WORD_TREE_AND 2
#define WORD_TREE_NEAR 3
#define WORD_TREE_OPTIONAL 4
#define WORD_TREE_LITERAL 5
#define WORD_TREE_MANDATORY 6
#define WORD_TREE_NOT 7
#define WORD_TREE_OP_SIZE 20
//
// Default proximity is to search for adjacent words in order.
// May be overridden at compile time.
//
#ifndef WORD_SEARCH_DEFAULT_PROXIMITY
#define WORD_SEARCH_DEFAULT_PROXIMITY 1
#endif /* WORD_SEARCH_DEFAULT_PROXIMITY */
//
// Textual name of each operand, indexed by the WORD_TREE_* values
// (index 0 is unused). Entries past "not" are null.
//
static char* operator_name[WORD_TREE_OP_SIZE] = {
"",
"or",
"and",
"near",
"optional",
"literal",
"mandatory",
"not",
0
};
class WordTree : public WordCursor {
public:
  //-
  // Start with no proximity and no uniq field.
  //
  WordTree() : proximity(0), uniq(0) { }

  //-
  // Context save/restore hooks. A plain WordTree has nothing to add
  // to or read from the list; both always return OK.
  //
  virtual int ContextSaveList(StringList& list) const { return OK; }
  virtual int ContextRestoreList(StringList& list) { return OK; }

  //-
  // Get the object ready for a search in <b>words</b>.
  // <b>document</b>, <b>document_length</b> and <b>location</b> are
  // handed to the <i>key_semantic</i> data member, <b>nuniq</b> is kept
  // for WordKeySemantic::DocumentNext and <b>nproximity</b> for
  // WordKeySemantic::LocationCompare. The search key given to the
  // WordCursor base class is built from the <i>scope</i> (if any) and
  // the <i>search</i> word.
  // Return OK on success, NOTOK on failure.
  //
  virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) {
    proximity = nproximity;
    uniq = nuniq;

    const int status = key_semantic.Initialize(document, document_length, location);
    if(status != OK)
      return status;

    WordKey key;
    if(!scope.empty() && key.Set(scope) != OK) {
      fprintf(stderr, "WordTree::Prepare: setting scope %s failed\n", (char*)scope);
      return NOTOK;
    }
    key.SetWord(search);
    return WordCursor::Initialize(words, key, 0, 0, HTDIG_WORDLIST_WALKER);
  }

  //-
  // Return a key holding only the document fields of the last
  // match found.
  //
  WordKey GetDocument() {
    WordKey document_key;
    key_semantic.DocumentSet(GetFound().Key(), document_key);
    return document_key;
  }

  //-
  // Remember the word of the latest match in the <i>info</i> data
  // member.
  //
  virtual void SetInfo() {
    info = GetFound().Key().GetWord();
  }

  //-
  // Return a copy of the <i>info</i> data member. Only meaningful
  // after SetInfo().
  //
  String GetInfo() {
    return info;
  }

  //-
  // Sort WordTree data members (if any) in ascending frequency order.
  // Nothing to sort here. Return OK on success, NOTOK on failure.
  //
  virtual int AscendingFrequency() {
    return OK;
  }

  //-
  // Delete WordTree data members (if any) that have a zero frequency
  // and report how many were deleted in <b>stripped</b>. Nothing to
  // delete here. Return OK on success, NOTOK on failure.
  //
  virtual int StripNonExistent(unsigned int& stripped) {
    stripped = 0;
    return OK;
  }

  //
  // Input
  //
  //-
  // Proximity factor, see WordKeySemantic::LocationCompare.
  //
  int proximity;
  //-
  // Uniq WordKey field position, see WordKeySemantic::DocumentNext.
  //
  int uniq;
  //-
  // Which WordKey fields are the document and the location.
  //
  WordKeySemantic key_semantic;
  //-
  // Search scope, in textual form.
  //
  String scope;
  //-
  // Search criterion as given by the caller. It may differ from the
  // WordCursor::searchKey data member.
  //
  String search;

  //
  // Internal state
  //
  //-
  // Text describing the latest match, filled by SetInfo().
  //
  String info;
};
// ************************* WordTreeLiteral implementation ****************
class WordTreeLiteral : public WordTree {
public:
  //-
  // Build a node searching for <b>string</b> within the scope
  // <b>nscope</b> (no scope by default).
  //
  WordTreeLiteral(const char* string, const char* nscope = "") {
    scope.set((char*)nscope);
    search.set((char*)string);
  }

  //-
  // Node type: always WORD_TREE_LITERAL.
  //
  int IsA() const {
    return WORD_TREE_LITERAL;
  }

  virtual int WalkRewind();
  //-
  // Report at most one match per distinct document.
  //
  virtual int WalkNext();
  virtual int Seek(const WordKey& patch);

  //-
  // Append the textual form of the node to <b>bufferout</b>. Without
  // scope this is just the word, otherwise it is
  // <pre>
  // ( literal "scope" word )
  // </pre>
  // Always returns OK.
  //
  virtual int Get(String& bufferout) const {
    if(!scope.empty()) {
      bufferout << "( " << operator_name[IsA()] << " \"" << scope << "\" " << search << " )";
      return OK;
    }
    bufferout << search;
    return OK;
  }

protected:
  //
  // Document of the last match returned by WalkNext, cleared when the
  // walk is rewound, repositioned or finished.
  //
  WordKey current_document;
};
//
// Forget the last document returned, then rewind the underlying cursor.
//
int WordTreeLiteral::WalkRewind()
{
  current_document.Clear();
  const int status = WordCursor::WalkRewind();
  return status;
}
//
// Advance the underlying cursor until it fails or reaches a document
// that differs from the one returned by the previous call. On success
// the new document is remembered, otherwise the memory is cleared.
// Returns the status of the last WordCursor::WalkNext call.
//
int WordTreeLiteral::WalkNext()
{
  int status;
  for(;;) {
    status = WordCursor::WalkNext();
    if(verbose > 3) fprintf(stderr, "WordTreeLiteral::WalkNext: reached %s\n", (char*)GetDocument().Get());
    if(status != OK)
      break;
    // Still in the document already reported: keep walking.
    if(key_semantic.DocumentCompare(current_document, GetDocument()) != 0)
      break;
  }
  if(status != OK)
    current_document.Clear();
  else
    current_document = GetDocument();
  return status;
}
//
// Forget the last document returned, then move the underlying cursor
// to <position>.
//
int WordTreeLiteral::Seek(const WordKey& position)
{
  current_document.Clear();
  const int status = WordCursor::Seek(position);
  return status;
}
// ************************* WordTreeOperand implementation ****************
//
// NAME
//
// Base class for boolean query resolution nodes
//
// SYNOPSIS
//
// #include <WordTree.h>
//
// class WordTreeMethod : public WordTreeOperand {
// ...
// };
//
// DESCRIPTION
//
// The WordTreeOperand class is derived from WordTree and implements
// the basic operations and data structures needed for query resolution
// of boolean operators. It contains a list of WordTree objects (the
// operands or cursors) and redefines the basic WordCursor methods
// to operate on all of them according to the logic defined by the
// derived class.
//
//
// END
//
//
// Debugging helper: map a walk status code to a printable name.
// The codes are tested in a fixed order and the first match wins;
// "???" is returned for an unknown code.
//
static char* ret2str(int ret)
{
  char* name = "???";
  if(ret == WORD_WALK_REDO)
    name = "REDO";
  else if(ret == WORD_WALK_RESTART)
    name = "RESTART";
  else if(ret == WORD_WALK_NEXT)
    name = "NEXT";
  else if(ret == OK)
    name = "OK";
  else if(ret == NOTOK)
    name = "NOTOK";
  else if(ret == WORD_WALK_ATEND)
    name = "ATEND";
  return name;
}
class WordTreeOperand : public WordTree
{
public:
//-
// Constructor. The scope is <b>nscope</b>.
//
WordTreeOperand(const char* nscope) {
scope.set((char*)nscope);
}
//-
// Free the objects pointed by <i>cursors</i> with delete as well
// as the <i>cursors</i> array itself with delete [].
//
virtual ~WordTreeOperand();
virtual void Clear() {
cursors = 0;
cursors_length = 0;
WordCursor::Clear();
}
//-
// Recursively call Optimize on each <i>cursors</i>.
//
virtual int Optimize();
//-
// Change the <i>permutation</i> data member ignore mask according
// to WORD_TREE_MANDATORY and WORD_TREE_NOT nodes found in
// <i>cursors</i>. MANDATORY and NOT nodes are reduced (replaced
// by their first child cursor. For each MANDATORY and NOT nodes
// the bit (see WordExcludeMask for information)
// corresponding to their position is ignored (set in the <b>ignore</b>
// argument of the WordExcludeMask::Initialize function. For NOT
// nodes, the bit corresponding to their position is set in
// the <b>ignore_mask</b> of the WordExcludeMask::Initialize function
// (i.e. implementing a <i>not</i> operation).
// The <b>proximity</b> argument may be WORD_PERMUTE_PROXIMITY_TOGGLE or
// WORD_PERMUTE_PROXIMITY_NO.
// Returns OK on success, NOTOK on failure.
//
int OptimizeOr(int proximity);
virtual int ContextSave(String& buffer) const {
StringList list;
int ret;
if((ret = ContextSaveList(list)) != OK)
return ret;
buffer.trunc();
String* element;
list.Start_Get();
while((element = (String*)list.Get_Next())) {
buffer << (*element) << ';';
}
//
// Trim last ;
//
buffer.chop(1);
return OK;
}
virtual int ContextSaveList(StringList& list) const {
//
// Apply to each cursor
//
unsigned int i;
for(i = 0; i < cursors_length; i++)
if(cursors[i]->ContextSaveList(list) == NOTOK)
return NOTOK;
return OK;
}
virtual int ContextRestore(const String& buffer) {
if(!buffer.empty()) {
StringList list(buffer, ";");
return ContextRestoreList(list);
} else {
return OK;
}
}
virtual int ContextRestoreList(StringList& list) {
//
// Apply to each cursor
//
unsigned int i;
for(i = 0; i < cursors_length; i++)
if(cursors[i]->ContextRestoreList(list) == NOTOK)
return NOTOK;
return OK;
}
//-
// Recursively call WalkInit on each <i>cursors</i>.
//
virtual int WalkInit();
//-
// Recursively call WalkRewind on each <i>cursors</i>.
// Reset the <i>pos</i> data member with WordKeySemantic::DocumentClear.
//
virtual int WalkRewind();
//-
// Recursively call WalkFinish on each <i>cursors</i>.
//
virtual int WalkFinish();
//-
// Recursively call Seek on each <i>cursors</i>.
// Save the <b>patch</b> argument in the <i>pos</i> data
// member.
//
virtual int Seek(const WordKey& patch);
//-
// The number of occurrence of a WordTreeOperand is the sum of the
// number of occurrence of each term.
//
virtual int Noccurrence(unsigned int& noccurrence) const {
noccurrence = 0;
unsigned int i;
for(i = 0; i < cursors_length; i++) {
unsigned int frequency;
if(cursors[i]->Noccurrence(frequency) != OK)
return NOTOK;
noccurrence += frequency;
}
return OK;
}
//-
// The <b>bufferout</b> argument is filled with a lisp like representation
// of the tree starting at this node.
//
virtual int Get(String& bufferout) const {
bufferout << "( " << operator_name[IsA()] << " \"" << scope << "\" ";
unsigned int i;
for(i = 0; i < cursors_length; i++)
bufferout << cursors[i]->Get() << " ";
bufferout << " )";
return OK;
}
//-
// Call Prepare on each <i>cursors</i>. Set the <i>search</i> member
// with an textual representation of the tree starting at this node.
//
virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) {
int ret;
if((ret = WordTree::Prepare(words, nuniq, nproximity, document, document_length, location)) != OK)
return ret;
unsigned int i;
for(i = 0; i < cursors_length; i++) {
if((ret = cursors[i]->Prepare(words, nuniq, nproximity, document, document_length, location)) != OK)
return ret;
}
return Get(GetSearch().GetWord());
}
//-
// The current cursor offset (set by Seek for instance). It
// duplicates the function of the WordCursor <i>key</i> data member
// because the data type is different (WordKey instead of String).
//
WordKey pos;
//-
// Sub nodes array.
//
WordTree** cursors;
//-
// Number of valid entries in the <i>cursors</i> member.
//
unsigned int cursors_length;
//-
// Permutation generator with proximity toggle
//
WordPermute permutation;
};
//
// Release every operand with delete, then the <i>cursors</i> array
// itself with free(). Nothing happens when <i>cursors</i> is null.
//
WordTreeOperand::~WordTreeOperand()
{
  if(!cursors)
    return;

  for(unsigned int index = 0; index < cursors_length; index++)
    delete cursors[index];
  free(cursors);
}
//
// Forward Optimize to every operand in order. The walk stops and
// NOTOK is returned at the first operand that returns NOTOK; any
// other return value is treated as success.
//
int
WordTreeOperand::Optimize()
{
  unsigned int index = 0;
  while(index < cursors_length) {
    if(cursors[index]->Optimize() == NOTOK)
      return NOTOK;
    index++;
  }
  return OK;
}
//
// Build the <i>permutation</i> masks from the MANDATORY and NOT
// operands and replace each of these wrapper nodes by its only child.
//
// For a MANDATORY operand the bit matching its position is set in
// the ignore set. For a NOT operand the bit is set both in the ignore
// set and in the ignore mask. The masks are then handed to
// permutation.Initialize together with <b>proximity</b>.
//
// A wrapper node is validated before the tree is modified: it must
// hold exactly one child. If it does not, an error is printed and
// NOTOK is returned with the wrapper still attached to this node, so
// that it is released normally by the destructor.
//
// Returns the value of permutation.Initialize on success, NOTOK on
// failure.
//
int WordTreeOperand::OptimizeOr(int proximity)
{
  unsigned int ignore = 0;
  unsigned int ignore_mask = 0;

  for(unsigned int i = 0; i < cursors_length; i++) {
    int reduce;
    //
    // Set ignore & ignore_mask if cursor is NOT or MANDATORY.
    // The shift is done on an unsigned value so that the highest
    // bit can be set without overflowing a signed int.
    //
    switch(cursors[i]->IsA()) {
    case WORD_TREE_MANDATORY:
      ignore |= (1U << WORD_EXCLUDE_POSITION2BIT(cursors_length, i));
      reduce = 1;
      break;
    case WORD_TREE_NOT:
      ignore |= (1U << WORD_EXCLUDE_POSITION2BIT(cursors_length, i));
      ignore_mask |= (1U << WORD_EXCLUDE_POSITION2BIT(cursors_length, i));
      reduce = 1;
      break;
    default:
      reduce = 0;
      break;
    }
    //
    // Replace the NOT or MANDATORY node by its only child
    //
    if(reduce) {
      WordTreeOperand* old = (WordTreeOperand*)cursors[i];
      //
      // Check before touching anything: reading cursors[0] of an
      // empty node or dropping extra children would corrupt the tree.
      //
      if(old->cursors == 0 || old->cursors_length != 1) {
        fprintf(stderr, "WordTreeOperand::OptimizeOr: expected exactly one cursor, found %u\n", old->cursors_length);
        return NOTOK;
      }
      cursors[i] = old->cursors[0];
      //
      // Detach the child so that deleting the wrapper does not
      // delete it too.
      //
      old->cursors[0] = 0;
      old->cursors_length = 0;
      delete old;
    }
  }
  return permutation.Initialize(cursors_length, ignore, ignore_mask, proximity);
}
//
// Call WalkInit on every operand in order. The first return value
// different from OK is returned immediately, leaving <i>status</i>
// untouched. Otherwise <i>status</i> is set to the last return value
// (WORD_WALK_ATEND when there is no operand at all) and returned.
//
int
WordTreeOperand::WalkInit()
{
  int result = WORD_WALK_ATEND;
  for(unsigned int index = 0; index < cursors_length; index++) {
    result = cursors[index]->WalkInit();
    if(result != OK)
      return result;
  }
  return (status = result);
}
//
// Call WalkRewind on every operand in order and return the first
// return value different from OK, if any. When all operands
// succeed, this node is put back at the start of the walk:
// <i>status</i> is OK, <i>pos</i> is cleared with
// WordKeySemantic::DocumentClear, <i>cursor_get_flags</i> is
// DB_SET_RANGE and <i>found</i> is cleared. OK is then returned.
//
int
WordTreeOperand::WalkRewind()
{
  for(unsigned int index = 0; index < cursors_length; index++) {
    const int result = cursors[index]->WalkRewind();
    if(result != OK)
      return result;
  }

  status = OK;
  key_semantic.DocumentClear(pos);
  cursor_get_flags = DB_SET_RANGE;
  found.Clear();
  return OK;
}
//
// Call WalkFinish on every operand in order. The first return value
// different from OK is returned immediately; OK is returned when
// every operand succeeds or when there is no operand.
//
int
WordTreeOperand::WalkFinish()
{
  for(unsigned int index = 0; index < cursors_length; index++) {
    const int result = cursors[index]->WalkFinish();
    if(result != OK)
      return result;
  }
  return OK;
}
//
// Remember <b>patch</b> in <i>pos</i>, set <i>cursor_get_flags</i> to
// DB_SET_RANGE and call Seek(patch) on every operand in order.
// An operand returning OK or WORD_WALK_ATEND does not stop the loop;
// any other value is returned immediately (<i>status</i> is then left
// as it was). Otherwise <i>status</i> is set to OK and OK is returned.
//
int
WordTreeOperand::Seek(const WordKey& patch)
{
  pos.CopyFrom(patch);
  cursor_get_flags = DB_SET_RANGE;

  for(unsigned int index = 0; index < cursors_length; index++) {
    const int result = cursors[index]->Seek(patch);
    if(result == OK || result == WORD_WALK_ATEND)
      continue;
    return result;
  }

  status = OK;
  return OK;
}
// ************************* WordTreeOptional implementation ****************
//
// Operator node whose operands are all optional. Matches are
// enumerated in several passes driven by the <i>permutation</i> data
// member (see WalkNext). WordTreeAnd and WordTreeNear derive from
// this class and only change the UsePermutation, UseProximity and
// AllOrNothing policies.
//
class WordTreeOptional : public WordTreeOperand {
public:
WordTreeOptional(const char* nscope) : WordTreeOperand(nscope) { }
//-
// Return WORD_TREE_OPTIONAL
//
virtual int IsA() const { return WORD_TREE_OPTIONAL; }
//-
// Call WordTreeOperand::Optimize, sort the operands with
// AscendingFrequency (unless UseProximity() is
// WORD_PERMUTE_PROXIMITY_ONLY), remove operands without any
// occurrence with StripNonExistent and finally either delete all
// remaining operands (AllOrNothing() is true and at least one operand
// was stripped) or call OptimizeOr(UseProximity()).
//
virtual int Optimize();
//-
// After the operands contexts, append the <i>permutation</i> state
// (only if UsePermutation() is true) and the WordCursor::ContextSave
// string of this node to <b>list</b>.
//
virtual int ContextSaveList(StringList& list) const;
//-
// Reverse of ContextSaveList: entries are consumed from the head of
// <b>list</b> and removed from it.
//
virtual int ContextRestoreList(StringList& list);
//-
// Multipass walk of the occurrences according to the <i>permutation</i>
// data member specifications. First search for documents containing
// all occurrences near to each other. Then documents that
// contain all occurrences far apart. Then ignore the most frequent
// search criterion and search for documents that contain all the others
// near to each other. The logic goes on until there only remains the
// most frequent word.
//
virtual int WalkNext();
//-
// Only seek the first non excluded cursor. The implementation
// of WalkNext makes it useless to seek the others.
//
virtual int Seek(const WordKey& position);
//-
// Initialize <i>permutation</i> for <i>cursors_length</i> operands,
// nothing ignored, WORD_PERMUTE_PROXIMITY_TOGGLE, then delegate to
// WordTreeOperand::Prepare.
//
virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) {
int ret;
if((ret = permutation.Initialize(cursors_length, 0, 0, WORD_PERMUTE_PROXIMITY_TOGGLE)) != OK)
return ret;
return WordTreeOperand::Prepare(words, nuniq, nproximity, document, document_length, location);
}
//-
// Call SetInfo on each operand then rebuild <i>info</i> from the
// <i>info</i> of the non excluded operands, followed by "proximity"
// if the current permutation uses proximity.
//
virtual void SetInfo();
//-
// Proximity policy of the node, one of the WORD_PERMUTE_PROXIMITY_*
// values.
//
virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_TOGGLE; }
//-
// Returns true if WalkNext must try the next permutation when the
// current one is exhausted, false if the walk ends there.
//
virtual int UsePermutation() const { return 1; }
//-
// Returns true if all cursors must have a frequency > 0, false otherwise.
//
virtual int AllOrNothing() const { return 0; }
//-
// Comparison between <b>constraint</b> and <b>cursor</b> is made
// with WordKeySemantic::LocationCompare using the <b>proximity</b>
// argument. If <b>master</b> is NULL it is set to point to
// <b>cursor</b>.
//
// Return WORD_WALK_NEXT if <b>cursor</b> is at <b>constraint</b>.
// If <b>cursor</b> is <b>master</b>, the location of <b>constraint</b>
// is first set from <b>cursor</b>. In both cases <b>constraint</b>
// is then adjusted with WordKeySemantic::LocationNearLowest.
//
// Return WORD_WALK_REDO if <b>cursor</b> is below <b>constraint</b>
// (LocationCompare is positive) after calling cursor.Seek(constraint)
// and cursor.WalkNext(). If WalkNext does not return OK its return
// value is returned instead.
//
// Return WORD_WALK_RESTART if <b>cursor</b> is above <b>constraint</b>
// (LocationCompare is negative) and
// set <b>constraint</b> from <b>cursor</b> using
// WordKeySemantic::DocumentSet. If <b>cursor</b> is not <b>master</b>
// and <b>master</b> is in the same document, also set the location of
// <b>constraint</b> from <b>master</b> using
// WordKeySemantic::LocationSet and call WordKeySemantic::LocationNext
// on <b>constraint.</b>
//
// Return WORD_WALK_ATEND if no more match possible.
//
// Return NOTOK on failure.
//
int SearchCursorNear(WordTree& cursor, WordTree*& master, WordKey& constraint, int proximity);
//-
// Comparison between <b>cursor</b> and <b>document</b> is made
// with WordKeySemantic::DocumentCompare.
//
// Return WORD_WALK_NEXT if <b>cursor</b> is above <b>document</b>
// or at the end of its walk.
//
// Return WORD_WALK_REDO if <b>cursor</b> is below <b>document</b>
// and call cursor.Seek(document) then cursor.WalkNext().
//
// Return WORD_WALK_RESTART if <b>cursor</b> is at <b>document</b>
// and call WordKeySemantic::DocumentNext method on <b>document.</b>
//
// Return NOTOK on failure.
//
int SearchCursorNot(WordTree& cursor, WordKey& document);
//-
// Comparison between <b>cursor</b> and <b>document</b> is made
// with WordKeySemantic::DocumentCompare.
//
// Return WORD_WALK_NEXT if <b>cursor</b> is at <b>document</b>.
//
// Return WORD_WALK_REDO if <b>cursor</b> is below <b>document</b>
// after calling cursor.Seek(document) and cursor.WalkNext(). If
// WalkNext does not return OK its return value (WORD_WALK_ATEND
// for instance) is returned instead.
//
// Return WORD_WALK_RESTART if <b>cursor</b> is above <b>document</b>
// and set <b>document</b> from <b>cursor</b> with
// WordKeySemantic::DocumentSet.
//
// Return NOTOK on failure.
//
// The <b>permutation</b> argument is not used by the implementation.
//
int SearchCursorAnd(WordTree& cursor, WordKey& document, WordExclude& permutation);
//
// We know that :
// 1) document does not contain any excluded words.
// 2) contains at least one occurrence of each non excluded word.
// The logic, although very similar to WordSearchNear::SearchOne
// is therefore simpler. We ignore all excluded cursors and
// return WORD_SEARCH_NOPROXIMITY as soon as a cursor move outside
// <document>.
//
//-
// If <b>document</b> contains words that match proximity
// requirement, return OK. Return WORD_SEARCH_NOPROXIMITY if proximity
// requirement cannot be matched for <b>document</b>.
// Return NOTOK on failure.
//
int CursorsObeyProximity(WordKey& document);
//-
// Sort the <i>cursors</i> in ascending frequency order using the
// Noccurrence method on each cursor.
// Return OK on success, NOTOK on failure.
//
virtual int AscendingFrequency();
//-
// Delete all elements of the <i>cursors</i> array that have a
// zero frequency. The <i>cursors</i> array is shrunk and the
// <i>cursors_length</i> set accordingly. Returns the number of
// deletions in the <b>stripped</b> argument.
// Return OK on success, NOTOK on failure.
//
virtual int StripNonExistent(unsigned int& stripped);
};
//
// Optimize the operands, order them by ascending frequency (skipped
// when UseProximity() is WORD_PERMUTE_PROXIMITY_ONLY) and strip the
// ones that have no occurrence.
//
// When AllOrNothing() is true and at least one operand was stripped,
// the remaining operands are deleted, <i>cursors_length</i> becomes 0
// and OK is returned. Otherwise the result of
// OptimizeOr(UseProximity()) is returned.
//
// The first step returning something other than OK aborts the
// function with that value.
//
int WordTreeOptional::Optimize()
{
  int ret = WordTreeOperand::Optimize();
  if(ret != OK)
    return ret;

  if(UseProximity() != WORD_PERMUTE_PROXIMITY_ONLY) {
    ret = AscendingFrequency();
    if(ret != OK)
      return ret;
  }

  unsigned int stripped;
  ret = StripNonExistent(stripped);
  if(ret != OK)
    return ret;

  if(!(AllOrNothing() && stripped))
    return OptimizeOr(UseProximity());

  //
  // One word is missing and everything is lost,
  // just kill the remaining cursors.
  //
  for(unsigned int index = 0; index < cursors_length; index++)
    delete cursors[index];
  cursors_length = 0;
  return OK;
}
//
// Save the context of the operands (WordTreeOperand::ContextSaveList)
// then append to <b>list</b>:
//  - the <i>permutation</i> state, only if UsePermutation() is true,
//  - the WordCursor::ContextSave string of this node.
//
// Returns OK on success, otherwise the error code of the failing
// step. The String allocated for the cursor context is released if
// WordCursor::ContextSave fails, since it has not been handed to
// <b>list</b> at that point.
//
int WordTreeOptional::ContextSaveList(StringList& list) const
{
  int ret;
  if((ret = WordTreeOperand::ContextSaveList(list)) != OK)
    return ret;

  if(UsePermutation()) {
    String* buffer = new String();
    permutation.Get(*buffer);
    list.Add(buffer);
  }

  {
    String* buffer = new String();
    if((ret = WordCursor::ContextSave(*buffer)) != OK) {
      delete buffer;
      return ret;
    }
    list.Add(buffer);
  }

  return OK;
}
//
// Restore the context of the operands
// (WordTreeOperand::ContextRestoreList) then consume from the head
// of <b>list</b>:
//  - the <i>permutation</i> state, only if UsePermutation() is true,
//  - the key to Seek to; <i>cursor_get_flags</i> is then set to
//    DB_NEXT.
// Each entry is removed from <b>list</b> once used.
//
// Returns OK on success, NOTOK if an expected entry is missing,
// otherwise the error code of the failing step.
//
int WordTreeOptional::ContextRestoreList(StringList& list)
{
  int ret;
  if((ret = WordTreeOperand::ContextRestoreList(list)) != OK)
    return ret;

  if(UsePermutation()) {
    char* buffer = list[0];
    //
    // A truncated context must not reach permutation.Set as a
    // null pointer.
    //
    if(!buffer) return NOTOK;
    if((ret = permutation.Set(buffer)) != OK)
      return ret;
    list.Remove(0);
  }

  {
    char* buffer = list[0];
    if(!buffer) return NOTOK;
    WordKey key(buffer);
    if((ret = Seek(key)) != OK)
      return ret;
    cursor_get_flags = DB_NEXT;
    list.Remove(0);
  }

  return OK;
}
//
// Find the next match and store its key in <i>found</i>.
//
// <i>pos</i> is used as the current constraint. Each round of the
// outer loop moves the constraint to the next document (only if
// <i>cursor_get_flags</i> is DB_NEXT), seeks to it, then visits the
// operands in order: excluded operands are checked with
// SearchCursorNot, the others with SearchCursorNear when the current
// permutation uses proximity and with SearchCursorAnd otherwise.
// The value returned by these helpers drives the visit:
//   WORD_WALK_NEXT    go to the next operand
//   WORD_WALK_REDO    examine the same operand again
//   WORD_WALK_RESTART restart from the first operand
//   WORD_WALK_ATEND   switch to the next permutation and restart, or
//                     end the walk if UsePermutation() is false or no
//                     permutation is left
//   anything else     returned to the caller as is
//
// When every operand agrees, the key of the first non excluded
// operand becomes the result. If UseProximity() is
// WORD_PERMUTE_PROXIMITY_TOGGLE, the permutation does not use
// proximity and more than one operand is involved, the candidate is
// rejected when CursorsObeyProximity returns OK, and the outer loop
// runs again.
//
// Returns OK when a match is found, WORD_WALK_ATEND (also stored in
// <i>status</i>) at the end of the walk, NOTOK or the failing helper
// return value on error.
//
// NOTE(review): the <i>ret</i> declared inside the for loop shadows
// the function level <i>ret</i>. The latter only ever holds the
// result of Seek, hence OK, when it is tested by the while condition
// and returned at the end.
//
int WordTreeOptional::WalkNext()
{
WordKey& constraint = pos;
//
// Set constraint with all 0
//
if(constraint.Empty())
key_semantic.DocumentClear(constraint);
//
// Advance cursors to next constraint, if not at the
// beginning of the search.
//
int ret = OK;
int match_ok = 0;
do {
//
// Advance cursors so that next call fetches another constraint
//
if(cursor_get_flags == DB_NEXT)
key_semantic.DocumentNext(constraint, uniq);
if((ret = Seek(constraint)) != OK)
return ret;
int near = permutation.Proximity();
WordTree* first = 0;
//
// The loop index is only advanced by the WORD_WALK_NEXT case below.
//
for(unsigned int i = 0; i < cursors_length;) {
WordTree& cursor = *(cursors[i]);
//
// Read again on each visit: permutation.Next() below may have
// changed the proximity setting.
//
near = permutation.Proximity();
int excluded = permutation.Excluded(i);
if(verbose) fprintf(stderr, "WordTreeOptional::WalkNext: %s excluded = %s, proximity = %s\n", (char*)cursor.GetSearch().GetWord(), (excluded ? "yes" : "no"), (near ? "yes" : "no" ));
int ret;
if(excluded) {
ret = SearchCursorNot(cursor, constraint);
if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: Not -> %s\n", ret2str(ret));
} else {
if(near) {
ret = SearchCursorNear(cursor, first, constraint, proximity);
if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: Near -> %s\n", ret2str(ret));
} else {
ret = SearchCursorAnd(cursor, constraint, permutation);
if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: And -> %s\n", ret2str(ret));
}
}
switch(ret) {
case WORD_WALK_ATEND:
if(UsePermutation()) {
//
// The search is over with this permutation, try another one.
//
switch(permutation.Next()) {
//
// No permutations left, the end
//
case WORD_PERMUTE_END:
return (status = WORD_WALK_ATEND);
break;
//
// Start over with this permutation
//
case WORD_PERMUTE_OK:
if(WalkRewind() != OK)
return NOTOK;
break;
}
first = 0;
i = 0;
} else {
return (status = WORD_WALK_ATEND);
}
break;
case WORD_WALK_REDO:
break;
case WORD_WALK_RESTART:
first = 0;
i = 0;
break;
case WORD_WALK_NEXT:
i++;
break;
case NOTOK:
default:
return ret;
break;
}
}
cursor_get_flags = DB_NEXT;
SetInfo();
//
// Save possible result, i.e. first non excluded cursor
//
for(unsigned int i = 0; i < cursors_length; i++) {
WordTree& cursor = *(cursors[i]);
if(!permutation.Excluded(i)) {
found.Key().CopyFrom(cursor.GetFound().Key());
break;
}
}
match_ok = 1;
//
// Only bother if near and non near search are involved
//
if(UseProximity() == WORD_PERMUTE_PROXIMITY_TOGGLE) {
//
// If we reach this point in the function and
// either proximity search is active or there is
// only one word involved, the match is valid.
// Otherwise it may be excluded, see below.
//
if(!near && permutation.NotExcludedCount() > 1) {
//
// If not using proximity, a match that fits the proximity
// requirements must be skipped because it was matched by
// the previous permutation (see WordPermute).
//
switch(CursorsObeyProximity(constraint)) {
case OK:
match_ok = 0;
break;
case WORD_SEARCH_NOPROXIMITY:
match_ok = 1;
break;
default:
case NOTOK:
return NOTOK;
break;
}
}
}
} while(!match_ok && ret == OK);
return ret;
}
//
// Remember <b>position</b> in <i>pos</i>, reset
// <i>cursor_get_flags</i> to DB_SET_RANGE and <i>status</i> to OK,
// then seek the first operand that is not excluded by the current
// permutation and return the result of that Seek.
// If every operand is excluded (or there is none) an error is
// printed and NOTOK is returned.
//
int WordTreeOptional::Seek(const WordKey& position)
{
  pos.CopyFrom(position);
  cursor_get_flags = DB_SET_RANGE;
  status = OK;

  unsigned int index = 0;
  while(index < cursors_length && permutation.Excluded(index))
    index++;

  if(index < cursors_length)
    return cursors[index]->Seek(position);

  fprintf(stderr, "WordTreeOptional::Seek: failed\n");
  return NOTOK;
}
void WordTreeOptional::SetInfo()
{
unsigned int i;
for(i = 0; i < cursors_length; i++)
cursors[i]->SetInfo();
info.trunc();
for(i = 0; i < cursors_length; i++) {
WordTree& cursor = *(cursors[i]);
if(!permutation.Excluded(i))
info << cursor.info << " ";
}
info << (permutation.Proximity() ? "proximity" : "");
}
//
// Compare the location of <b>cursor</b> with <b>constraint</b>
// (WordKeySemantic::LocationCompare with the <b>proximity</b>
// tolerance) and tell WalkNext what to do next. The first cursor
// seen after <b>master</b> was reset to null becomes the master.
//
//   equal     WORD_WALK_NEXT: the master copies its location into
//             <b>constraint</b>, which is then adjusted with
//             LocationNearLowest.
//   positive  the cursor is moved with Seek + WalkNext. If WalkNext
//             returns OK, WORD_WALK_REDO is returned (the master also
//             drops the location part of <b>constraint</b>);
//             otherwise the WalkNext return value is returned.
//   negative  WORD_WALK_RESTART: <b>constraint</b> takes the document
//             of the cursor. If the cursor is not the master and the
//             master is in that same document, the location of
//             <b>constraint</b> is set just after the master location.
//
int WordTreeOptional::SearchCursorNear(WordTree& cursor, WordTree*& master, WordKey& constraint, int proximity)
{
  const int is_master = (master == 0 || master == &cursor);
  if(master == 0)
    master = &cursor;

  const WordKey& masterKey = master->GetFound().Key();

  const int direction = key_semantic.LocationCompare(constraint, cursor.GetFound().Key(), proximity);

  if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorNear: LocationCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(constraint.Get()), (char*)(cursor.GetFound().Key().Get()), direction);

  //
  // The cursor is within the authorized locations: accept it and
  // let the caller move on to the next cursor.
  //
  if(direction == 0) {
    //
    // The master location is the reference from which the
    // locations allowed for the other words are derived.
    //
    if(is_master)
      key_semantic.LocationSet(cursor.GetFound().Key(), constraint);
    //
    // Widen the constraint by the proximity tolerance.
    //
    key_semantic.LocationNearLowest(constraint, proximity);
    return WORD_WALK_NEXT;
  }

  //
  // The constraint is ahead of the cursor: make the cursor leap
  // to the constraint.
  //
  if(direction > 0) {
    cursor.Seek(constraint);
    if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorNear: leap to %s\n", (char*)constraint.Get());
    const int ret = cursor.WalkNext();
    if(ret != OK)
      return ret;
    //
    // The master must match anywhere in the document; the
    // location is enforced for the other keys only.
    //
    if(is_master)
      key_semantic.Location2Document(constraint);
    //
    // Have the caller look at this cursor again.
    //
    return WORD_WALK_REDO;
  }

  //
  // The cursor is ahead of the constraint: nothing matches the
  // current constraint for this cursor.
  //
  if(direction < 0) {
    //
    // The cursor document becomes the current document and the
    // master cursor will have to catch up.
    //
    key_semantic.DocumentSet(cursor.GetDocument(), constraint);
    //
    // If the master already sits in that document (this cursor hit
    // in the same document at a higher location) the master location
    // must be pushed forward, otherwise it would match without
    // moving and loop forever.
    //
    if(!is_master && key_semantic.DocumentCompare(masterKey, constraint) == 0) {
      key_semantic.LocationSet(masterKey, constraint);
      key_semantic.LocationNext(constraint);
    }
    //
    // The constraint changed: start over with the first cursor.
    //
    return WORD_WALK_RESTART;
  }

  fprintf(stderr, "WordTreeOptional::WordCursorNear: reached unreachable statement\n");
  return NOTOK;
}
//
// Check an excluded <b>cursor</b> against <b>document</b>
// (WordKeySemantic::DocumentCompare) and tell WalkNext what to do.
//
//   negative, or cursor at end   WORD_WALK_NEXT: the cursor cannot
//                                match <b>document</b>.
//   positive                     the cursor is moved with Seek +
//                                WalkNext and WORD_WALK_REDO is
//                                returned; NOTOK if WalkNext returns
//                                neither OK nor WORD_WALK_ATEND.
//   equal                        WORD_WALK_RESTART: <b>document</b>
//                                holds an excluded word and is
//                                advanced with DocumentNext.
//
int WordTreeOptional::SearchCursorNot(WordTree& cursor, WordKey& document)
{
  const int direction = key_semantic.DocumentCompare(document, cursor.GetFound().Key());

  if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorNot: DocumentCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(document.Get()), (char*)(cursor.GetFound().Key().Get()), direction);

  //
  // The cursor is beyond the current document (a cursor at the end
  // of its walk is beyond every document): the document is
  // acceptable as far as this cursor is concerned.
  //
  if(direction < 0 || cursor.IsAtEnd())
    return WORD_WALK_NEXT;

  //
  // The cursor lags behind the current document: bring it up.
  //
  if(direction > 0) {
    cursor.Seek(document);
    if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorNot: leap to %s\n", (char*)document.Get());
    const int ret = cursor.WalkNext();
    if(ret != OK && ret != WORD_WALK_ATEND)
      return NOTOK;
    //
    // The cursor is expected to be at or beyond the document now;
    // another visit will tell.
    //
    return WORD_WALK_REDO;
  }

  //
  // The cursor points to the current document, which is therefore
  // not a possible match.
  //
  if(direction == 0) {
    //
    // The cursor gives no hint about the next candidate: simply
    // try the next possible document and start over.
    //
    key_semantic.DocumentNext(document, uniq);
    return WORD_WALK_RESTART;
  }

  fprintf(stderr, "WordTreeOptional::WordCursorNot: reached unreachable statement\n");
  return NOTOK;
}
//
// Check a mandatory <b>cursor</b> against <b>document</b>
// (WordKeySemantic::DocumentCompare) and tell WalkNext what to do.
//
//   equal     WORD_WALK_NEXT: the cursor is in <b>document</b>.
//   positive  the cursor is moved with Seek + WalkNext;
//             WORD_WALK_REDO if WalkNext returns OK, otherwise the
//             WalkNext return value.
//   negative  WORD_WALK_RESTART: <b>document</b> is replaced by the
//             document of the cursor.
//
// The <b>permutation</b> argument is not used.
//
int WordTreeOptional::SearchCursorAnd(WordTree& cursor, WordKey& document, WordExclude& permutation)
{
  const int direction = key_semantic.DocumentCompare(document, cursor.GetFound().Key());

  if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorAnd: DocumentCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(document.Get()), (char*)(cursor.GetFound().Key().Get()), direction);

  //
  // The cursor is in the current document: fine, next cursor.
  //
  if(direction == 0)
    return WORD_WALK_NEXT;

  //
  // The cursor lags behind the current document: bring it up.
  //
  if(direction > 0) {
    cursor.Seek(document);
    if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorAnd: leap to %s\n", (char*)document.Get());
    const int ret = cursor.WalkNext();
    return ret == OK ? WORD_WALK_REDO : ret;
  }

  //
  // The cursor is already beyond the current document and will
  // never reach it: the cursor document becomes the current
  // document and the visit starts over.
  //
  if(direction < 0) {
    key_semantic.DocumentSet(cursor.GetDocument(), document);
    return WORD_WALK_RESTART;
  }

  fprintf(stderr, "WordTreeOptional::WordCursorAnd: reached unreachable statement\n");
  return NOTOK;
}
//
// Tell if the non excluded operands can all be found near each other
// inside <b>document</b>.
//
// The first non excluded operand is the master: its location is the
// anchor from which the locations allowed for the other operands are
// derived. Operands are visited in order, excluded ones are skipped.
// The operands are moved (Seek / WalkNext) as a side effect.
//
// Returns OK if the proximity requirement is met (always the case
// when at most one operand is not excluded),
// WORD_SEARCH_NOPROXIMITY if an operand is at the end of its walk or
// has left <b>document</b>, NOTOK if WalkNext fails.
//
// NOTE(review): when the master itself compares above <i>location</i>
// (direction < 0 with i == master_index) the visit restarts with
// <i>location</i> unchanged -- confirm this case cannot repeat
// forever.
//
int WordTreeOptional::CursorsObeyProximity(WordKey& document)
{
//
// Run if more than one word is involved, proximity
// is always true if there is only one word.
//
if(permutation.NotExcludedCount() <= 1) return OK;
WordKey location;
//
// The first non excluded cursor contains anchor location.
//
unsigned int master_index = 0;
for(unsigned int i = 0; i < cursors_length; i++) {
if(!permutation.Excluded(i)) {
master_index = i;
break;
}
}
const WordKey& masterKey = cursors[master_index]->GetFound().Key();
key_semantic.DocumentSet(masterKey, location);
//
// The index is advanced explicitly: it stays put to examine a cursor
// again after it moved and goes back to 0 when the location changed.
//
for(unsigned int i = 0; i < cursors_length;) {
if(permutation.Excluded(i)) {
i++;
continue;
}
WordTree& cursor = *(cursors[i]);
if(cursor.IsAtEnd()) return WORD_SEARCH_NOPROXIMITY;
// if(cursor.status & WORD_WALK_FAILED) return NOTOK;
//
// If the cursor moved outside of the tested document,
// no proximity match is possible.
//
if(key_semantic.DocumentCompare(cursor.GetFound().Key(), document) != 0)
return WORD_SEARCH_NOPROXIMITY;
int direction = key_semantic.LocationCompare(location, cursor.GetFound().Key(), proximity);
//
// If the cursor is in the authorized locations, consider
// next cursor
//
if(direction == 0) {
//
// master cursor makes the rules for location : its location
// is the base to calculate other words mandatory locations.
//
if(i == master_index)
key_semantic.LocationSet(cursor.GetFound().Key(), location);
//
// Fix location constraint to accommodate proximity tolerance.
//
key_semantic.LocationNearLowest(location, proximity);
i++;
//
// If current location is greater than cursor location
//
} else if(direction > 0) {
//
// Move the cursor up to the location.
//
cursor.Seek(location);
if(verbose > 1) fprintf(stderr, "WordTreeOptional::CursorsObeyProximity: leap to %s\n", (char*)location.Get());
int ret;
if((ret = cursor.WalkNext()) != OK) {
if(ret == WORD_WALK_ATEND) {
return WORD_SEARCH_NOPROXIMITY;
} else {
return NOTOK;
}
}
//
// Remove the location constraint for the master word
// so that it matches and then enforce location for other
// keys.
//
if(i == master_index)
key_semantic.Location2Document(location);
//
// Reconsider the situation for this cursor (the index is
// deliberately left unchanged).
//
//
// If current location is lower than cursor location,
// meaning that the cursor found no match in the current
// document.
//
} else if(direction < 0) {
//
// Move to next master key, if possible.
//
if(i != master_index) {
key_semantic.LocationSet(masterKey, location);
key_semantic.LocationNext(location);
}
//
// Since the current location changed, start over.
//
i = 0;
}
}
return OK;
}
//
// Helper record for the AscendingFrequency method: pairs an operand
// with its number of occurrences so that both can be sorted together.
//
struct WordSort {
  // Number of occurrences reported by the operand.
  unsigned int frequency;
  // The operand itself (not owned by this record).
  WordTree *cursor;
};
//
// Helper function for AscendingFrequency method
//
static int ascending_frequency(const void *a, const void *b)
{
const WordSort& a_cursor = *(WordSort*)a;
const WordSort& b_cursor = *(WordSort*)b;
return a_cursor.frequency - b_cursor.frequency;
}
//
// Reorder <i>cursors</i> so that the operands with the fewest
// occurrences (as reported by Noccurrence) come first.
//
// Returns OK on success. Returns NOTOK if an operand fails to report
// its number of occurrences; <i>cursors</i> is left untouched in that
// case. An empty operand list is trivially sorted.
//
int WordTreeOptional::AscendingFrequency()
{
  //
  // Nothing to sort, and <i>cursors</i> may legitimately be null.
  //
  if(cursors_length == 0)
    return OK;

  //
  // Collect (frequency, operand) pairs. Every entry is filled before
  // it is read.
  //
  WordSort *tmp = new WordSort[cursors_length];

  unsigned int i;
  for(i = 0; i < cursors_length; i++) {
    unsigned int frequency;
    if(cursors[i]->Noccurrence(frequency) != OK) {
      delete [] tmp;
      return NOTOK;
    }
    if(verbose > 2) fprintf(stderr, "WordTreeOptional::AscendingFrequency: %s occurs %u times\n", (char*)cursors[i]->GetSearch().Get(), frequency);
    tmp[i].frequency = frequency;
    tmp[i].cursor = cursors[i];
  }

  qsort((void *)tmp, cursors_length, sizeof(WordSort), &ascending_frequency);

  //
  // Write the operands back in sorted order.
  //
  for(i = 0; i < cursors_length; i++)
    cursors[i] = tmp[i].cursor;

  delete [] tmp;
  return OK;
}
//
// Delete the operands that have no occurrence at all and compact
// <i>cursors</i>, preserving the order of the remaining operands.
// <i>cursors_length</i> is updated and the freed slots at the end of
// the array are set to null. <b>stripped</b> receives the number of
// operands deleted.
//
// Returns OK on success. Returns NOTOK if an operand fails to report
// its number of occurrences. All the counts are collected before the
// first deletion so that a failure leaves <i>cursors</i> exactly as
// it was (and <b>stripped</b> set to 0) instead of leaving pointers
// to deleted operands in the array.
//
int WordTreeOptional::StripNonExistent(unsigned int& stripped)
{
  stripped = 0;

  if(cursors_length == 0)
    return OK;

  //
  // First pass: query every operand, nothing is modified yet.
  //
  unsigned int* frequencies = new unsigned int[cursors_length];
  for(unsigned int i = 0; i < cursors_length; i++) {
    if(cursors[i]->Noccurrence(frequencies[i]) != OK) {
      delete [] frequencies;
      return NOTOK;
    }
    if(verbose > 2) fprintf(stderr, "WordTreeOptional::StripNonExistent: %s occurs %u times\n", (char*)cursors[i]->GetSearch().Get(), frequencies[i]);
  }

  //
  // Second pass: compact in place. <i>to</i> never runs ahead of
  // <i>from</i>, so no live entry is overwritten before it is read.
  //
  unsigned int to = 0;
  for(unsigned int from = 0; from < cursors_length; from++) {
    if(frequencies[from] > 0) {
      cursors[to++] = cursors[from];
    } else {
      delete cursors[from];
      stripped++;
    }
  }

  //
  // Clear the slots that are no longer in use.
  //
  for(unsigned int i = to; i < cursors_length; i++)
    cursors[i] = 0;
  cursors_length = to;

  delete [] frequencies;
  return OK;
}
// ************************* WordTreeOr implementation ********************
//
// Operator node matching the documents found by at least one of its
// non excluded operands. It never iterates over permutations and
// never uses proximity.
//
class WordTreeOr : public WordTreeOperand {
 public:
  WordTreeOr(const char* nscope)
    : WordTreeOperand(nscope)
  {
  }

  //-
  // Return WORD_TREE_OR
  //
  virtual int IsA() const
  {
    return WORD_TREE_OR;
  }

  virtual int Optimize();

  virtual int ContextSaveList(StringList& list) const;
  virtual int ContextRestoreList(StringList& list);

  virtual void SetInfo();

  virtual int WalkNext();

  //-
  // Always 0.
  //
  virtual int UsePermutation() const
  {
    return 0;
  }

  //-
  // Always WORD_PERMUTE_PROXIMITY_NO.
  //
  virtual int UseProximity() const
  {
    return WORD_PERMUTE_PROXIMITY_NO;
  }
};
//
// Optimize the operands, order them by ascending frequency, strip
// the ones that have no occurrence (the count of stripped operands
// is not used) and initialize the permutation with
// OptimizeOr(WORD_PERMUTE_PROXIMITY_NO).
// The first step returning something other than OK aborts the
// function with that value.
//
// NOTE(review): AscendingFrequency and StripNonExistent are declared
// by WordTreeOptional in this file while this class derives from
// WordTreeOperand -- confirm where these calls resolve.
//
int WordTreeOr::Optimize()
{
  int ret = WordTreeOperand::Optimize();
  if(ret != OK)
    return ret;

  ret = AscendingFrequency();
  if(ret != OK)
    return ret;

  unsigned int stripped;
  ret = StripNonExistent(stripped);
  if(ret != OK)
    return ret;

  return OptimizeOr(WORD_PERMUTE_PROXIMITY_NO);
}
//
// Save the context of the operands (WordTreeOperand::ContextSaveList)
// then append to <b>list</b> the <i>permutation</i> state and the
// WordCursor::ContextSave string of this node.
//
// Returns OK on success, otherwise the error code of the failing
// step. The String allocated for the cursor context is released if
// WordCursor::ContextSave fails, since it has not been handed to
// <b>list</b> at that point.
//
int WordTreeOr::ContextSaveList(StringList& list) const
{
  int ret;
  if((ret = WordTreeOperand::ContextSaveList(list)) != OK)
    return ret;

  {
    String* buffer = new String();
    permutation.Get(*buffer);
    list.Add(buffer);
  }

  {
    String* buffer = new String();
    if((ret = WordCursor::ContextSave(*buffer)) != OK) {
      delete buffer;
      return ret;
    }
    list.Add(buffer);
  }

  return OK;
}
//
// Restore the context of the operands
// (WordTreeOperand::ContextRestoreList) then consume from the head
// of <b>list</b> the <i>permutation</i> state and the key to Seek to;
// <i>cursor_get_flags</i> is then set to DB_NEXT.
// Each entry is removed from <b>list</b> once used.
//
// Returns OK on success, NOTOK if an expected entry is missing,
// otherwise the error code of the failing step.
//
int WordTreeOr::ContextRestoreList(StringList& list)
{
  int ret;
  if((ret = WordTreeOperand::ContextRestoreList(list)) != OK)
    return ret;

  {
    char* buffer = list[0];
    //
    // A truncated context must not reach permutation.Set as a
    // null pointer.
    //
    if(!buffer) return NOTOK;
    if((ret = permutation.Set(buffer)) != OK)
      return ret;
    list.Remove(0);
  }

  {
    char* buffer = list[0];
    if(!buffer) return NOTOK;
    WordKey key(buffer);
    if((ret = Seek(key)) != OK)
      return ret;
    cursor_get_flags = DB_NEXT;
    list.Remove(0);
  }

  return OK;
}
void WordTreeOr::SetInfo()
{
unsigned int i;
for(i = 0; i < cursors_length; i++)
cursors[i]->SetInfo();
info.trunc();
for(i = 0; i < cursors_length; i++) {
WordTree& cursor = *(cursors[i]);
if(!permutation.Excluded(i) &&
!cursor.IsAtEnd() &&
key_semantic.DocumentCompare(cursor.GetFound().Key(), GetFound().Key()) == 0) {
info << cursor.info << " ";
}
}
}
//
// Find the next document matched by at least one non excluded
// operand and by no excluded operand, and store it in <i>found</i>.
//
// <i>pos</i> is used as the current constraint. Each round moves the
// constraint to the next document (only if <i>cursor_get_flags</i> is
// DB_NEXT), seeks all operands to it and calls WalkNext on every non
// excluded operand; the lowest document reached is the candidate.
// If some operands are excluded, all operands are seeked to the
// candidate and WalkNext is called on every excluded operand: if one
// of them lands in the candidate document, the candidate becomes the
// constraint and another round is run.
//
// Returns OK when a match is found (<i>pos</i> is then set to the
// matching document), WORD_WALK_ATEND when no non excluded operand
// has anything left, or the first unexpected return value of Seek or
// of an operand WalkNext.
//
int WordTreeOr::WalkNext()
{
WordKey& constraint = pos;
//
// Set constraint with all 0
//
if(constraint.Empty())
key_semantic.DocumentClear(constraint);
WordKey candidate;
int match_ok;
do {
int ret;
unsigned int i;
candidate.Clear();
//
// Advance cursors so that next call fetches another constraint
//
if(cursor_get_flags == DB_NEXT)
key_semantic.DocumentNext(constraint, uniq);
if((ret = Seek(constraint)) != OK)
return ret;
match_ok = 1;
//
// All non excluded cursors are about to move
// at or beyond constraint. Search for the one (candidate) that
// is located at the lowest location beyond the constraint.
//
for(i = 0; i < cursors_length; i++) {
if(permutation.Excluded(i))
continue;
WordTree& cursor = *(cursors[i]);
switch((ret = cursor.WalkNext())) {
case WORD_WALK_ATEND:
//
// Constraint is above all matches for this cursor
//
break;
case OK:
//
// If candidate is not set or current cursor is below
// the current candidate, the current cursor document becomes
// the candidate.
//
if(candidate.Empty() ||
key_semantic.DocumentCompare(candidate, cursor.GetFound().Key()) > 0) {
key_semantic.DocumentSet(cursor.GetDocument(), candidate);
}
break;
default:
return ret;
break;
}
}
//
// No candidate ? It's the end of the match list.
//
if(candidate.Empty())
return WORD_WALK_ATEND;
found.Key().CopyFrom(candidate);
SetInfo();
if(permutation.ExcludedCount() > 0) {
if((ret = Seek(candidate)) != OK)
return ret;
//
// Restart loop if candidate matches an excluded cursor.
// The scan stops at the first excluded cursor that matches.
//
for(i = 0; i < cursors_length && match_ok; i++) {
if(!permutation.Excluded(i))
continue;
WordTree& cursor = *(cursors[i]);
switch((ret = cursor.WalkNext())) {
case WORD_WALK_ATEND:
//
// This excluded cursor can't match the candidate, fine.
//
break;
case OK:
//
// This excluded cursor matches candidate therefore it's
// not a valid candidate. Restart search with this candidate
// as the constraint.
//
if(key_semantic.DocumentCompare(candidate, cursor.GetFound().Key()) == 0) {
constraint = candidate;
match_ok = 0;
}
break;
default:
return ret;
break;
}
}
}
cursor_get_flags = DB_NEXT;
} while(!match_ok);
constraint = candidate;
return OK;
}
// ************************* WordTreeAnd implementation ********************
//
// WordTreeOptional variant in which every operand is required:
// no permutation, no proximity and all operands must have at least
// one occurrence.
//
class WordTreeAnd : public WordTreeOptional {
 public:
  WordTreeAnd(const char* nscope)
    : WordTreeOptional(nscope)
  {
  }

  //-
  // Return WORD_TREE_AND
  //
  virtual int IsA() const
  {
    return WORD_TREE_AND;
  }

  //-
  // Always 0.
  //
  virtual int UsePermutation() const
  {
    return 0;
  }

  //-
  // Always WORD_PERMUTE_PROXIMITY_NO.
  //
  virtual int UseProximity() const
  {
    return WORD_PERMUTE_PROXIMITY_NO;
  }

  //-
  // Always 1.
  //
  virtual int AllOrNothing() const
  {
    return 1;
  }
};
// ************************* WordTreeNear implementation ********************
//
// WordTreeOptional variant in which every operand is required and
// only proximity matches are considered: no permutation and all
// operands must have at least one occurrence.
//
class WordTreeNear : public WordTreeOptional {
 public:
  WordTreeNear(const char* nscope)
    : WordTreeOptional(nscope)
  {
  }

  //-
  // Return WORD_TREE_NEAR
  //
  virtual int IsA() const
  {
    return WORD_TREE_NEAR;
  }

  //-
  // Always 0.
  //
  virtual int UsePermutation() const
  {
    return 0;
  }

  //-
  // Always WORD_PERMUTE_PROXIMITY_ONLY.
  //
  virtual int UseProximity() const
  {
    return WORD_PERMUTE_PROXIMITY_ONLY;
  }

  //-
  // Always 1.
  //
  virtual int AllOrNothing() const
  {
    return 1;
  }
};
// ************************* WordTreeMandatory implementation ***************
//
// Marker node that only differs from WordTreeOperand by the value
// returned by IsA.
//
class WordTreeMandatory : public WordTreeOperand {
 public:
  WordTreeMandatory(const char* nscope)
    : WordTreeOperand(nscope)
  {
  }

  //-
  // Return WORD_TREE_MANDATORY
  //
  virtual int IsA() const
  {
    return WORD_TREE_MANDATORY;
  }
};
// ************************* WordTreeNot implementation ***************
//
// Marker node that only differs from WordTreeOperand by the value
// returned by IsA.
//
class WordTreeNot : public WordTreeOperand {
 public:
  WordTreeNot(const char* nscope)
    : WordTreeOperand(nscope)
  {
  }

  //-
  // Return WORD_TREE_NOT
  //
  virtual int IsA() const
  {
    return WORD_TREE_NOT;
  }
};
// ************************* WordMatch implementation ********************
//
// Return value of the Search method, tells us which document
// matched and why.
//
//
// One search result: the key of the matching document and a
// description of the match.
//
class WordMatch {
public:
//-
// Return a textual representation of the object: the textual form
// of <i>match</i>, followed by <i>info</i> between parentheses when
// <i>info</i> is not empty.
//
String Get() const;
//-
// The document that matched
//
WordKey match;
//-
// An ascii description of why it matched.
//
String info;
};
//
// Build the textual form of the match: the key as formatted by
// WordKey::Get, followed by "(<i>info</i>)" when <i>info</i> is not
// empty.
//
String WordMatch::Get() const
{
  String text;
  match.Get(text);
  if(info.empty())
    return text;
  text << "(" << info << ")";
  return text;
}
// ************************* WordSearch implementation ********************
//
// NAME
//
// Solve a query from a WordTree syntax tree
//
// SYNOPSIS
//
// #include <WordSearch.h>
//
// WordTree* expr = get_query();
// WordSearch search;
// search.expr = expr;
// search.limit_count = NUMBER_OF_RESULTS;
// WordMatch* matches = search.Search();
// ...
//
// DESCRIPTION
//
// The WordSearch class is a wrapper to query an inverted index
// using a WordTree syntax tree.
//
// END
//
class WordSearch {
 public:
  //-
  // Reset every input and output field to zero / empty.
  //
  WordSearch();

  //-
  // Run the query stored in the <b>expr</b> member.
  // The expression is optimized, the walk is initialized and the
  // context found in <i>context_in</i> is restored on it. SearchLoop
  // then skips <i>limit_bottom</i> entries and collects at most
  // <i>limit_count</i> matches. If SearchLoop returns OK the context
  // is saved in <i>context_out</i>.
  // The returned array is allocated with new[] and holds
  // <i>limit_count</i> + 1 entries; the caller must release it with
  // delete []. A null pointer is returned when no match is found
  // or when an error occurs.
  //
  WordMatch *Search();

  //
  // Search backend: only runs the WalkNext loop on <b>expr</b> and
  // fills the <i>matches</i> array, it does not allocate or
  // deallocate anything.
  //
  int SearchLoop(WordTree *expr);

  //
  // Context description string that may be given back as
  // <i>context_in</i> to resume the search where it stopped.
  //
  const String& Context() const
  {
    return context_out;
  }

  //
  // Input
  //
  unsigned int limit_bottom;   // number of leading matches to skip
  unsigned int limit_count;    // maximum number of matches to collect
  String context_in;           // context to restore before walking
  WordTree* expr;              // query to solve, not owned by this object

  //
  // Output
  //
  WordMatch* matches;          // array built by Search()
  unsigned int matches_size;   // number of entries allocated in matches
  unsigned int matches_length; // number of entries filled in matches
  String context_out;          // context saved by Search()
};
//
// Start with no query, no limits, no results and empty contexts.
//
WordSearch::WordSearch()
  : limit_bottom(0),
    limit_count(0),
    expr(0),
    matches(0),
    matches_size(0),
    matches_length(0)
{
  //
  // The String members are emptied explicitly, input first and
  // output second.
  //
  context_in.trunc();
  context_out.trunc();
}
//
// Solve the query held in the expr member and return the array of
// matches, or a null pointer if nothing matched or an error occurred.
//
WordMatch *WordSearch::Search()
{
  if(verbose) fprintf(stderr, "WordSearch::Search: non optimized expression %s\n", (char*)expr->Get());
  if(expr->Optimize() != OK)
    return 0;
  if(verbose) fprintf(stderr, "WordSearch::Search: optimized expression %s\n", (char*)expr->Get());

  //
  // Room for limit_count results plus one extra entry that
  // SearchLoop clears when the walk reaches the end.
  //
  matches_size = limit_count + 1;
  matches = new WordMatch[matches_size];
  matches_length = 0;

  //
  // Move to the first possible position and run the loop only
  // if both preparation steps succeed.
  //
  int status = 0;
  if(expr->WalkInit() == OK && expr->ContextRestore(context_in) != NOTOK) {
    status = SearchLoop(expr);
    //
    // The context is only worth saving when the loop stopped
    // with OK, not at end of search (WORD_WALK_ATEND) nor on
    // error (NOTOK). The outcome of ContextSave does not change
    // what is returned.
    //
    if(status == OK)
      expr->ContextSave(context_out);
  }

  expr->WalkFinish();

  //
  // Nothing to hand over: release the array and return null.
  //
  if(status == NOTOK || matches_length <= 0) {
    delete [] matches;
    matches = 0;
  }
  return matches;
}
//
// Skip limit_bottom matches, then store up to limit_count matches in
// the matches array. Returns the status of the last WalkNext call
// (OK if none was made).
//
int WordSearch::SearchLoop(WordTree *expr)
{
  int status = OK;

  //
  // Skip the first <limit_bottom> documents
  //
  for(unsigned int skipped = 0; skipped < limit_bottom; skipped++) {
    status = expr->WalkNext();
    if(status != OK)
      return status;
  }

  //
  // Get documents up to <limit_count> or exhaustion
  //
  matches_length = 0;
  while(matches_length < limit_count) {
    status = expr->WalkNext();
    if(status != OK)
      break;

    WordMatch& slot = matches[matches_length];
    slot.match = expr->GetDocument();
    //
    // The explanation is fetched for every node type except a literal.
    //
    if(expr->IsA() != WORD_TREE_LITERAL)
      slot.info = ((WordTreeOperand*)expr)->GetInfo();
    if(verbose) fprintf(stderr, "WordSearch::Search: match %s\n", (char*)slot.match.Get());

    matches_length++;
  }

  //
  // At end of walk, clear the key of the entry that follows
  // the last stored match.
  //
  if(status == WORD_WALK_ATEND)
    matches[matches_length].match.Clear();

  return status;
}
// ************************* WordParser implementation ********************
//
// NAME
//
// Textual query parser for test purpose
//
// SYNOPSIS
//
// #include <WordParser.h>
//
// WordParser parser;
// WordTree* expr = parser.Parse("( or \"scope1\" a query )");
// ...
// delete expr;
//
// DESCRIPTION
//
// The WordParser class implements a lisp-like parser for queries
// implemented by the WordTree derived classes. The syntax is rigid
// and should not be used for human input. The generic syntax of an
// expression is
// <pre>
// ( operator "scope" operand [operand ...] )
// </pre>
// The parenthesis must <b>always</b> be surrounded by white space otherwise
// the parser will be lost. The separator is white space and newline.
// Tabulation may be used in scope to separate key fields.
//
// As a special case a single word is strictly equivalent
// to
// <pre>
// ( literal "" word )
// </pre>
//
// Operators can be lower case or upper case. There is almost no syntax
// checking and it's the responsibility of the caller to associate meaningful
// operands. For instance ( near ( not foo ) bar ) is meaningless.
//
// OPERATORS
//
// <dl>
//
// <dt> optional
// <dd> WordTreeOptional
//
// <dt> or
// <dd> WordTreeOr
//
// <dt> and
// <dd> WordTreeAnd
//
// <dt> near
// <dd> WordTreeNear
//
// <dt> not,forbiden
// <dd> WordTreeNot
//
// <dt> mandatory
// <dd> WordTreeMandatory
//
// <dt> literal
// <dd> WordTreeLiteral
//
// </dl>
//
//
// END
//
// Possible values of the info argument of ParseOperands
//
#define WORD_TREE_MANY 0x01
#define WORD_TREE_ONE 0x02
#define WORD_TREE_TWO 0x04

class WordParser {
 public:
  //
  // Split <b>expr</b> on space and newline and parse the
  // resulting list of terms.
  //
  WordTree *Parse(const String& expr);

  //
  // Parse one expression from <b>terms</b> (wrapper of ParseExpr).
  //
  WordTree *ParseList(StringList& terms);

  //
  // Parse the expression starting at the first term. Returns a
  // null pointer when the first term is a close parenthesis.
  //
  WordTree *ParseExpr(StringList& terms);

  //
  // Parse a mandatory / not / forbiden expression.
  //
  WordTree *ParseUnary(StringList& terms);

  //
  // Parse an optional / or / and / near expression.
  //
  WordTree *ParseConj(StringList& terms);

  //
  // Parse operands up to the close parenthesis and store them in
  // <b>expr</b>. <b>info</b> is a combination of WORD_TREE_MANY,
  // WORD_TREE_ONE and WORD_TREE_TWO.
  //
  void ParseOperands(StringList& terms, int info, WordTreeOperand* expr);

  //
  // Parse a single word or a ( literal "scope" word ) expression.
  //
  WordTree *ParseLiteral(StringList& terms);

  //
  // Consume the first term and return a malloc'ed copy of it
  // without its surrounding double quotes.
  //
  char *ParseScope(StringList& terms);

  //
  // Remove the first term from <b>terms</b>.
  //
  void Shift(StringList& terms);

  //
  // Return the first term of <b>terms</b>, exit(1) if there is none.
  //
  char *Term(StringList& terms);
};
//
// Tokenize the textual query on space and newline, then parse it.
//
WordTree *WordParser::Parse(const String& expr)
{
  StringList tokens(expr, " \n");
  WordTree *tree = ParseList(tokens);
  return tree;
}
//
// Parse the expression found at the head of the term list.
//
WordTree *WordParser::ParseList(StringList& terms)
{
  return ParseExpr(terms);
}
//
// Look at the first term and dispatch to the matching parser.
// A close parenthesis is left in place and yields a null pointer.
//
WordTree *WordParser::ParseExpr(StringList& terms)
{
  //
  // Work on a private copy: the term stored in the list is
  // destroyed as soon as it is shifted.
  //
  char* token = strdup(Term(terms));
  WordTree *node = 0;

  if(strcmp(token, "(") == 0) {
    //
    // Drop the open parenthesis and parse what follows
    //
    Shift(terms);
    node = ParseExpr(terms);
  } else if(strcmp(token, ")") != 0) {
    int is_conj = !mystrcasecmp(token, "optional") ||
                  !mystrcasecmp(token, "or") ||
                  !mystrcasecmp(token, "and") ||
                  !mystrcasecmp(token, "near");
    int is_unary = !mystrcasecmp(token, "not") ||
                   !mystrcasecmp(token, "mandatory") ||
                   !mystrcasecmp(token, "forbiden");
    if(is_conj)
      node = ParseConj(terms);
    else if(is_unary)
      node = ParseUnary(terms);
    else
      node = ParseLiteral(terms);
  }
  //
  // else: at end of expression, return null
  //

  free(token);
  return node;
}
//
// Parse ( mandatory "scope" operand ) or ( not|forbiden "scope" operand ).
// The first term must be the operator keyword.
//
WordTree *WordParser::ParseUnary(StringList& terms)
{
  //
  // Identify the operator before shifting, the term is destroyed
  // by Shift.
  //
  char* keyword = Term(terms);
  int op = 0;
  if(mystrcasecmp(keyword, "mandatory") == 0) {
    op = WORD_TREE_MANDATORY;
  } else if(mystrcasecmp(keyword, "forbiden") == 0 ||
            mystrcasecmp(keyword, "not") == 0) {
    op = WORD_TREE_NOT;
  }
  Shift(terms);

  char* scope = ParseScope(terms);

  WordTreeOperand *node = 0;
  if(op == WORD_TREE_MANDATORY) {
    node = new WordTreeMandatory(scope);
  } else if(op == WORD_TREE_NOT) {
    node = new WordTreeNot(scope);
  } else {
    fprintf(stderr, "WordParser::ParseUnary: unexpected operator %d\n", op);
    exit(1);
  }
  free(scope);

  //
  // Exactly one operand is accepted
  //
  ParseOperands(terms, WORD_TREE_ONE, node);
  return node;
}
//
// Parse ( optional|or|and|near "scope" operand [operand ...] ).
// The first term must be the operator keyword (case insensitive).
// Returns the new node, whose operands are filled by ParseOperands.
// Exits with status 1 if the keyword is not one of the four operators.
//
WordTree *WordParser::ParseConj(StringList& terms)
{
  //
  // Identify the operator before shifting, the term is destroyed
  // by Shift.
  //
  char* keyword = Term(terms);
  int op = 0;
  if(!mystrcasecmp(keyword, "optional"))
    op = WORD_TREE_OPTIONAL;
  else if(!mystrcasecmp(keyword, "or"))
    op = WORD_TREE_OR;
  else if(!mystrcasecmp(keyword, "and"))
    op = WORD_TREE_AND;
  else if(!mystrcasecmp(keyword, "near"))
    op = WORD_TREE_NEAR;
  Shift(terms);

  //
  // The scope is a malloc'ed copy, released once the node is built.
  //
  char* scope = ParseScope(terms);
  WordTreeOperand *expr = 0;
  switch(op) {
  case WORD_TREE_OR:
    expr = new WordTreeOr(scope);
    break;
  case WORD_TREE_OPTIONAL:
    expr = new WordTreeOptional(scope);
    break;
  case WORD_TREE_AND:
    expr = new WordTreeAnd(scope);
    break;
  case WORD_TREE_NEAR:
    expr = new WordTreeNear(scope);
    break;
  default:
    //
    // The message names this method (it used to mention a
    // ParseOrAnd method that does not exist).
    //
    fprintf(stderr, "WordParser::ParseConj: unexpected operator %d\n", op);
    exit(1);
    break;
  }
  free(scope);

  //
  // Any number of operands is accepted
  //
  ParseOperands(terms, WORD_TREE_MANY, expr);
  return expr;
}
//
// Parse sub-expressions until ParseExpr returns null (close
// parenthesis), then consume the close parenthesis and store the
// operands in expr->cursors / expr->cursors_length.
//
// info restricts the number of operands: WORD_TREE_ONE accepts at
// most one, WORD_TREE_TWO at most two, otherwise there is no limit.
// The operands array is allocated with malloc/realloc and handed
// over to expr.
//
// Exits with status 1 on too many operands, on a missing close
// parenthesis or when memory cannot be allocated.
//
void WordParser::ParseOperands(StringList& terms, int info, WordTreeOperand* expr)
{
  unsigned int operands_length = 0;
  unsigned int operands_size = 1;
  WordTree **operands = (WordTree**)malloc(operands_size * sizeof(WordTree*));
  if(!operands) {
    fprintf(stderr, "WordParser::ParseOperands: out of memory\n");
    exit(1);
  }

  WordTree *subexpr = 0;
  while((subexpr = ParseExpr(terms))) {
    operands_length++;
    if((info & WORD_TREE_ONE) && operands_length > 1) {
      fprintf(stderr, "WordParser::ParseOperands: expected only one operands\n");
      exit(1);
    } else if((info & WORD_TREE_TWO) && operands_length > 2) {
      fprintf(stderr, "WordParser::ParseOperands: expected only two operands\n");
      exit(1);
    }
    if(operands_length > operands_size) {
      //
      // Grow through a temporary: realloc returns null on failure
      // and leaves the original block untouched.
      //
      operands_size = operands_length * 2;
      WordTree **grown = (WordTree**)realloc(operands, operands_size * sizeof(WordTree*));
      if(!grown) {
        fprintf(stderr, "WordParser::ParseOperands: out of memory\n");
        exit(1);
      }
      operands = grown;
    }
    operands[operands_length - 1] = subexpr;
  }

  //
  // Discard close parenthesis
  //
  if(strcmp(Term(terms), ")")) {
    fprintf(stderr, "WordParser::ParseOperands: expected close parenthesis\n");
    exit(1);
  }
  Shift(terms);

  expr->cursors = operands;
  expr->cursors_length = operands_length;
}
//
// Parse either a bare word (empty scope) or the long form
// literal "scope" word. In both cases one more term is shifted
// after the node is built: the bare word itself, or the term that
// follows the word in the long form.
//
WordTree *WordParser::ParseLiteral(StringList& terms)
{
  char* word = 0;
  char* scope = 0;

  if(mystrcasecmp(Term(terms), "literal") == 0) {
    //
    // Long form: skip the keyword, read the scope, copy the word
    // and remove it from the list.
    //
    Shift(terms);
    scope = ParseScope(terms);
    word = strdup(Term(terms));
    Shift(terms);
  } else {
    //
    // Short form: the term is the word, it is removed below.
    //
    word = strdup(Term(terms));
    scope = strdup("");
  }

  WordTreeLiteral *leaf = new WordTreeLiteral(word, scope);
  Shift(terms);

  free(scope);
  free(word);
  return leaf;
}
//
// Return a malloc'ed copy of the first term stripped of one trailing
// and one leading double quote (if present), then remove the term
// from the list. The caller must free the result.
//
char *WordParser::ParseScope(StringList& terms)
{
  char *token = Term(terms);
  char *start = token;
  int length = strlen(token);

  //
  // Remove surrounding quotes, if any. The trailing quote is cut
  // first, directly in the term, so that a term made of a single
  // quote ends up empty.
  //
  if(length > 0) {
    if(token[length - 1] == '"') {
      length--;
      token[length] = '\0';
    }
    if(*start == '"')
      start++;
  }

  //
  // Copy before shifting: Shift destroys the term.
  //
  char *copy = strdup(start);
  Shift(terms);
  return copy;
}
//
// Return the first term of the list without removing it.
// Running out of terms is fatal: a message is printed and the
// program exits with status 1.
//
char *WordParser::Term(StringList& terms)
{
  char *head = terms[0];
  if(head)
    return head;

  fprintf(stderr, "WordParser::Term: unexpected end of expression\n");
  exit(1);
  return head;
}
//
// Drop the first term of the list; LIST_REMOVE_DESTROY asks the
// list to destroy the removed term.
//
void WordParser::Shift(StringList& terms)
{
  terms.Shift(LIST_REMOVE_DESTROY);
}
// ************************* main loop implementation ********************
//
// Store all options from the command line
//
class params_t
{
 public:
  char* dbfile;         // -B : db file name, main() defaults it to "test"
  char* find;           // -f : search expression
  unsigned int bottom;  // -b : number of documents to skip
  unsigned int count;   // -c : maximum number of documents, main() defaults it to 10
  char* context;        // -C : context to resume from, null if not given
  int uniq_server;      // -S : set to SERVER when given, 0 otherwise
  int proximity;        // -P : proximity tolerance
  int nop;              // -n : only parse the expression and print it
  int exclude;          // -e : run exclude_test() and exit
};
//
// Print the command line help on stdout and terminate the
// program with exit(1).
//
static void usage();
//
// Self test of the WordExclude and WordExcludeMask classes
// (-e option). Calls exit(1) on the first mismatch.
//
static void exclude_test();
//
// Command line entry point.
//
// Parses the options into a params_t, then either runs the
// WordExclude self tests (-e), prints the parsed expression (-n)
// or solves the query given with -f against the index named by -B
// and prints one "match: ..." line per result, followed by a
// "context: ..." line when a resume context is available.
//
int main(int ac, char **av)
{
  int c;
  extern char *optarg;
  params_t params;

  params.dbfile = strdup("test");
  params.find = 0;
  params.bottom = 0;
  params.count = 10;
  params.context = 0;
  params.uniq_server = 0;
  params.proximity = WORD_SEARCH_DEFAULT_PROXIMITY;
  params.nop = 0;
  params.exclude = 0;

  while ((c = getopt(ac, av, "vB:f:b:c:C:SP:ne")) != -1)
    {
      switch (c)
        {
        case 'v':
          verbose++;
          break;
        case 'B':
          free(params.dbfile);
          params.dbfile = strdup(optarg);
          break;
        case 'f':
          //
          // Release the previous value when the option is repeated,
          // as done for -B (free of a null pointer is a no-op).
          //
          free(params.find);
          params.find = strdup(optarg);
          break;
        case 'b':
          params.bottom = (unsigned int)atoi(optarg);
          break;
        case 'c':
          params.count = (unsigned int)atoi(optarg);
          break;
        case 'C':
          //
          // Same as -f: do not leak the value of a previous -C.
          //
          free(params.context);
          params.context = strdup(optarg);
          break;
        case 'P':
          params.proximity = atoi(optarg);
          break;
        case 'S':
          params.uniq_server = SERVER;
          break;
        case 'n':
          params.nop = 1;
          break;
        case 'e':
          params.exclude = 1;
          break;
        case '?':
          usage();
          break;
        }
    }

  //
  // Self tests do not need an expression nor an index.
  //
  if(params.exclude) {
    exclude_test();
    exit(0);
  }

  //
  // Everything else requires -f; usage() does not return.
  //
  if(!params.find)
    usage();

  Configuration* config = WordContext::Initialize();
  if(!config) {
    fprintf(stderr, "search: no config file found\n");
    exit(1);
  }

  //
  // Forward command line verbosity to htword library.
  //
  if(verbose > 1) {
    String tmp;
    tmp << (verbose - 1);
    config->Add("wordlist_verbose", tmp);
  }

  //
  // Prepare the index (-B).
  //
  WordList words(*config);
  words.Open(params.dbfile, O_RDONLY);

  //
  // Try the query parser alone
  //
  if(params.nop) {
    WordTree* expr = WordParser().Parse(params.find);
    printf("%s\n", (char*)expr->Get());
    exit(0);
  }

  //
  // Build a syntax tree from the expression provided by user
  //
  WordTree* expr = WordParser().Parse(params.find);

  //
  // Define the semantic of the key
  //
  {
#define DOCUMENT_LENGTH 3
    static int document[DOCUMENT_LENGTH] = {
      TAG,
      SERVER,
      URL
    };
    int document_length = DOCUMENT_LENGTH;
    int location = LOCATION;
    if(expr->Prepare(&words, params.uniq_server, params.proximity, document, document_length, location) != OK)
      exit(1);
  }

  WordSearch* search = new WordSearch();

  //
  // Forward query options to WordSearch object
  //
  search->limit_bottom = params.bottom; // -b
  search->limit_count = params.count; // -c
  if(params.context) // -C
    search->context_in.set(params.context, strlen(params.context));

  //
  // Perform the search (-f)
  //
  search->expr = expr;
  WordMatch* matches = search->Search();

  //
  // Display results, if any. The list of results stops at the
  // first entry whose key is empty.
  //
  if(matches) {
    int i;
    for(i = 0; !matches[i].match.Empty(); i++)
      printf("match: %s\n", (char*)matches[i].Get());
    const String& context = search->Context();
    if(!context.empty())
      printf("context: %s\n", (const char*)context);
    delete [] matches;
  } else {
    printf("match: none\n");
  }

  //
  // Cleanup
  //
  delete search;
  free(params.context);
  free(params.find);
  free(params.dbfile);
  delete expr;
  words.Close();
  delete config;
}
//
// Self test of WordExclude and WordExcludeMask, run by the -e option.
// Each section iterates with Next() and compares Mask() with a table
// of expected values; the first mismatch prints a message on stderr
// and calls exit(1). The function returns normally when every check
// passes.
//
static void exclude_test()
{
//
// Expected masks for 5 bits: the 5 values with one bit set, then
// the 10 values with two bits set, the 10 values with three bits
// set, the 5 values with four bits set and finally 0x1f
// (31 entries in all).
//
static unsigned int expected[] = {
0x00000001,
0x00000002,
0x00000004,
0x00000008,
0x00000010,
0x00000003,
0x00000005,
0x00000006,
0x00000009,
0x0000000a,
0x0000000c,
0x00000011,
0x00000012,
0x00000014,
0x00000018,
0x00000007,
0x0000000b,
0x0000000d,
0x0000000e,
0x00000013,
0x00000015,
0x00000016,
0x00000019,
0x0000001a,
0x0000001c,
0x0000000f,
0x00000017,
0x0000001b,
0x0000001d,
0x0000001e,
0x0000001f
};
//
// WordExclude
//
if(verbose) fprintf(stderr, "exclude_test: testing WordExclude\n");
{
WordExclude exclude;
exclude.Initialize(5);
int count = 0;
// NOTE(review): expected[count] is read before count is checked
// against the table size; this relies on Next() stopping after at
// most 31 iterations.
while(exclude.Next() == WORD_EXCLUDE_OK) {
if(expected[count] != exclude.Mask()) {
fprintf(stderr, "exclude_test: WordExclude iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask());
exit(1);
}
count++;
}
// The number of iterations must match the table size exactly.
if(count != sizeof(expected)/sizeof(unsigned int)) {
fprintf(stderr, "exclude_test: WordExclude expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count);
exit(1);
}
}
//
// WordExcludeMask without ignore bits behaves exactly the same
// as WordExclude.
//
if(verbose) fprintf(stderr, "exclude_test: testing WordExcludeMask behaving like WordExclude\n");
{
WordExcludeMask exclude;
exclude.Initialize(5, 0, 0);
int count = 0;
while(exclude.Next() == WORD_EXCLUDE_OK) {
if(expected[count] != exclude.Mask()) {
fprintf(stderr, "exclude_test: WordExcludeMask 1 iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask());
exit(1);
}
count++;
}
if(count != sizeof(expected)/sizeof(unsigned int)) {
fprintf(stderr, "exclude_test: WordExcludeMask 1 expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count);
exit(1);
}
}
//
// WordExcludeMask
//
if(verbose) fprintf(stderr, "exclude_test: testing WordExcludeMask\n");
{
//
// This table hides the outer expected[] inside this block.
// Every value has 0x100 set and otherwise only uses the bits
// 0x02, 0x08, 0x20 and 0x80, i.e. the bits that are not part of
// the ignore value 0x155 used below (15 entries).
//
static unsigned int expected[] = {
0x00000102,
0x00000108,
0x00000120,
0x00000180,
0x0000010a,
0x00000122,
0x00000128,
0x00000182,
0x00000188,
0x000001a0,
0x0000012a,
0x0000018a,
0x000001a2,
0x000001a8,
0x000001aa
};
//
// Expected result of Excluded(7) for each iteration; the entry
// is 1 exactly when the matching expected[] value has 0x02 set.
//
static unsigned int excluded[] = {
1,
0,
0,
0,
1,
1,
0,
1,
0,
0,
1,
1,
1,
0,
1
};
WordExcludeMask exclude;
unsigned int ignore = 0x155;
unsigned int ignore_mask = 0x100;
exclude.Initialize(9, ignore, ignore_mask);
if(verbose) {
fprintf(stderr, "exclude_test: ignore\n");
show_bits(ignore);
fprintf(stderr, "exclude_test: ignore_mask\n");
show_bits(ignore_mask);
}
if(exclude.NotExcludedCount() != 8) {
fprintf(stderr, "exclude_test: WordExcludeMask 2 expected NoExcludedCount = 8 but got %d\n", exclude.NotExcludedCount());
exit(1);
}
int count = 0;
while(exclude.Next() == WORD_EXCLUDE_OK) {
if(expected[count] != exclude.Mask()) {
fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask());
exit(1);
}
//
// Test Excluded() method on ignored bit
// Is bit 5 set ? (9 - 4) = 5 (counting from 1)
//
if(exclude.Excluded(4)) {
fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d bit 5 was set 0x%08x\n", count, exclude.Mask());
exit(1);
}
//
// Test Excluded() method on variable bit
// Is bit 2 set ? (9 - 7) = 2 (counting from 1)
//
if((exclude.Excluded(7) && !excluded[count]) ||
(!exclude.Excluded(7) && excluded[count])) {
fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d expected bit 2 %s but was %s in 0x%08x\n", count, (excluded[count] ? "set" : "not set"), (excluded[count] ? "not set" : "set"), expected[count]);
exit(1);
}
count++;
}
if(count != sizeof(expected)/sizeof(unsigned int)) {
fprintf(stderr, "exclude_test: WordExcludeMask 2 expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count);
exit(1);
}
}
//
// WordExclude: Set() followed by Get() must give back the same
// ascii string, and Mask() must be 0x2b for "110101".
//
{
WordExclude exclude;
String ascii("110101");
String tmp;
exclude.Set(ascii);
exclude.Get(tmp);
if(tmp != ascii) {
fprintf(stderr, "exclude_test: WordExclude::Get/Set expected %s but got %s\n", (char*)ascii, (char*)tmp);
exit(1);
}
if(exclude.Mask() != 0x2b) {
fprintf(stderr, "exclude_test: WordExclude::Mask expected 0x2b but got 0x%02x\n", exclude.Mask());
exit(1);
}
}
//
// WordExcludeMask: same round trip with the ascii string "12031",
// for which Mask() must be 0x19.
//
{
WordExcludeMask exclude;
String ascii("12031");
String tmp;
exclude.Set(ascii);
exclude.Get(tmp);
if(tmp != ascii) {
fprintf(stderr, "exclude_test: WordExcludeMask::Get/Set expected %s but got %s\n", (char*)ascii, (char*)tmp);
exit(1);
}
if(exclude.Mask() != 0x19) {
fprintf(stderr, "exclude_test: WordExcludeMask::Mask expected 0x19 but got 0x%02x\n", exclude.Mask());
exit(1);
}
}
}
// *****************************************************************************
// void usage()
// Display program usage information
//
static void usage()
{
  //
  // One entry per output line, printed in order on stdout.
  //
  static const char* const help[] = {
    "usage:\tsearch -f words [options]\n",
    "\tsearch -e\n",
    "Options:\n",
    "\t-v\t\tIncreases the verbosity.\n",
    "\t-B dbfile\tUse <dbfile> as a db file name (default test).\n",
    "\t-f expr\t\tLisp like search expression.\n",
    "\t\t\tSee WordParser comments in source for more information.\n",
    "\t-b number\tSkip number documents before retrieving.\n",
    "\t-c number\tRetrieve number documents at most.\n",
    "\t-n\t\tOnly parse the search expression and print it.\n",
    "\t-P proximity\tUse with near/optional, proximity tolerance is <proximity>\n",
    "\t\t\tif negative order of terms is not meaningful\n",
    "\t\t\t(default 1).\n",
    "\t-C context\tResume search at <context>.\n",
    "\t-S\t\tReturn at most one match per server.\n",
    "\n",
    "\t-e\t\tRun tests on WordExclude and WordExcludeMask.\n"
  };
  const unsigned int help_length = sizeof(help) / sizeof(help[0]);

  for(unsigned int line = 0; line < help_length; line++)
    printf("%s", help[line]);

  //
  // Asking for help is reported as a failure to the caller.
  //
  exit(1);
}