You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3544 lines
89 KiB
3544 lines
89 KiB
3 years ago
|
//
|
||
|
// search.cc
|
||
|
//
|
||
|
// search: Sample implementation of search algorithms using
|
||
|
// a mifluz inverted index.
|
||
|
//
|
||
|
// Each class is documented in the class definition. Before
|
||
|
// each method declaration a comment explains the semantic of
|
||
|
// the method. In the method definition comments in the code
|
||
|
// may contain additional information.
|
||
|
//
|
||
|
// Each virtual function is documented in the base class, not
|
||
|
// in the derived classes except for semantic differences.
|
||
|
//
|
||
|
// The class tree is:
|
||
|
//
|
||
|
// WordKeySemantic
|
||
|
//
|
||
|
// WordExclude
|
||
|
// WordExcludeMask
|
||
|
// WordPermute
|
||
|
//
|
||
|
// WordSearch
|
||
|
//
|
||
|
// WordMatch
|
||
|
//
|
||
|
// WordTree
|
||
|
// WordTreeOperand
|
||
|
// WordTreeOptional
|
||
|
// WordTreeOr
|
||
|
// WordTreeAnd
|
||
|
// WordTreeNear
|
||
|
// WordTreeMandatory
|
||
|
// WordTreeNot
|
||
|
// WordTreeLiteral
|
||
|
//
|
||
|
// WordParser
|
||
|
//
|
||
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
||
|
// Copyright (c) 1999-2004 The ht://Dig Group
|
||
|
// For copyright details, see the file COPYING in your distribution
|
||
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
||
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
||
|
//
|
||
|
// $Id: search.cc,v 1.9 2004/05/28 13:15:29 lha Exp $
|
||
|
//
|
||
|
|
||
|
#ifdef HAVE_CONFIG_H
|
||
|
#include <htconfig.h>
|
||
|
#endif /* HAVE_CONFIG_H */
|
||
|
|
||
|
#ifdef HAVE_UNISTD_H
|
||
|
#include <unistd.h>
|
||
|
#endif /* HAVE_UNISTD_H */
|
||
|
|
||
|
// If we have this, we probably want it.
|
||
|
#ifdef HAVE_GETOPT_H
|
||
|
#include <getopt.h>
|
||
|
#endif /* HAVE_GETOPT_H */
|
||
|
#ifdef HAVE_MALLOC_H
|
||
|
#include <malloc.h>
|
||
|
#endif /* HAVE_MALLOC_H */
|
||
|
#include <stdlib.h>
|
||
|
|
||
|
#include <htString.h>
|
||
|
#include <WordList.h>
|
||
|
#include <WordContext.h>
|
||
|
#include <WordCursor.h>
|
||
|
|
||
|
//
|
||
|
// Verbosity level set with -v (++)
|
||
|
//
|
||
|
static int verbose = 0;
|
||
|
|
||
|
// ************************* Document definition implementation ***********
|
||
|
|
||
|
#define TAG 1
|
||
|
#define SERVER 2
|
||
|
#define URL 3
|
||
|
#define LOCATION 4
|
||
|
|
||
|
// *********************** WordKeySemantic implementation ********************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// encapsulate WordKey semantic for document and location
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordKeySemantic.h>
|
||
|
//
|
||
|
// #define SERVER 1
|
||
|
// #define URL 2
|
||
|
// #define LOCATION 3
|
||
|
//
|
||
|
// static int document[] = {
|
||
|
// SERVER,
|
||
|
// URL
|
||
|
// };
|
||
|
//
|
||
|
// WordKeySemantic semantic;
|
||
|
// semantic.Initialize(document, sizeof(document)/sizeof(int), LOCATION);
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// Encapsulate the semantic of a WordKey object fields. It defines
|
||
|
// what a document and a location are. It implements the set of
|
||
|
// operation that a search needs to perform given the fact that it
|
||
|
// implements a search whose purpose is to retrieve a document and
|
||
|
// wants to implement proximity search based on a word location.
|
||
|
//
|
||
|
//
|
||
|
// END
|
||
|
//
|
||
|
// A document is a set of fields in a given order.
|
||
|
// A location is a field.
|
||
|
// The actual fields used to implement WordKeySemantic methods are
|
||
|
// set with the Initialize method.
|
||
|
//
|
||
|
class WordKeySemantic {
|
||
|
public:
|
||
|
WordKeySemantic();
|
||
|
~WordKeySemantic();
|
||
|
|
||
|
//-
|
||
|
// Set the actual field numbers that define what a document is and
|
||
|
// what a location is. The <b>document_arg<b> is a list of WordKey field
|
||
|
// positions of length <b>document_length_arg</b> that must be adjacent.
|
||
|
// The <b>location_arg</b> is the WordKey field position of the word
|
||
|
// location within a document.
|
||
|
// Return OK on success, NOTOK on failure.
|
||
|
//
|
||
|
int Initialize(int* document_arg, int document_length_arg, int location_arg);
|
||
|
|
||
|
//
|
||
|
// These functions and only these know what a document is.
|
||
|
// This should really be a class containing function pointers and be
|
||
|
// given as argument to the search algorithm.
|
||
|
//
|
||
|
//-
|
||
|
// Copy the document in <b>from</b> into <b>to.</b>
|
||
|
//
|
||
|
void DocumentSet(const WordKey& from, WordKey& to);
|
||
|
//-
|
||
|
// Increment the document in <b>key</b> using the <i>SetToFollowing</i>
|
||
|
// method of WordKey. <b>uniq</b> is the WordKey position at which the
|
||
|
// increment starts.
|
||
|
//
|
||
|
void DocumentNext(WordKey& key, int uniq);
|
||
|
//-
|
||
|
// Compare the document fields defined in both <b>a</b> and <b>b</b>
|
||
|
// and return the difference a - b, as in strcmp. If all document
|
||
|
// fields in <b>a</b> or <b>b</b> are undefined return 1.
|
||
|
//
|
||
|
int DocumentCompare(const WordKey& a, const WordKey& b);
|
||
|
//-
|
||
|
// Set all document fields to 0.
|
||
|
//
|
||
|
int DocumentClear(WordKey& key);
|
||
|
|
||
|
//
|
||
|
// These functions and only these know what a location is.
|
||
|
// This should really be a class containing function pointers and be
|
||
|
// given as argument to the search algorithm.
|
||
|
//
|
||
|
//-
|
||
|
// Copy the document and location in <b>from</b> into <b>to.</b>
|
||
|
//
|
||
|
void LocationSet(const WordKey& from, WordKey& to);
|
||
|
//-
|
||
|
// Increment the document and location in <b>key</b>
|
||
|
// using the <i>SetToFollowing</i>
|
||
|
// method of WordKey.
|
||
|
//
|
||
|
void LocationNext(WordKey& key);
|
||
|
//-
|
||
|
// Compare <b>expected</b> location to <b>actual</b> location. Compares equal
|
||
|
// as long as expected location is at a maximum distance of <b>proximity</b>
|
||
|
// of actual. If <b>actual</b> only has undefined field, return > 0.
|
||
|
// <b>expected</b> must always be the lowest possible bound.
|
||
|
// <b>actual</b> is tolerated if it is greater than <b>actual</b> but not
|
||
|
// greater than <b>proximity</b> if <b>proximity</b> > 0 or abs(<b>proximity</b>) * 2 if
|
||
|
// <b>proximity</b> < 0.
|
||
|
// Return the difference expected - actual.
|
||
|
//
|
||
|
int LocationCompare(const WordKey& expected, const WordKey& actual, int proximity = 0);
|
||
|
//-
|
||
|
// <b>key</b> is the expected location of a searched key.
|
||
|
// LocationNearLowest modifies <b>key</b> to add tolerance accroding to
|
||
|
// <b>proximity</b>.
|
||
|
//
|
||
|
// The idea is that <b>key</b> will be the lowest possible match for
|
||
|
// for the <b>proximity</b> range. If <proxmity> is positive, <b>key</b>
|
||
|
// is already the lowest possible match since we accept [0 proximity].
|
||
|
// If <b>proximity</b> is negative, substract it since we accept
|
||
|
// [-proximity proximity].
|
||
|
//
|
||
|
// For better understanding see the functions in which it is used.
|
||
|
//
|
||
|
void LocationNearLowest(WordKey& key, int proximity);
|
||
|
|
||
|
//-
|
||
|
// Undefined the location field in <b>key.</b>.
|
||
|
//
|
||
|
void Location2Document(WordKey& key);
|
||
|
|
||
|
protected:
|
||
|
int* document;
|
||
|
int document_length;
|
||
|
int location;
|
||
|
};
|
||
|
|
||
|
WordKeySemantic::WordKeySemantic()
|
||
|
{
|
||
|
int nfields = WordKey::NFields();
|
||
|
document = new int[nfields];
|
||
|
document_length = 0;
|
||
|
location = -1;
|
||
|
}
|
||
|
|
||
|
WordKeySemantic::~WordKeySemantic()
|
||
|
{
|
||
|
if(document) delete [] document;
|
||
|
}
|
||
|
|
||
|
int WordKeySemantic::Initialize(int* document_arg, int document_length_arg, int location_arg)
|
||
|
{
|
||
|
memcpy((char*)document, (char*)document_arg, document_length_arg * sizeof(int));
|
||
|
document_length = document_length_arg;
|
||
|
location = location_arg;
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
void WordKeySemantic::DocumentSet(const WordKey& from, WordKey& to)
|
||
|
{
|
||
|
to.Clear();
|
||
|
for(int i = 0; i < document_length; i++)
|
||
|
to.Set(document[i], from.Get(document[i]));
|
||
|
}
|
||
|
|
||
|
int WordKeySemantic::DocumentCompare(const WordKey& a, const WordKey& b)
|
||
|
{
|
||
|
int ret = 1;
|
||
|
for(int i = 0; i < document_length; i++) {
|
||
|
int idx = document[i];
|
||
|
if((a.IsDefined(idx) && b.IsDefined(idx)) &&
|
||
|
(ret = a.Get(idx) - b.Get(idx)) != 0) return ret;
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
int WordKeySemantic::DocumentClear(WordKey& key)
|
||
|
{
|
||
|
for(int i = 0; i < document_length; i++)
|
||
|
key.Set(document[i], 0);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
void WordKeySemantic::DocumentNext(WordKey& key, int uniq)
|
||
|
{
|
||
|
if(uniq)
|
||
|
key.SetToFollowing(uniq);
|
||
|
else
|
||
|
key.SetToFollowing(document[document_length-1]);
|
||
|
}
|
||
|
|
||
|
|
||
|
void WordKeySemantic::LocationSet(const WordKey& from, WordKey& to)
|
||
|
{
|
||
|
DocumentSet(from, to);
|
||
|
to.Set(location, from.Get(location));
|
||
|
}
|
||
|
|
||
|
int WordKeySemantic::LocationCompare(const WordKey& expected, const WordKey& actual, int proximity)
|
||
|
{
|
||
|
int ret = 1;
|
||
|
if((ret = DocumentCompare(expected, actual)) != 0) return ret;
|
||
|
//
|
||
|
// Only compare location if defined.
|
||
|
//
|
||
|
if((expected.IsDefined(location) && actual.IsDefined(location)) &&
|
||
|
(ret = expected.Get(location) - actual.Get(location))) {
|
||
|
if(proximity < 0) {
|
||
|
//
|
||
|
// -N means ok if in range [-N +N]
|
||
|
//
|
||
|
proximity *= 2;
|
||
|
if(ret < 0 && ret >= proximity)
|
||
|
ret = 0;
|
||
|
} else {
|
||
|
//
|
||
|
// N means ok if in range [0 +N]
|
||
|
//
|
||
|
if(ret < 0 && ret >= -proximity)
|
||
|
ret = 0;
|
||
|
}
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
void WordKeySemantic::LocationNext(WordKey& key)
|
||
|
{
|
||
|
key.SetToFollowing(location);
|
||
|
}
|
||
|
|
||
|
void WordKeySemantic::LocationNearLowest(WordKey& key, int proximity)
|
||
|
{
|
||
|
if(proximity < 0) {
|
||
|
if(key.Underflow(location, proximity))
|
||
|
key.Get(location) = 0;
|
||
|
else
|
||
|
key.Get(location) += proximity;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void WordKeySemantic::Location2Document(WordKey& key)
|
||
|
{
|
||
|
key.Undefined(location);
|
||
|
}
|
||
|
|
||
|
// ************************* WordExclude implementation ********************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// permute bits in bit field
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordExclude.h>
|
||
|
//
|
||
|
// #define BITS 5
|
||
|
//
|
||
|
// WordExclude permute;
|
||
|
// permute.Initialize(BITS);
|
||
|
// while(permute.Next() == WORD_EXCLUDE_OK)
|
||
|
// ...
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// Count from 1 to the specified maximum. A variable++ loop does the same.
|
||
|
// The <b>WordExclude</b> class counts in a specific order.
|
||
|
// It first step thru all the permutations containing only 1 bit set, in
|
||
|
// increasing order. Then thru all the permutations containing 2 bits set,
|
||
|
// in increasing order. As so forth until the maximum number is reached.
|
||
|
// See the <b>Permute</b> method for more information.
|
||
|
//
|
||
|
//
|
||
|
// END
|
||
|
|
||
|
//
|
||
|
// Helper that displays an unsigned int in binary/hexa/decimal
|
||
|
//
|
||
|
static inline void show_bits(unsigned int result)
|
||
|
{
|
||
|
int i;
|
||
|
for(i = 0; i < 10; i++) {
|
||
|
fprintf(stderr, "%c", (result & (1 << i)) ? '1' : '0');
|
||
|
}
|
||
|
fprintf(stderr, " (0x%08x - %15d)\n", result, result);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// WordExclude methods return values
|
||
|
//
|
||
|
#define WORD_EXCLUDE_OK 1
|
||
|
#define WORD_EXCLUDE_END 2
|
||
|
|
||
|
//
|
||
|
// Maximum number of bits
|
||
|
//
|
||
|
#define WORD_EXCLUDE_MAX (sizeof(unsigned int) * 8)
|
||
|
|
||
|
//
|
||
|
// Convert a position <p> in a <l> bits mask into a bit offset (from 0)
|
||
|
//
|
||
|
#define WORD_EXCLUDE_POSITION2BIT(l,p) ((l) - (p) - 1)
|
||
|
|
||
|
class WordExclude {
|
||
|
public:
|
||
|
//-
|
||
|
// Reset the generator and prepare it for <b>length</b> bits generation.
|
||
|
// The <b>length</b> cannot be greater than <i>WORD_EXCLUDE_MAX.</i>
|
||
|
// Returns OK if no error occurs, NOTOK otherwise.
|
||
|
//
|
||
|
virtual int Initialize(unsigned int length);
|
||
|
//-
|
||
|
// Move to next exclude mask. Returns WORD_EXCLUDE_OK if successfull,
|
||
|
// WORD_EXCLUDE_END if at the end of the permutations. It starts by
|
||
|
// calling <i>Permute</i> with one bit set, then two and up to
|
||
|
// <i>Maxi()</i> included. The last permutation only generates one
|
||
|
// possibility since all the bits are set.
|
||
|
//
|
||
|
virtual int Next();
|
||
|
//-
|
||
|
// Exclude bit for <b>position</b> starts at most significant bit. That is
|
||
|
// position 0 exclude bit is most significant bit of the current mask.
|
||
|
// Returns true if position is excluded, false otherwise.
|
||
|
//
|
||
|
virtual inline unsigned int Excluded(int position) { return mask & (1 << WORD_EXCLUDE_POSITION2BIT(maxi, position)); }
|
||
|
//-
|
||
|
// Returns how many bits are not excluded with current mask.
|
||
|
//
|
||
|
virtual inline int NotExcludedCount() const { return maxi - bits; }
|
||
|
//-
|
||
|
// Returns how many bits are excluded with current mask.
|
||
|
//
|
||
|
virtual inline int ExcludedCount() const { return bits; }
|
||
|
//
|
||
|
// Save and restore in string
|
||
|
//
|
||
|
//-
|
||
|
// Write an ascii representation of the WordExclude object in <b>buffer.</b>
|
||
|
// Each bit is represented by the character 0 or 1. The most significant
|
||
|
// bit is the last character in the string. For instance
|
||
|
// 1000 is the string representation of a WordExclude object initialized
|
||
|
// with length = 4 after the first <i>Next</i> operation.
|
||
|
//
|
||
|
virtual void Get(String& buffer) const;
|
||
|
//-
|
||
|
// Initialize the object from the string representation in <b>buffer.</b>
|
||
|
// Returns OK on success, NOTOK on failure.
|
||
|
//
|
||
|
virtual int Set(const String& buffer);
|
||
|
|
||
|
//-
|
||
|
// Generate all the permutations
|
||
|
// containing <i>n</i> bits in a <b>bits</b> bit word in increasing order.
|
||
|
// The <b>mask</b> argument is originally filled by the caller
|
||
|
// with the <i>n</i> least significant bits set. A call to Permute
|
||
|
// generates the next permutation immediately greater (numerically)
|
||
|
// than the one contained in <b>mask</b>.
|
||
|
//
|
||
|
// Permute returns the next permutation or 0 if it reached the
|
||
|
// maximum.
|
||
|
//
|
||
|
// To understand the algorithm, imagine 1 is a ball and 0 a space.
|
||
|
//
|
||
|
// When playing the game you start with a rack of <b>bits</b> slots filled
|
||
|
// with <i>n</i> balls all on the left side. You end the game when all
|
||
|
// the balls are on the right side.
|
||
|
//
|
||
|
// Sarting from the left, search for the first ball that has an empty
|
||
|
// space to the right. While searching remove all the balls you find.
|
||
|
// Place a ball in the empty space you found, at the right of the last
|
||
|
// ball removed. Sarting from the left, fill all empty spaces with
|
||
|
// the removed balls. Repeat until all balls are to the right.
|
||
|
//
|
||
|
// Here is a sample generated by repeated calls to WordExclude::Permute:
|
||
|
// (left most bit is least significant)
|
||
|
// <pre>
|
||
|
// mask = 1111100000
|
||
|
// while(mask = WordExclude::Permute(mask, 7))
|
||
|
// show_bits(mask)
|
||
|
//
|
||
|
// 1111100000 (0x0000001f - 31)
|
||
|
// 1111010000 (0x0000002f - 47)
|
||
|
// 1110110000 (0x00000037 - 55)
|
||
|
// 1101110000 (0x0000003b - 59)
|
||
|
// 1011110000 (0x0000003d - 61)
|
||
|
// 0111110000 (0x0000003e - 62)
|
||
|
// 1111001000 (0x0000004f - 79)
|
||
|
// 1110101000 (0x00000057 - 87)
|
||
|
// 1101101000 (0x0000005b - 91)
|
||
|
// 1011101000 (0x0000005d - 93)
|
||
|
// 0111101000 (0x0000005e - 94)
|
||
|
// 1110011000 (0x00000067 - 103)
|
||
|
// 1101011000 (0x0000006b - 107)
|
||
|
// 1011011000 (0x0000006d - 109)
|
||
|
// 0111011000 (0x0000006e - 110)
|
||
|
// 1100111000 (0x00000073 - 115)
|
||
|
// 1010111000 (0x00000075 - 117)
|
||
|
// 0110111000 (0x00000076 - 118)
|
||
|
// 1001111000 (0x00000079 - 121)
|
||
|
// 0101111000 (0x0000007a - 122)
|
||
|
// 0011111000 (0x0000007c - 124)
|
||
|
// </pre>
|
||
|
// A recursive implementation would be:
|
||
|
// <pre>
|
||
|
// /* Recursive */
|
||
|
// void permute(unsigned int result, int bits_count, int bits_toset)
|
||
|
// {
|
||
|
// if(bits_toset <= 0 || bits_count <= 0) {
|
||
|
// if(bits_toset <= 0)
|
||
|
// do_something(result);
|
||
|
// } else {
|
||
|
// permute(result, bits_count - 1, bits_toset);
|
||
|
// permute(result | (1 << (bits_count - 1)), bits_count - 1, bits_toset - 1);
|
||
|
// }
|
||
|
// }
|
||
|
// </pre>
|
||
|
// Which is more elegant but not practical at all in our case.
|
||
|
//
|
||
|
inline unsigned int Permute(unsigned int mask, unsigned int bits);
|
||
|
|
||
|
//-
|
||
|
// Return the current bit field value.
|
||
|
//
|
||
|
virtual inline unsigned int& Mask() { return mask; }
|
||
|
virtual inline unsigned int Mask() const { return mask; }
|
||
|
|
||
|
virtual inline unsigned int& Maxi() { return maxi; }
|
||
|
virtual inline unsigned int Maxi() const { return maxi; }
|
||
|
|
||
|
virtual inline unsigned int& Bits() { return bits; }
|
||
|
virtual inline unsigned int Bits() const { return bits; }
|
||
|
|
||
|
private:
|
||
|
unsigned int mask;
|
||
|
unsigned int maxi;
|
||
|
unsigned int bits;
|
||
|
};
|
||
|
|
||
|
int WordExclude::Initialize(unsigned int length)
|
||
|
{
|
||
|
if(length > WORD_EXCLUDE_MAX) {
|
||
|
fprintf(stderr, "WordExclude::Initialize: length must be < %d\n", (int)WORD_EXCLUDE_MAX);
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
mask = 0;
|
||
|
bits = 0;
|
||
|
maxi = length;
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
inline unsigned int WordExclude::Permute(unsigned int mask, unsigned int bits)
|
||
|
{
|
||
|
unsigned int bits_cleared = 0;
|
||
|
unsigned int j;
|
||
|
for(j = 0; j < bits; j++) {
|
||
|
if(mask & (1 << j)) {
|
||
|
bits_cleared++;
|
||
|
mask &= ~(1 << j);
|
||
|
} else {
|
||
|
if(bits_cleared) {
|
||
|
bits_cleared--;
|
||
|
mask |= (1 << j);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(j >= bits)
|
||
|
return 0;
|
||
|
|
||
|
for(j = 0; j < bits_cleared; j++)
|
||
|
mask |= (1 << j);
|
||
|
|
||
|
return mask;
|
||
|
}
|
||
|
|
||
|
int WordExclude::Next()
|
||
|
{
|
||
|
mask = Permute(mask, maxi);
|
||
|
|
||
|
int ret = WORD_EXCLUDE_OK;
|
||
|
|
||
|
if(mask == 0) {
|
||
|
bits++;
|
||
|
if(bits > maxi)
|
||
|
ret = WORD_EXCLUDE_END;
|
||
|
else {
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < bits; i++)
|
||
|
mask |= (1 << i);
|
||
|
ret = WORD_EXCLUDE_OK;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(verbose > 2) show_bits(mask);
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
void WordExclude::Get(String& buffer) const
|
||
|
{
|
||
|
buffer.trunc();
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < maxi; i++) {
|
||
|
buffer << ((mask & (1 << i)) ? '1' : '0');
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int WordExclude::Set(const String& buffer)
|
||
|
{
|
||
|
if(Initialize(buffer.length()) == NOTOK)
|
||
|
return NOTOK;
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < maxi; i++) {
|
||
|
if(buffer[i] == '1') {
|
||
|
mask |= (1 << i);
|
||
|
bits++;
|
||
|
}
|
||
|
}
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
// ************************* WordExcludeMask implementation *******************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// WordExclude specialization that ignore some bits
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordExcludeMask.h>
|
||
|
//
|
||
|
// #define BITS 9
|
||
|
// #define IGNORE 0x0f0
|
||
|
// #define IGNORE_MASK 0x050
|
||
|
//
|
||
|
// WordExcludeMask permute;
|
||
|
// permute.Initialize(BITS, IGNORE, IGNORE_MASK);
|
||
|
// while(permute.Next() == WORD_EXCLUDE_OK)
|
||
|
// ...
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// Only perform WordExclude operations on the bits that are not set in
|
||
|
// <i>ignore.</i> The bits of <i>ignore_mask</i> that are set in
|
||
|
// <i>ignore</i> are untouched. In the synopsis section, for instance,
|
||
|
// bits 1,2,3,4 and 9 will be permuted and the bits 5,6,7,8 will be
|
||
|
// left untouched.
|
||
|
//
|
||
|
//
|
||
|
// END
|
||
|
//
|
||
|
#define WORD_EXCLUDE_IGNORED (-1)
|
||
|
|
||
|
class WordExcludeMask : public WordExclude {
|
||
|
public:
|
||
|
//-
|
||
|
// <b>ignore</b> gives the mask of bits to ignore. The actual WordExclude
|
||
|
// operations are made on a number of bits that is <b>length</b> - (the number
|
||
|
// of bits set in <b>ignore).</b>
|
||
|
// The <b>ignore_mask_arg</b> contains the actual values of the bits ignored by
|
||
|
// the <b>ignore</b> argument.
|
||
|
//
|
||
|
virtual inline int Initialize(unsigned int length, unsigned int ignore, unsigned int ignore_mask_arg) {
|
||
|
ignore_mask = ignore_mask_arg;
|
||
|
ignore_maxi = length;
|
||
|
unsigned int maxi = 0;
|
||
|
unsigned int i;
|
||
|
for(i = 0, ignore_bits = 0; i < length; i++) {
|
||
|
if(ignore & (1 << i)) {
|
||
|
bit2bit[i] = WORD_EXCLUDE_IGNORED;
|
||
|
if(ignore_mask & (1 << i)) ignore_bits++;
|
||
|
} else {
|
||
|
bit2bit[i] = maxi++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return WordExclude::Initialize(maxi);
|
||
|
}
|
||
|
|
||
|
virtual inline unsigned int Excluded(int position) {
|
||
|
position = WORD_EXCLUDE_POSITION2BIT(ignore_maxi, position);
|
||
|
if(bit2bit[position] == WORD_EXCLUDE_IGNORED)
|
||
|
return ignore_mask & (1 << position);
|
||
|
else
|
||
|
return WordExclude::Mask() & (1 << bit2bit[position]);
|
||
|
}
|
||
|
|
||
|
virtual inline int NotExcludedCount() const {
|
||
|
return ignore_maxi - ignore_bits - WordExclude::Bits();
|
||
|
}
|
||
|
|
||
|
virtual inline int ExcludedCount() const {
|
||
|
return ignore_bits - WordExclude::Bits();
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// The semantic is the same as the Get method of Wordexclude
|
||
|
// except that ignored bits are assigned 3 and 2 instead of 1 and 0
|
||
|
// respectively.
|
||
|
//
|
||
|
virtual void Get(String& buffer) const;
|
||
|
//-
|
||
|
// The semantic is the same as the Get method of Wordexclude
|
||
|
// except that ignored bits are assigned 3 and 2 instead of 1 and 0
|
||
|
// respectively.
|
||
|
//
|
||
|
virtual int Set(const String& buffer);
|
||
|
|
||
|
virtual inline unsigned int Mask() const {
|
||
|
unsigned int ret = ignore_mask;
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < ignore_maxi; i++) {
|
||
|
if(bit2bit[i] != WORD_EXCLUDE_IGNORED) {
|
||
|
if(WordExclude::Mask() & (1 << bit2bit[i]))
|
||
|
ret |= (1 << i);
|
||
|
}
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
virtual inline unsigned int Maxi() const { return ignore_maxi; }
|
||
|
|
||
|
virtual inline unsigned int Bits() const { return ignore_bits + WordExclude::Bits(); }
|
||
|
|
||
|
private:
|
||
|
unsigned int ignore_mask;
|
||
|
unsigned int ignore_maxi;
|
||
|
unsigned int ignore_bits;
|
||
|
int bit2bit[WORD_EXCLUDE_MAX];
|
||
|
};
|
||
|
|
||
|
void WordExcludeMask::Get(String& buffer) const
|
||
|
{
|
||
|
buffer.trunc();
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < ignore_maxi; i++) {
|
||
|
if(bit2bit[i] == WORD_EXCLUDE_IGNORED)
|
||
|
buffer << ((ignore_mask & (1 << i)) ? '3' : '2');
|
||
|
else
|
||
|
buffer << ((WordExclude::Mask() & (1 << bit2bit[i])) ? '1' : '0');
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int WordExcludeMask::Set(const String& buffer)
|
||
|
{
|
||
|
WordExclude::Initialize(0);
|
||
|
|
||
|
unsigned int& maxi = WordExclude::Maxi();
|
||
|
unsigned int& mask = WordExclude::Mask();
|
||
|
unsigned int& bits = WordExclude::Bits();
|
||
|
ignore_mask = 0;
|
||
|
ignore_bits = 0;
|
||
|
ignore_maxi = buffer.length();
|
||
|
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < ignore_maxi; i++) {
|
||
|
if(buffer[i] == '1' || buffer[i] == '0') {
|
||
|
if(buffer[i] == '1') {
|
||
|
mask |= (1 << maxi);
|
||
|
bits++;
|
||
|
}
|
||
|
bit2bit[i] = maxi;
|
||
|
maxi++;
|
||
|
} else if(buffer[i] == '3' || buffer[i] == '2') {
|
||
|
if(buffer[i] == '3') {
|
||
|
ignore_mask |= (1 << i);
|
||
|
ignore_bits++;
|
||
|
}
|
||
|
bit2bit[i] = WORD_EXCLUDE_IGNORED;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
// ************************* WordPermute implementation ********************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// WordExclude specialization with proximity toggle
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordPermute.h>
|
||
|
//
|
||
|
// #define BITS 5
|
||
|
//
|
||
|
// WordPermute permute;
|
||
|
// permute.Initialize(BITS);
|
||
|
// while(permute.Next() == WORD_EXCLUDE_OK)
|
||
|
// if(permute.UseProximity()) ...
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// Each WordExclude permutation is used twice by Next. Once with
|
||
|
// the proximity flag set and once with the proximity flag cleared.
|
||
|
// If the length of the bit field (length argument of Initialize) is
|
||
|
// lower or equal to 1, then the proximity flag is always false.
|
||
|
//
|
||
|
//
|
||
|
// END
|
||
|
//
|
||
|
// WordPermute methods return values
|
||
|
//
|
||
|
#define WORD_PERMUTE_OK WORD_EXCLUDE_OK
|
||
|
#define WORD_PERMUTE_END WORD_EXCLUDE_END
|
||
|
|
||
|
//
|
||
|
// Use or don't use proximity flag
|
||
|
//
|
||
|
#define WORD_PERMUTE_PROXIMITY_NO 0
|
||
|
#define WORD_PERMUTE_PROXIMITY_TOGGLE 1
|
||
|
#define WORD_PERMUTE_PROXIMITY_ONLY 2
|
||
|
|
||
|
//
|
||
|
// Deals with word exclusion and proximity permutations for
|
||
|
// the implementation of the Optional retrieval model.
|
||
|
//
|
||
|
class WordPermute : public WordExcludeMask {
|
||
|
public:
|
||
|
//-
|
||
|
// The <b>nuse_proximity</b> may be set to the following:
|
||
|
//
|
||
|
// WORD_PERMUTE_PROXIMITY_NO so that the object behaves as
|
||
|
// WordExcludeMask and Proximity() always return false.
|
||
|
//
|
||
|
// WORD_PERMUTE_PROXIMITY_TOGGLE so that each permutation is issued twice:
|
||
|
// once with the proximity flag set (Proximity() method) and once with
|
||
|
// the proximity flag cleared.
|
||
|
//
|
||
|
// WORD_PERMUTE_PROXIMITY_ONLY so that the object behaves as
|
||
|
// WordExcludeMask and Proximity() always return true.
|
||
|
//
|
||
|
virtual inline int Initialize(unsigned int length, unsigned int ignore, unsigned int ignore_mask_arg, int nuse_proximity) {
|
||
|
use_proximity = nuse_proximity;
|
||
|
switch(use_proximity) {
|
||
|
case WORD_PERMUTE_PROXIMITY_NO:
|
||
|
proximity = 0;
|
||
|
break;
|
||
|
case WORD_PERMUTE_PROXIMITY_TOGGLE:
|
||
|
//
|
||
|
// Don't bother to try proximity search if only one word
|
||
|
// is involved.
|
||
|
//
|
||
|
proximity = length > 1;
|
||
|
break;
|
||
|
case WORD_PERMUTE_PROXIMITY_ONLY:
|
||
|
proximity = 1;
|
||
|
break;
|
||
|
default:
|
||
|
fprintf(stderr, "WordPermute::Initialize: unexpected use_proximity = %d\n", use_proximity);
|
||
|
return 0;
|
||
|
}
|
||
|
return WordExcludeMask::Initialize(length, ignore, ignore_mask_arg);
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Return true if the proximity flag is set, false if it is
|
||
|
// cleared.
|
||
|
//
|
||
|
inline int Proximity() {
|
||
|
switch(use_proximity) {
|
||
|
case WORD_PERMUTE_PROXIMITY_NO:
|
||
|
return 0;
|
||
|
break;
|
||
|
case WORD_PERMUTE_PROXIMITY_TOGGLE:
|
||
|
return proximity;
|
||
|
break;
|
||
|
case WORD_PERMUTE_PROXIMITY_ONLY:
|
||
|
return 1;
|
||
|
break;
|
||
|
default:
|
||
|
fprintf(stderr, "WordPermute::Proximity: unexpected use_proximity = %d\n", use_proximity);
|
||
|
return 0;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Return WORD_PERMUTE_PROXIMITY_NO, WORD_PERMUTE_PROXIMITY_TOGGLE or
|
||
|
// WORD_PERMUTE_PROXIMITY_ONLY.
|
||
|
//
|
||
|
inline int UseProximity() { return use_proximity; }
|
||
|
|
||
|
//-
|
||
|
// Find the next permutation. If <b>WORD_PERMUTE_PROXIMITY_TOGGLE<b> was
|
||
|
// specified in Initialize each permutation is issued twice (see
|
||
|
// Proximity() to differentiate them), except when the mask
|
||
|
// only contains one non exluded bit (NotExcludeCount() <= 1).
|
||
|
// In both case the last permutation with all bits excluded
|
||
|
// (i.e. when NotExcludedCount() <= 0) is never returned because
|
||
|
// it is useless.
|
||
|
//
|
||
|
virtual int Next() {
|
||
|
if(Maxi() <= 0)
|
||
|
return WORD_PERMUTE_END;
|
||
|
|
||
|
int ret = WORD_PERMUTE_OK;
|
||
|
int check_useless = 0;
|
||
|
if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE) {
|
||
|
//
|
||
|
// Move to next permutation as follows:
|
||
|
// exclude mask 1 + use proximity
|
||
|
// exclude mask 1 + don't use proximity
|
||
|
// exclude mask 2 + use proximity
|
||
|
// exclude mask 2 + don't use proximity
|
||
|
// and so on.
|
||
|
// If only one word is involved never use proximity.
|
||
|
//
|
||
|
if(proximity) {
|
||
|
proximity = 0;
|
||
|
} else {
|
||
|
proximity = 1;
|
||
|
if((ret = WordExcludeMask::Next()) == WORD_PERMUTE_OK) {
|
||
|
//
|
||
|
// Do not toggle proximity for only one non excluded word
|
||
|
//
|
||
|
if(NotExcludedCount() <= 1)
|
||
|
proximity = 0;
|
||
|
check_useless = 1;
|
||
|
} else if(ret == WORD_PERMUTE_END)
|
||
|
proximity = 0;
|
||
|
}
|
||
|
} else {
|
||
|
ret = WordExcludeMask::Next();
|
||
|
check_useless = 1;
|
||
|
}
|
||
|
|
||
|
if(check_useless && ret == WORD_PERMUTE_OK) {
|
||
|
//
|
||
|
// If no bits are ignored or all ignore_mask bits are set to
|
||
|
// one, the last permutation has all exclude bits set, which
|
||
|
// is useless. Just skip it and expect to be at the end of
|
||
|
// all permutations.
|
||
|
//
|
||
|
if(NotExcludedCount() <= 0) {
|
||
|
ret = WordExcludeMask::Next();
|
||
|
if(ret != WORD_PERMUTE_END) {
|
||
|
fprintf(stderr, "WordPermute::Next: expected WORD_PERMUTE_END\n");
|
||
|
ret = NOTOK;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// The semantic is the same as the Get method of Wordexclude
|
||
|
// but a letter T is appended to the string if the proximity
|
||
|
// flag is set, or F is appended to the string if the proximity
|
||
|
// is clear.
|
||
|
//
|
||
|
virtual inline void Get(String& buffer) const {
|
||
|
WordExcludeMask::Get(buffer);
|
||
|
if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE)
|
||
|
buffer << (proximity ? 'T' : 'F');
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// The semantic is the same as the Get method of Wordexclude
|
||
|
// but if the string end with a T the proximity flag is set
|
||
|
// and if the string end with a F the proximity flag is cleared.
|
||
|
//
|
||
|
virtual inline int Set(const String& buffer) {
|
||
|
if(buffer.length() < 1) {
|
||
|
fprintf(stderr, "WordPermute::Set: buffer length < 1\n");
|
||
|
return NOTOK;
|
||
|
}
|
||
|
int ret = OK;
|
||
|
if(use_proximity == WORD_PERMUTE_PROXIMITY_TOGGLE) {
|
||
|
if((ret = WordExcludeMask::Set(buffer.sub(0, buffer.length() - 1))) == OK)
|
||
|
proximity = buffer.last() == 'T';
|
||
|
} else {
|
||
|
ret = WordExcludeMask::Set(buffer);
|
||
|
}
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
int use_proximity;
|
||
|
int proximity;
|
||
|
};
|
||
|
|
||
|
// ************************* WordTree implementation ********************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// Base class for query resolution nodes
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordTree.h>
|
||
|
//
|
||
|
// class WordTreeMethod : public WordTree {
|
||
|
// ...
|
||
|
// };
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// The WordTree class is derived from the WordCursor class and implement
|
||
|
// the basic operations and data structures needed for query resolution.
|
||
|
// It is the common base class of all the classes that actually implement
|
||
|
// a query resolution. The derived classes must be implemented to follow
|
||
|
// the WordCursor semantic for Walk* operations.
|
||
|
//
|
||
|
//
|
||
|
// END
|
||
|
//
|
||
|
|
||
|
#define WORD_WALK_REDO 0x1000
|
||
|
#define WORD_WALK_RESTART 0x2000
|
||
|
#define WORD_WALK_NEXT 0x4000
|
||
|
|
||
|
//
|
||
|
// Return values of CursorsObeyProximity method
|
||
|
//
|
||
|
#define WORD_SEARCH_NOPROXIMITY 1
|
||
|
|
||
|
//
|
||
|
// operand values
|
||
|
//
|
||
|
#define WORD_TREE_OR 1
|
||
|
#define WORD_TREE_AND 2
|
||
|
#define WORD_TREE_NEAR 3
|
||
|
#define WORD_TREE_OPTIONAL 4
|
||
|
#define WORD_TREE_LITERAL 5
|
||
|
#define WORD_TREE_MANDATORY 6
|
||
|
#define WORD_TREE_NOT 7
|
||
|
|
||
|
#define WORD_TREE_OP_SIZE 20
|
||
|
|
||
|
//
|
||
|
// Default proximity is to search for adjacent words in order
|
||
|
//
|
||
|
#ifndef WORD_SEARCH_DEFAULT_PROXIMITY
|
||
|
#define WORD_SEARCH_DEFAULT_PROXIMITY 1
|
||
|
#endif /* WORD_SEARCH_DEFAULT_PROXIMITY */
|
||
|
|
||
|
static char* operator_name[WORD_TREE_OP_SIZE] = {
|
||
|
"",
|
||
|
"or",
|
||
|
"and",
|
||
|
"near",
|
||
|
"optional",
|
||
|
"literal",
|
||
|
"mandatory",
|
||
|
"not",
|
||
|
0
|
||
|
};
|
||
|
|
||
|
class WordTree : public WordCursor {
|
||
|
public:
|
||
|
WordTree() {
|
||
|
proximity = 0;
|
||
|
uniq = 0;
|
||
|
}
|
||
|
|
||
|
virtual int ContextSaveList(StringList& list) const {
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
virtual int ContextRestoreList(StringList& list) {
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Initialize the object. <b>words</b> is used to initialize the
|
||
|
// WordCursor base class, <b>document, document_length</b> and
|
||
|
// <b>location</b> are used to initialize the WordKeySemantic data
|
||
|
// member. The <b>nuniq</b> is the WordKey field position used by
|
||
|
// the WordKeySemantic::DocumentNext function. The <b>nproximity</b>
|
||
|
// is the proximity factor used by the WordKeySemantic::LocationCompare
|
||
|
// method.
|
||
|
// Return OK on success, NOTOK on failure.
|
||
|
//
|
||
|
virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) {
|
||
|
int ret;
|
||
|
proximity = nproximity;
|
||
|
uniq = nuniq;
|
||
|
if((ret = key_semantic.Initialize(document, document_length, location)) != OK)
|
||
|
return ret;
|
||
|
WordKey key;
|
||
|
if(!scope.empty()) {
|
||
|
if(key.Set(scope) != OK) {
|
||
|
fprintf(stderr, "WordTree::Prepare: setting scope %s failed\n", (char*)scope);
|
||
|
return NOTOK;
|
||
|
}
|
||
|
}
|
||
|
key.SetWord(search);
|
||
|
return WordCursor::Initialize(words, key, 0, 0, HTDIG_WORDLIST_WALKER);
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Return a copy of the last document found.
|
||
|
//
|
||
|
WordKey GetDocument() {
|
||
|
WordKey found;
|
||
|
key_semantic.DocumentSet(GetFound().Key(), found);
|
||
|
return found;
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Store in the <i>info</i> data member textual information about
|
||
|
// the latest match found.
|
||
|
//
|
||
|
virtual void SetInfo() { info = GetFound().Key().GetWord(); }
|
||
|
|
||
|
//-
|
||
|
// Return a copy of the <i>info</i> data member. Should be
|
||
|
// called after SetInfo().
|
||
|
//
|
||
|
String GetInfo() { return info; }
|
||
|
|
||
|
//-
|
||
|
// Sort WordTree data members (if any) in ascending frequency order.
|
||
|
// Return OK on success, NOTOK on failure.
|
||
|
//
|
||
|
virtual int AscendingFrequency() { return OK; }
|
||
|
|
||
|
//-
|
||
|
// Delete WordTree data members (if any) that have a zero frequency.
|
||
|
// The number of data members deleted is returned in <b>stripped</b>.
|
||
|
// Return OK on success, NOTOK on failure.
|
||
|
//
|
||
|
virtual int StripNonExistent(unsigned int& stripped) {
|
||
|
stripped = 0;
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Input
|
||
|
//
|
||
|
//-
|
||
|
// Proximity factor. See WordKeySemantic::LocationCompare.
|
||
|
//
|
||
|
int proximity;
|
||
|
//-
|
||
|
// Uniq WordKey field position. See WordKeySemantic::DocumentNext.
|
||
|
//
|
||
|
int uniq;
|
||
|
//-
|
||
|
// Semantic of the WordKey object.
|
||
|
//
|
||
|
WordKeySemantic key_semantic;
|
||
|
//-
|
||
|
// Textual representation of the search scope.
|
||
|
//
|
||
|
String scope;
|
||
|
//-
|
||
|
// Original search criterion that may be different from the
|
||
|
// WordCursor::searchKey data member.
|
||
|
//
|
||
|
String search;
|
||
|
|
||
|
//
|
||
|
// Internal state
|
||
|
//
|
||
|
//-
|
||
|
// Textual information about the latest match.
|
||
|
//
|
||
|
String info;
|
||
|
};
|
||
|
|
||
|
// ************************* WordTreeLiteral implementation ****************
|
||
|
|
||
|
class WordTreeLiteral : public WordTree {
|
||
|
public:
|
||
|
//-
|
||
|
// Constructor. The search criterion is <b>string</b> and the
|
||
|
// scope is <b>nscope.</b>.
|
||
|
//
|
||
|
WordTreeLiteral(const char* string, const char* nscope = "") {
|
||
|
search.set((char*)string);
|
||
|
scope.set((char*)nscope);
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Returns WORD_TREE_LITERAL.
|
||
|
//
|
||
|
int IsA() const { return WORD_TREE_LITERAL; }
|
||
|
|
||
|
virtual int WalkRewind();
|
||
|
//-
|
||
|
// Only return a match for each distinct document.
|
||
|
//
|
||
|
virtual int WalkNext();
|
||
|
virtual int Seek(const WordKey& patch);
|
||
|
|
||
|
//-
|
||
|
// If scope is set the <b>bufferout</b> is filled with
|
||
|
// <pre>
|
||
|
// ( word "scope" )
|
||
|
// </pre>
|
||
|
// otherwise the <b>bufferout</b> only contains the word.
|
||
|
//
|
||
|
virtual int Get(String& bufferout) const {
|
||
|
if(scope.empty())
|
||
|
bufferout << search;
|
||
|
else
|
||
|
bufferout << "( " << operator_name[IsA()] << " \"" << scope << "\" " << search << " )";
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
WordKey current_document;
|
||
|
};
|
||
|
|
||
|
int WordTreeLiteral::WalkRewind()
|
||
|
{
|
||
|
current_document.Clear();
|
||
|
return WordCursor::WalkRewind();
|
||
|
}
|
||
|
|
||
|
int WordTreeLiteral::WalkNext()
|
||
|
{
|
||
|
int ret;
|
||
|
do {
|
||
|
ret = WordCursor::WalkNext();
|
||
|
if(verbose > 3) fprintf(stderr, "WordTreeLiteral::WalkNext: reached %s\n", (char*)GetDocument().Get());
|
||
|
} while(ret == OK &&
|
||
|
key_semantic.DocumentCompare(current_document, GetDocument()) == 0);
|
||
|
|
||
|
if(ret == OK)
|
||
|
current_document = GetDocument();
|
||
|
else
|
||
|
current_document.Clear();
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
int WordTreeLiteral::Seek(const WordKey& position)
|
||
|
{
|
||
|
current_document.Clear();
|
||
|
return WordCursor::Seek(position);
|
||
|
}
|
||
|
|
||
|
// ************************* WordTreeOperand implementation ****************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// Base class for boolean query resolution nodes
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordTree.h>
|
||
|
//
|
||
|
// class WordTreeMethod : public WordTreeOperand {
|
||
|
// ...
|
||
|
// };
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// The WordTreeOperand class is derived from WordTree and implemet
|
||
|
// the basic operations and data structures needed for query resultion
|
||
|
// of boolean operators. It contains a list of WordTree objects (the
|
||
|
// operands or cursors) and redefine the basic WordCursor methods
|
||
|
// to operate on all of them according to the logic defined by the
|
||
|
// derived class.
|
||
|
//
|
||
|
//
|
||
|
// END
|
||
|
//
|
||
|
|
||
|
//
|
||
|
// Helper for debugging that returns the string representation
|
||
|
// of the return codes.
|
||
|
//
|
||
|
static char* ret2str(int ret)
|
||
|
{
|
||
|
if(ret == WORD_WALK_REDO)
|
||
|
return "REDO";
|
||
|
|
||
|
if(ret == WORD_WALK_RESTART)
|
||
|
return "RESTART";
|
||
|
|
||
|
if(ret == WORD_WALK_NEXT)
|
||
|
return "NEXT";
|
||
|
|
||
|
if(ret == OK)
|
||
|
return "OK";
|
||
|
|
||
|
if(ret == NOTOK)
|
||
|
return "NOTOK";
|
||
|
|
||
|
if(ret == WORD_WALK_ATEND)
|
||
|
return "ATEND";
|
||
|
|
||
|
return "???";
|
||
|
}
|
||
|
|
||
|
class WordTreeOperand : public WordTree
|
||
|
{
|
||
|
public:
|
||
|
//-
|
||
|
// Constructor. The scope is <b>nscope</b>.
|
||
|
//
|
||
|
WordTreeOperand(const char* nscope) {
|
||
|
scope.set((char*)nscope);
|
||
|
}
|
||
|
//-
|
||
|
// Free the objects pointed by <i>cursors</i> with delete as well
|
||
|
// as the <i>cursors</i> array itself with delete [].
|
||
|
//
|
||
|
virtual ~WordTreeOperand();
|
||
|
|
||
|
virtual void Clear() {
|
||
|
cursors = 0;
|
||
|
cursors_length = 0;
|
||
|
WordCursor::Clear();
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Recursively call Optimize on each <i>cursors</i>.
|
||
|
//
|
||
|
virtual int Optimize();
|
||
|
|
||
|
//-
|
||
|
// Change the <i>permutation</i> data member ignore mask according
|
||
|
// to WORD_TREE_MANDATORY and WORD_TREE_NOT nodes found in
|
||
|
// <i>cursors</i>. MANDATORY and NOT nodes are reduced (replaced
|
||
|
// by their first child cursor. For each MANDATORY and NOT nodes
|
||
|
// the bit (see WordExcludeMask for information)
|
||
|
// corresponding to their position is ignored (set in the <b>ignore</b>
|
||
|
// argument of the WordExcludeMask::Initialize function. For NOT
|
||
|
// nodes, the bit corresponding to their position is set in
|
||
|
// the <b>ignore_mask</b> of the WordExcludeMask::Initialize function
|
||
|
// (i.e. implementing a <i>not</i> operation).
|
||
|
// The <b>proximity</b> argument may be WORD_PERMUTE_PROXIMITY_TOGGLE or
|
||
|
// WORD_PERMUTE_PROXIMITY_NO.
|
||
|
// Returns OK on success, NOTOK on failure.
|
||
|
//
|
||
|
int OptimizeOr(int proximity);
|
||
|
|
||
|
virtual int ContextSave(String& buffer) const {
|
||
|
StringList list;
|
||
|
int ret;
|
||
|
if((ret = ContextSaveList(list)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
buffer.trunc();
|
||
|
String* element;
|
||
|
list.Start_Get();
|
||
|
while((element = (String*)list.Get_Next())) {
|
||
|
buffer << (*element) << ';';
|
||
|
}
|
||
|
//
|
||
|
// Trim last ;
|
||
|
//
|
||
|
buffer.chop(1);
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
virtual int ContextSaveList(StringList& list) const {
|
||
|
//
|
||
|
// Apply to each cursor
|
||
|
//
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
if(cursors[i]->ContextSaveList(list) == NOTOK)
|
||
|
return NOTOK;
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
virtual int ContextRestore(const String& buffer) {
|
||
|
if(!buffer.empty()) {
|
||
|
StringList list(buffer, ";");
|
||
|
return ContextRestoreList(list);
|
||
|
} else {
|
||
|
return OK;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
virtual int ContextRestoreList(StringList& list) {
|
||
|
//
|
||
|
// Apply to each cursor
|
||
|
//
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
if(cursors[i]->ContextRestoreList(list) == NOTOK)
|
||
|
return NOTOK;
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Recursively call WalkInit on each <i>cursors</i>.
|
||
|
//
|
||
|
virtual int WalkInit();
|
||
|
//-
|
||
|
// Recursively call WalkRewind on each <i>cursors</i>.
|
||
|
// Reset the <i>pos</i> data member with WordKeySemantic::DocumentClear.
|
||
|
//
|
||
|
virtual int WalkRewind();
|
||
|
//-
|
||
|
// Recursively call WalkFinish on each <i>cursors</i>.
|
||
|
//
|
||
|
virtual int WalkFinish();
|
||
|
//-
|
||
|
// Recursively call Seek on each <i>cursors</i>.
|
||
|
// Save the <b>patch</b> argument in the <i>pos</i> data
|
||
|
// member.
|
||
|
//
|
||
|
virtual int Seek(const WordKey& patch);
|
||
|
|
||
|
//-
|
||
|
// The number of occurrence of a WordTreeOperand is the sum of the
|
||
|
// number of occurrence of each term.
|
||
|
//
|
||
|
virtual int Noccurrence(unsigned int& noccurrence) const {
|
||
|
noccurrence = 0;
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
unsigned int frequency;
|
||
|
if(cursors[i]->Noccurrence(frequency) != OK)
|
||
|
return NOTOK;
|
||
|
noccurrence += frequency;
|
||
|
}
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// The <b>bufferout</b> argument is filled with a lisp like representation
|
||
|
// of the tree starting at this node.
|
||
|
//
|
||
|
virtual int Get(String& bufferout) const {
|
||
|
bufferout << "( " << operator_name[IsA()] << " \"" << scope << "\" ";
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
bufferout << cursors[i]->Get() << " ";
|
||
|
bufferout << " )";
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// Call Prepare on each <i>cursors</i>. Set the <i>search</i> member
|
||
|
// with an textual representation of the tree starting at this node.
|
||
|
//
|
||
|
virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) {
|
||
|
int ret;
|
||
|
if((ret = WordTree::Prepare(words, nuniq, nproximity, document, document_length, location)) != OK)
|
||
|
return ret;
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
if((ret = cursors[i]->Prepare(words, nuniq, nproximity, document, document_length, location)) != OK)
|
||
|
return ret;
|
||
|
}
|
||
|
return Get(GetSearch().GetWord());
|
||
|
}
|
||
|
|
||
|
//-
|
||
|
// The current cursor offset (set by Seek for instance). It
|
||
|
// duplicates the function of the WordCursor <i>key</i> data member
|
||
|
// because the data type is different (WordKey instead of String).
|
||
|
//
|
||
|
WordKey pos;
|
||
|
//-
|
||
|
// Sub nodes array.
|
||
|
//
|
||
|
WordTree** cursors;
|
||
|
//-
|
||
|
// Number of valid entries in the <i>cursors</i> member.
|
||
|
//
|
||
|
unsigned int cursors_length;
|
||
|
//-
|
||
|
// Permutation generator with proximity toggle
|
||
|
//
|
||
|
WordPermute permutation;
|
||
|
};
|
||
|
|
||
|
WordTreeOperand::~WordTreeOperand()
|
||
|
{
|
||
|
if(cursors) {
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
delete cursors[i];
|
||
|
free(cursors);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int
|
||
|
WordTreeOperand::Optimize()
|
||
|
{
|
||
|
//
|
||
|
// Apply to each cursor
|
||
|
//
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
if(cursors[i]->Optimize() == NOTOK)
|
||
|
return NOTOK;
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOperand::OptimizeOr(int proximity)
|
||
|
{
|
||
|
unsigned int ignore = 0;
|
||
|
unsigned int ignore_mask = 0;
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
int reduce;
|
||
|
//
|
||
|
// Set ignore & ignore_mask if cursor is NOT or MANDATORY
|
||
|
//
|
||
|
switch(cursors[i]->IsA()) {
|
||
|
case WORD_TREE_MANDATORY:
|
||
|
ignore |= (1 << WORD_EXCLUDE_POSITION2BIT(cursors_length, i));
|
||
|
reduce = 1;
|
||
|
break;
|
||
|
case WORD_TREE_NOT:
|
||
|
ignore |= (1 << WORD_EXCLUDE_POSITION2BIT(cursors_length, i));
|
||
|
ignore_mask |= (1 << WORD_EXCLUDE_POSITION2BIT(cursors_length, i));
|
||
|
reduce = 1;
|
||
|
break;
|
||
|
default:
|
||
|
reduce = 0;
|
||
|
break;
|
||
|
}
|
||
|
//
|
||
|
// Replace the NOT or MANDATORY node by its only child
|
||
|
//
|
||
|
if(reduce) {
|
||
|
WordTreeOperand* old = (WordTreeOperand*)cursors[i];
|
||
|
cursors[i] = old->cursors[0];
|
||
|
old->cursors[0] = 0;
|
||
|
old->cursors_length--;
|
||
|
if(old->cursors_length > 0) {
|
||
|
fprintf(stderr, "WordTreeOptional::OptimizeOr: too many cursors\n");
|
||
|
return NOTOK;
|
||
|
}
|
||
|
delete old;
|
||
|
}
|
||
|
}
|
||
|
return permutation.Initialize(cursors_length, ignore, ignore_mask, proximity);
|
||
|
}
|
||
|
|
||
|
int
|
||
|
WordTreeOperand::WalkInit()
|
||
|
{
|
||
|
unsigned int i;
|
||
|
int ret = WORD_WALK_ATEND;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
if((ret = cursors[i]->WalkInit()) != OK)
|
||
|
return ret;
|
||
|
return (status = ret);
|
||
|
}
|
||
|
|
||
|
int
|
||
|
WordTreeOperand::WalkRewind()
|
||
|
{
|
||
|
unsigned int i;
|
||
|
int ret = OK;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
if((ret = cursors[i]->WalkRewind()) != OK)
|
||
|
return ret;
|
||
|
status = OK;
|
||
|
key_semantic.DocumentClear(pos);
|
||
|
cursor_get_flags = DB_SET_RANGE;
|
||
|
found.Clear();
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
WordTreeOperand::WalkFinish()
|
||
|
{
|
||
|
unsigned int i;
|
||
|
int ret = OK;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
if((ret = cursors[i]->WalkFinish()) != OK)
|
||
|
return ret;
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
WordTreeOperand::Seek(const WordKey& patch)
|
||
|
{
|
||
|
pos.CopyFrom(patch);
|
||
|
cursor_get_flags = DB_SET_RANGE;
|
||
|
|
||
|
unsigned int i;
|
||
|
int ret = OK;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
if((ret = cursors[i]->Seek(patch)) != OK &&
|
||
|
ret != WORD_WALK_ATEND)
|
||
|
return ret;
|
||
|
status = OK;
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
// ************************* WordTreeOptional implementation ****************
|
||
|
|
||
|
class WordTreeOptional : public WordTreeOperand {
|
||
|
public:
|
||
|
WordTreeOptional(const char* nscope) : WordTreeOperand(nscope) { }
|
||
|
|
||
|
//-
|
||
|
// Return WORD_TREE_OPTIONAL
|
||
|
//
|
||
|
virtual int IsA() const { return WORD_TREE_OPTIONAL; }
|
||
|
|
||
|
virtual int Optimize();
|
||
|
|
||
|
virtual int ContextSaveList(StringList& list) const;
|
||
|
|
||
|
virtual int ContextRestoreList(StringList& list);
|
||
|
|
||
|
//-
|
||
|
// Multipass walk of the occurrences according to the <i>permutation</i>
|
||
|
// data member specifications. First search for documents containing
|
||
|
// all occurrences near to each other. Then documents that
|
||
|
// contain all occurrences far appart. Then ignore the most frequent
|
||
|
// search criterion and search for documents that contain all the others
|
||
|
// near to each other. The logic goes on until there only remains the
|
||
|
// most frequent word.
|
||
|
//
|
||
|
virtual int WalkNext();
|
||
|
//-
|
||
|
// Only seek the first non excluded cursor. The implementation
|
||
|
// of WalkNext makes it useless to seek the others.
|
||
|
//
|
||
|
virtual int Seek(const WordKey& position);
|
||
|
|
||
|
virtual int Prepare(WordList *words, int nuniq, int nproximity, int *document, int document_length, int location) {
|
||
|
int ret;
|
||
|
if((ret = permutation.Initialize(cursors_length, 0, 0, WORD_PERMUTE_PROXIMITY_TOGGLE)) != OK)
|
||
|
return ret;
|
||
|
return WordTreeOperand::Prepare(words, nuniq, nproximity, document, document_length, location);
|
||
|
}
|
||
|
|
||
|
virtual void SetInfo();
|
||
|
|
||
|
virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_TOGGLE; }
|
||
|
|
||
|
virtual int UsePermutation() const { return 1; }
|
||
|
|
||
|
//-
|
||
|
// Returns true if all cursors must have a frequency > 0, false otherwise.
|
||
|
//
|
||
|
virtual int AllOrNothing() const { return 0; }
|
||
|
|
||
|
//-
|
||
|
// Comparison between <b>cursor</b> and <b>constraint</b> is made
|
||
|
// with WordKeySemantic::LocationCompare using the <b>proximity</b>
|
||
|
// argument. If <b>master</b> is NULL it is set to point to <b>
|
||
|
// <b>cursor</b>.
|
||
|
//
|
||
|
// Return WORD_WALK_NEXT if <b>cursor</b> is at <b>constraint</b> and
|
||
|
// set <b>constraint</b> if <b>cursor</b> is <b>master</b>.
|
||
|
//
|
||
|
// Return WORD_WALK_REDO if <b>cursor</b> is above <b>constraint</b> and
|
||
|
// call cursor.WalkNext().
|
||
|
//
|
||
|
// Return WORD_WALK_RESTART if <b>cursor</b> is below <b>constraint</b> and
|
||
|
// set <b>constraint</b> from <b>cursor</b> using
|
||
|
// WordKeySemantic::DocumentSet if <b>cursor</b> is not <b>master</b>
|
||
|
// otherwise also set location of <b>constraint</b> using
|
||
|
// WordKeySemantic::LocationSet and call WordKeySemantic::LocationNext
|
||
|
// on <b>constraint.</b>
|
||
|
//
|
||
|
// Return WORD_WALK_ATEND if no more match possible.
|
||
|
//
|
||
|
// Return NOTOK on failure.
|
||
|
//
|
||
|
int SearchCursorNear(WordTree& cursor, WordTree*& master, WordKey& constraint, int proximity);
|
||
|
//-
|
||
|
// Comparison between <b>cursor</b> and <b>document</b> is made
|
||
|
// with WordKeySemantic::DocumentCompare.
|
||
|
//
|
||
|
// Return WORD_WALK_NEXT if <b>cursor</b> is above <b>document.</b>
|
||
|
//
|
||
|
// Return WORD_WALK_REDO if <b>cursor</b> is below <b>document</b>
|
||
|
// and call cursor.WalkNext().
|
||
|
//
|
||
|
// Return WORD_WALK_RESTART if <b>cursor</b> is at <b>document</b>
|
||
|
// and call WordKeySemantic::DocumentNext method on <b>document.</b>
|
||
|
//
|
||
|
// Return WORD_WALK_ATEND if no more match possible.
|
||
|
//
|
||
|
// Return NOTOK on failure.
|
||
|
//
|
||
|
int SearchCursorNot(WordTree& cursor, WordKey& document);
|
||
|
//-
|
||
|
// Comparison between <b>cursor</b> and <b>document</b> is made
|
||
|
// with WordKeySemantic::DocumentCompare.
|
||
|
//
|
||
|
// Return WORD_WALK_NEXT if <b>cursor</b> is at <b>document.</b>.
|
||
|
//
|
||
|
// Return WORD_WALK_REDO if <b>cursor</b> is below <b>document</b>
|
||
|
//
|
||
|
// Return WORD_WALK_RESTART if <b>cursor</b> is above <b>document</b>
|
||
|
// and call WordKeySemantic::DocumentNext method on <b>document.</b>
|
||
|
//
|
||
|
// Return WORD_WALK_ATEND if no more match possible.
|
||
|
//
|
||
|
// Return NOTOK on failure.
|
||
|
//
|
||
|
//
|
||
|
int SearchCursorAnd(WordTree& cursor, WordKey& document, WordExclude& permutation);
|
||
|
//
|
||
|
// We know that :
|
||
|
// 1) document does not contain any excluded words.
|
||
|
// 2) contains at least one occurrence of each non excluded word.
|
||
|
// The logic, although very similar to WordSearchNear::SearchOne
|
||
|
// is therefore simpler. We ignore all excluded cursors and
|
||
|
// return WORD_SEARCH_NOPROXIMITY as soon as a cursor move outside
|
||
|
// <document>.
|
||
|
//
|
||
|
//-
|
||
|
// If <b>document</b> contains words that match proximity
|
||
|
// requirement, return OK. Return WORD_SEARCH_NOPROXIMITY if proximity
|
||
|
// requirement cannot be matched for <document>.
|
||
|
//
|
||
|
int CursorsObeyProximity(WordKey& document);
|
||
|
|
||
|
//-
|
||
|
// Sort the <i>cursors</i> in ascending frequency order using the
|
||
|
// Noccurrence method on each cursor.
|
||
|
// Return OK on success, NOTOK on failure.
|
||
|
//
|
||
|
virtual int AscendingFrequency();
|
||
|
//-
|
||
|
// Delete all elements of the <i>cursors</i> array that have a
|
||
|
// zero frequency. The <i>cursors</i> array is shrinked and the
|
||
|
// <i>cursors_length</i> set accordingly. Returns the number of
|
||
|
// deletions in the <b>stripped</i> argument.
|
||
|
// Return OK on success, NOTOK on failure.
|
||
|
//
|
||
|
virtual int StripNonExistent(unsigned int& stripped);
|
||
|
};
|
||
|
|
||
|
int WordTreeOptional::Optimize()
|
||
|
{
|
||
|
int ret;
|
||
|
if((ret = WordTreeOperand::Optimize()) != OK)
|
||
|
return ret;
|
||
|
|
||
|
if(UseProximity() != WORD_PERMUTE_PROXIMITY_ONLY) {
|
||
|
if((ret = AscendingFrequency()) != OK)
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
unsigned int stripped;
|
||
|
if((ret = StripNonExistent(stripped)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
if(AllOrNothing() && stripped) {
|
||
|
//
|
||
|
// One word is missing and everything is lost,
|
||
|
// Just kill the remaining cursors.
|
||
|
//
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
delete cursors[i];
|
||
|
cursors_length = 0;
|
||
|
|
||
|
return OK;
|
||
|
} else {
|
||
|
return OptimizeOr(UseProximity());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::ContextSaveList(StringList& list) const
|
||
|
{
|
||
|
int ret;
|
||
|
if((ret = WordTreeOperand::ContextSaveList(list)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
if(UsePermutation()) {
|
||
|
String* buffer = new String();
|
||
|
permutation.Get(*buffer);
|
||
|
|
||
|
list.Add(buffer);
|
||
|
}
|
||
|
|
||
|
{
|
||
|
String* buffer = new String();
|
||
|
if((ret = WordCursor::ContextSave(*buffer)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
list.Add(buffer);
|
||
|
}
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::ContextRestoreList(StringList& list)
|
||
|
{
|
||
|
int ret;
|
||
|
if((ret = WordTreeOperand::ContextRestoreList(list)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
if(UsePermutation()) {
|
||
|
char* buffer = list[0];
|
||
|
if((ret = permutation.Set(buffer)) != OK)
|
||
|
return ret;
|
||
|
list.Remove(0);
|
||
|
}
|
||
|
|
||
|
{
|
||
|
char* buffer = list[0];
|
||
|
if(!buffer) return NOTOK;
|
||
|
WordKey key(buffer);
|
||
|
if((ret = Seek(key)) != OK)
|
||
|
return ret;
|
||
|
cursor_get_flags = DB_NEXT;
|
||
|
|
||
|
list.Remove(0);
|
||
|
}
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::WalkNext()
|
||
|
{
|
||
|
WordKey& constraint = pos;
|
||
|
//
|
||
|
// Set constraint with all 0
|
||
|
//
|
||
|
if(constraint.Empty())
|
||
|
key_semantic.DocumentClear(constraint);
|
||
|
|
||
|
//
|
||
|
// Advance cursors to next constraint, if not at the
|
||
|
// beginning of the search.
|
||
|
//
|
||
|
int ret = OK;
|
||
|
int match_ok = 0;
|
||
|
do {
|
||
|
//
|
||
|
// Advance cursors so that next call fetches another constraint
|
||
|
//
|
||
|
if(cursor_get_flags == DB_NEXT)
|
||
|
key_semantic.DocumentNext(constraint, uniq);
|
||
|
|
||
|
if((ret = Seek(constraint)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
int near = permutation.Proximity();
|
||
|
WordTree* first = 0;
|
||
|
for(unsigned int i = 0; i < cursors_length;) {
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
near = permutation.Proximity();
|
||
|
int excluded = permutation.Excluded(i);
|
||
|
if(verbose) fprintf(stderr, "WordTreeOptional::WalkNext: %s excluded = %s, proximity = %s\n", (char*)cursor.GetSearch().GetWord(), (excluded ? "yes" : "no"), (near ? "yes" : "no" ));
|
||
|
|
||
|
int ret;
|
||
|
if(excluded) {
|
||
|
ret = SearchCursorNot(cursor, constraint);
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: Not -> %s\n", ret2str(ret));
|
||
|
} else {
|
||
|
if(near) {
|
||
|
ret = SearchCursorNear(cursor, first, constraint, proximity);
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: Near -> %s\n", ret2str(ret));
|
||
|
} else {
|
||
|
ret = SearchCursorAnd(cursor, constraint, permutation);
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::WalkNext: And -> %s\n", ret2str(ret));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
switch(ret) {
|
||
|
case WORD_WALK_ATEND:
|
||
|
if(UsePermutation()) {
|
||
|
//
|
||
|
// The search is over with this permutation, try another one.
|
||
|
//
|
||
|
switch(permutation.Next()) {
|
||
|
//
|
||
|
// No permutations left, the end
|
||
|
//
|
||
|
case WORD_PERMUTE_END:
|
||
|
return (status = WORD_WALK_ATEND);
|
||
|
break;
|
||
|
|
||
|
//
|
||
|
// Sart over with this permutation
|
||
|
//
|
||
|
case WORD_PERMUTE_OK:
|
||
|
if(WalkRewind() != OK)
|
||
|
return NOTOK;
|
||
|
break;
|
||
|
}
|
||
|
first = 0;
|
||
|
i = 0;
|
||
|
} else {
|
||
|
return (status = WORD_WALK_ATEND);
|
||
|
}
|
||
|
break;
|
||
|
case WORD_WALK_REDO:
|
||
|
break;
|
||
|
case WORD_WALK_RESTART:
|
||
|
first = 0;
|
||
|
i = 0;
|
||
|
break;
|
||
|
case WORD_WALK_NEXT:
|
||
|
i++;
|
||
|
break;
|
||
|
case NOTOK:
|
||
|
default:
|
||
|
return ret;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
cursor_get_flags = DB_NEXT;
|
||
|
|
||
|
SetInfo();
|
||
|
|
||
|
//
|
||
|
// Save possible result, i.e. first non excluded cursor
|
||
|
//
|
||
|
for(unsigned int i = 0; i < cursors_length; i++) {
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
if(!permutation.Excluded(i)) {
|
||
|
found.Key().CopyFrom(cursor.GetFound().Key());
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
match_ok = 1;
|
||
|
//
|
||
|
// Only bother if near and non near search are involved
|
||
|
//
|
||
|
if(UseProximity() == WORD_PERMUTE_PROXIMITY_TOGGLE) {
|
||
|
//
|
||
|
// If we reach this point in the function and
|
||
|
// either proximity search is active or there is
|
||
|
// only one word involved, the match is valid.
|
||
|
// Otherwise it may be excluded, see below.
|
||
|
//
|
||
|
if(!near && permutation.NotExcludedCount() > 1) {
|
||
|
//
|
||
|
// If not using proximity, a match that fits the proximity
|
||
|
// requirements must be skipped because it was matched by
|
||
|
// the previous permutation (see WordPermute).
|
||
|
//
|
||
|
switch(CursorsObeyProximity(constraint)) {
|
||
|
case OK:
|
||
|
match_ok = 0;
|
||
|
break;
|
||
|
case WORD_SEARCH_NOPROXIMITY:
|
||
|
match_ok = 1;
|
||
|
break;
|
||
|
default:
|
||
|
case NOTOK:
|
||
|
return NOTOK;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} while(!match_ok && ret == OK);
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::Seek(const WordKey& position)
|
||
|
{
|
||
|
pos.CopyFrom(position);
|
||
|
cursor_get_flags = DB_SET_RANGE;
|
||
|
status = OK;
|
||
|
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
if(!permutation.Excluded(i)) {
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
return cursor.Seek(position);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fprintf(stderr, "WordTreeOptional::Seek: failed\n");
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
|
||
|
void WordTreeOptional::SetInfo()
|
||
|
{
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
cursors[i]->SetInfo();
|
||
|
|
||
|
info.trunc();
|
||
|
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
|
||
|
if(!permutation.Excluded(i))
|
||
|
info << cursor.info << " ";
|
||
|
}
|
||
|
|
||
|
info << (permutation.Proximity() ? "proximity" : "");
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::SearchCursorNear(WordTree& cursor, WordTree*& master, WordKey& constraint, int proximity)
|
||
|
{
|
||
|
int is_master = master == 0 || master == &cursor;
|
||
|
if(master == 0) master = &cursor;
|
||
|
const WordKey& masterKey = master->GetFound().Key();
|
||
|
|
||
|
int direction = key_semantic.LocationCompare(constraint, cursor.GetFound().Key(), proximity);
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorNear: LocationCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(constraint.Get()), (char*)(cursor.GetFound().Key().Get()), direction);
|
||
|
|
||
|
//
|
||
|
// If the cursor is in the authorized locations, consider
|
||
|
// next cursor
|
||
|
//
|
||
|
if(direction == 0) {
|
||
|
//
|
||
|
// master cursor makes the rules for location : its location
|
||
|
// is the base to calculate other words mandatory loacations.
|
||
|
//
|
||
|
if(is_master)
|
||
|
key_semantic.LocationSet(cursor.GetFound().Key(), constraint);
|
||
|
//
|
||
|
// Fix location constraint to accomodate proximity tolerance.
|
||
|
//
|
||
|
key_semantic.LocationNearLowest(constraint, proximity);
|
||
|
return WORD_WALK_NEXT;
|
||
|
|
||
|
//
|
||
|
// If current location is above cursor location
|
||
|
//
|
||
|
} else if(direction > 0) {
|
||
|
//
|
||
|
// Move the cursor up to the location.
|
||
|
//
|
||
|
cursor.Seek(constraint);
|
||
|
if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorNear: leap to %s\n", (char*)constraint.Get());
|
||
|
int ret;
|
||
|
if((ret = cursor.WalkNext()) == OK) {
|
||
|
//
|
||
|
// Remove the location constraint for the master word
|
||
|
// so that it matches and then enforce location for other
|
||
|
// keys.
|
||
|
//
|
||
|
if(is_master)
|
||
|
key_semantic.Location2Document(constraint);
|
||
|
//
|
||
|
// Reconsider the situation for this cursor
|
||
|
//
|
||
|
return WORD_WALK_REDO;
|
||
|
} else {
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// If current location is lower than cursor location,
|
||
|
// meaning that the cursor found no match for the current
|
||
|
// location.
|
||
|
//
|
||
|
} else if(direction < 0) {
|
||
|
//
|
||
|
// The cursor document becomes the current document.
|
||
|
// The master cursor is forced to catch up.
|
||
|
//
|
||
|
key_semantic.DocumentSet(cursor.GetDocument(), constraint);
|
||
|
//
|
||
|
// It is possible that this cursor document is the same
|
||
|
// as the master cursor document (if this cursor hit in the
|
||
|
// same document but a higher location). In this case we must
|
||
|
// increase the location of the master cursor otherwise it will
|
||
|
// match without moving and loop forever.
|
||
|
//
|
||
|
if(!is_master && key_semantic.DocumentCompare(masterKey, constraint) == 0) {
|
||
|
key_semantic.LocationSet(masterKey, constraint);
|
||
|
key_semantic.LocationNext(constraint);
|
||
|
}
|
||
|
//
|
||
|
// Since the current location changed, start over.
|
||
|
//
|
||
|
return WORD_WALK_RESTART;
|
||
|
} else {
|
||
|
fprintf(stderr, "WordTreeOptional::WordCursorNear: reached unreachable statement\n");
|
||
|
return NOTOK;
|
||
|
}
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::SearchCursorNot(WordTree& cursor, WordKey& document)
|
||
|
{
|
||
|
int direction = key_semantic.DocumentCompare(document, cursor.GetFound().Key());
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorNot: DocumentCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(document.Get()), (char*)(cursor.GetFound().Key().Get()), direction);
|
||
|
|
||
|
//
|
||
|
// If the cursor is above the current document
|
||
|
// (being at the end of walk is being above all documents).
|
||
|
//
|
||
|
// Means that the cursor is positioned in an acceptable document
|
||
|
// and proceed to the next cursor.
|
||
|
//
|
||
|
if(direction < 0 || cursor.IsAtEnd()) {
|
||
|
return WORD_WALK_NEXT;
|
||
|
|
||
|
//
|
||
|
// If the cursor is below current document
|
||
|
//
|
||
|
} else if(direction > 0) {
|
||
|
//
|
||
|
// Move the cursor up to the document
|
||
|
//
|
||
|
cursor.Seek(document);
|
||
|
if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorNot: leap to %s\n", (char*)document.Get());
|
||
|
int ret;
|
||
|
if((ret = cursor.WalkNext()) != OK && ret != WORD_WALK_ATEND)
|
||
|
return NOTOK;
|
||
|
//
|
||
|
// It is expected in this case that the cursor has moved above
|
||
|
// the current document and another visit in the loop will
|
||
|
// tell us.
|
||
|
//
|
||
|
return WORD_WALK_REDO;
|
||
|
|
||
|
//
|
||
|
// If the cursor matches the current document.
|
||
|
//
|
||
|
// Means that the current document is not a possible match
|
||
|
// since it is pointed by this cursor.
|
||
|
//
|
||
|
} else if(direction == 0) {
|
||
|
//
|
||
|
// The cursor does not give any hint on a possible
|
||
|
// next document, just go to the next possible one.
|
||
|
//
|
||
|
key_semantic.DocumentNext(document, uniq);
|
||
|
//
|
||
|
// Since the current document changed, start over.
|
||
|
//
|
||
|
return WORD_WALK_RESTART;
|
||
|
} else {
|
||
|
fprintf(stderr, "WordTreeOptional::WordCursorNot: reached unreachable statement\n");
|
||
|
return NOTOK;
|
||
|
}
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::SearchCursorAnd(WordTree& cursor, WordKey& document, WordExclude& permutation)
|
||
|
{
|
||
|
int direction = key_semantic.DocumentCompare(document, cursor.GetFound().Key());
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::SearchCursorAnd: DocumentCompare(\n\t%s,\n\t%s)\n\t = %d\n", (char*)(document.Get()), (char*)(cursor.GetFound().Key().Get()), direction);
|
||
|
|
||
|
//
|
||
|
// If the cursor is in the current document.
|
||
|
//
|
||
|
// Means that the cursor is positioned in an acceptable document
|
||
|
// and proceed to the next cursor.
|
||
|
//
|
||
|
if(direction == 0) {
|
||
|
return WORD_WALK_NEXT;
|
||
|
|
||
|
//
|
||
|
// If the cursor is below current document
|
||
|
//
|
||
|
} else if(direction > 0) {
|
||
|
//
|
||
|
// Move the cursor up to the document
|
||
|
//
|
||
|
cursor.Seek(document);
|
||
|
if(verbose > 1) fprintf(stderr, "WordTreeOptional::SearchCursorAnd: leap to %s\n", (char*)document.Get());
|
||
|
int ret;
|
||
|
if((ret = cursor.WalkNext()) == OK)
|
||
|
return WORD_WALK_REDO;
|
||
|
else
|
||
|
return ret;
|
||
|
|
||
|
//
|
||
|
// If the cursor is above current document.
|
||
|
//
|
||
|
// Means the the current document is not a possible match
|
||
|
// since it will never reach it because it's already
|
||
|
// above it.
|
||
|
//
|
||
|
} else if(direction < 0) {
|
||
|
//
|
||
|
// The cursor document becomes the current document.
|
||
|
//
|
||
|
key_semantic.DocumentSet(cursor.GetDocument(), document);
|
||
|
|
||
|
//
|
||
|
// Since the current document changed, start over.
|
||
|
//
|
||
|
return WORD_WALK_RESTART;
|
||
|
} else {
|
||
|
fprintf(stderr, "WordTreeOptional::WordCursorAnd: reached unreachable statement\n");
|
||
|
return NOTOK;
|
||
|
}
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::CursorsObeyProximity(WordKey& document)
|
||
|
{
|
||
|
//
|
||
|
// Run if more than one word is involved, proximity
|
||
|
// is always true if there is only one word.
|
||
|
//
|
||
|
if(permutation.NotExcludedCount() <= 1) return OK;
|
||
|
|
||
|
WordKey location;
|
||
|
|
||
|
//
|
||
|
// The first non excluded cursor contains anchor location.
|
||
|
//
|
||
|
unsigned int master_index = 0;
|
||
|
for(unsigned int i = 0; i < cursors_length; i++) {
|
||
|
if(!permutation.Excluded(i)) {
|
||
|
master_index = i;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
const WordKey& masterKey = cursors[master_index]->GetFound().Key();
|
||
|
key_semantic.DocumentSet(masterKey, location);
|
||
|
|
||
|
for(unsigned int i = 0; i < cursors_length;) {
|
||
|
if(permutation.Excluded(i)) {
|
||
|
i++;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
if(cursor.IsAtEnd()) return WORD_SEARCH_NOPROXIMITY;
|
||
|
// if(cursor.status & WORD_WALK_FAILED) return NOTOK;
|
||
|
|
||
|
//
|
||
|
// If the cursor moved outside of the tested document,
|
||
|
// no proximity match is possible.
|
||
|
//
|
||
|
if(key_semantic.DocumentCompare(cursor.GetFound().Key(), document) != 0)
|
||
|
return WORD_SEARCH_NOPROXIMITY;
|
||
|
|
||
|
int direction = key_semantic.LocationCompare(location, cursor.GetFound().Key(), proximity);
|
||
|
|
||
|
//
|
||
|
// If the cursor is in the authorized locations, consider
|
||
|
// next cursor
|
||
|
//
|
||
|
if(direction == 0) {
|
||
|
//
|
||
|
// master cursor makes the rules for location : its location
|
||
|
// is the base to calculate other words mandatory loacations.
|
||
|
//
|
||
|
if(i == master_index)
|
||
|
key_semantic.LocationSet(cursor.GetFound().Key(), location);
|
||
|
//
|
||
|
// Fix location constraint to accomodate proximity tolerance.
|
||
|
//
|
||
|
key_semantic.LocationNearLowest(location, proximity);
|
||
|
i++;
|
||
|
|
||
|
//
|
||
|
// If current location is greater than cursor location
|
||
|
//
|
||
|
} else if(direction > 0) {
|
||
|
//
|
||
|
// Move the cursor up to the location.
|
||
|
//
|
||
|
cursor.Seek(location);
|
||
|
if(verbose > 1) fprintf(stderr, "WordTreeOptional::CursorsObeyProximity: leap to %s\n", (char*)location.Get());
|
||
|
int ret;
|
||
|
if((ret = cursor.WalkNext()) != OK) {
|
||
|
if(ret == WORD_WALK_ATEND) {
|
||
|
return WORD_SEARCH_NOPROXIMITY;
|
||
|
} else {
|
||
|
return NOTOK;
|
||
|
}
|
||
|
}
|
||
|
//
|
||
|
// Remove the location constraint for the master word
|
||
|
// so that it matches and then enforce location for other
|
||
|
// keys.
|
||
|
//
|
||
|
if(i == master_index)
|
||
|
key_semantic.Location2Document(location);
|
||
|
//
|
||
|
// Reconsider the situation for this cursor
|
||
|
//
|
||
|
|
||
|
//
|
||
|
// If current location is lower than cursor location,
|
||
|
// meaning that the cursor found no match in the current
|
||
|
// document.
|
||
|
//
|
||
|
} else if(direction < 0) {
|
||
|
//
|
||
|
// Move to next master key, if possible.
|
||
|
//
|
||
|
if(i != master_index) {
|
||
|
key_semantic.LocationSet(masterKey, location);
|
||
|
key_semantic.LocationNext(location);
|
||
|
}
|
||
|
//
|
||
|
// Since the current location changed, start over.
|
||
|
//
|
||
|
i = 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Helper class for AscendingFrequency method
|
||
|
//
|
||
|
class WordSort {
|
||
|
public:
|
||
|
unsigned int frequency;
|
||
|
WordTree *cursor;
|
||
|
};
|
||
|
|
||
|
//
|
||
|
// Helper function for AscendingFrequency method
|
||
|
//
|
||
|
static int ascending_frequency(const void *a, const void *b)
|
||
|
{
|
||
|
const WordSort& a_cursor = *(WordSort*)a;
|
||
|
const WordSort& b_cursor = *(WordSort*)b;
|
||
|
|
||
|
return a_cursor.frequency - b_cursor.frequency;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::AscendingFrequency()
|
||
|
{
|
||
|
//
|
||
|
// Reorder cursors
|
||
|
//
|
||
|
WordSort *tmp = new WordSort[cursors_length];
|
||
|
|
||
|
memset((char*)tmp, '\0', cursors_length * sizeof(WordSort));
|
||
|
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
unsigned int frequency;
|
||
|
if(cursors[i]->Noccurrence(frequency) != OK) {
|
||
|
delete [] tmp;
|
||
|
return NOTOK;
|
||
|
}
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::AscendingFrequency: %s occurs %d times\n", (char*)cursors[i]->GetSearch().Get(), frequency);
|
||
|
tmp[i].frequency = frequency;
|
||
|
tmp[i].cursor = cursors[i];
|
||
|
}
|
||
|
|
||
|
memset((char*)cursors, '\0', cursors_length * sizeof(WordTree*));
|
||
|
|
||
|
qsort((void *)tmp, cursors_length, sizeof(WordSort), &ascending_frequency);
|
||
|
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
cursors[i] = tmp[i].cursor;
|
||
|
|
||
|
delete [] tmp;
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOptional::StripNonExistent(unsigned int& stripped)
|
||
|
{
|
||
|
stripped = 0;
|
||
|
|
||
|
WordTree** tmp = new WordTree*[cursors_length];
|
||
|
memset((char*)tmp, '\0', cursors_length * sizeof(WordTree*));
|
||
|
|
||
|
unsigned int from;
|
||
|
unsigned int to;
|
||
|
|
||
|
for(to = from = 0; from < cursors_length; from++) {
|
||
|
unsigned int frequency;
|
||
|
if(cursors[from]->Noccurrence(frequency) != OK) {
|
||
|
delete [] tmp;
|
||
|
return NOTOK;
|
||
|
}
|
||
|
|
||
|
if(verbose > 2) fprintf(stderr, "WordTreeOptional::StripNonExistent: %s occurs %d times\n", (char*)cursors[from]->GetSearch().Get(), frequency);
|
||
|
if(frequency > 0) {
|
||
|
tmp[to++] = cursors[from];
|
||
|
} else {
|
||
|
delete cursors[from];
|
||
|
stripped++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
memset((char*)cursors, '\0', cursors_length * sizeof(WordTree*));
|
||
|
|
||
|
cursors_length = to;
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
cursors[i] = tmp[i];
|
||
|
|
||
|
delete [] tmp;
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
// ************************* WordTreeOr implementation ********************
|
||
|
|
||
|
class WordTreeOr : public WordTreeOperand {
|
||
|
public:
|
||
|
WordTreeOr(const char* nscope) : WordTreeOperand(nscope) { }
|
||
|
|
||
|
//-
|
||
|
// Return WORD_TREE_OR
|
||
|
//
|
||
|
virtual int IsA() const { return WORD_TREE_OR; }
|
||
|
|
||
|
virtual int Optimize();
|
||
|
|
||
|
virtual int ContextSaveList(StringList& list) const;
|
||
|
|
||
|
virtual int ContextRestoreList(StringList& list);
|
||
|
|
||
|
virtual void SetInfo();
|
||
|
|
||
|
virtual int WalkNext();
|
||
|
|
||
|
virtual int UsePermutation() const { return 0; }
|
||
|
|
||
|
virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_NO; }
|
||
|
};
|
||
|
|
||
|
int WordTreeOr::Optimize()
|
||
|
{
|
||
|
int ret;
|
||
|
if((ret = WordTreeOperand::Optimize()) != OK)
|
||
|
return ret;
|
||
|
|
||
|
if((ret = AscendingFrequency()) != OK)
|
||
|
return ret;
|
||
|
|
||
|
unsigned int stripped;
|
||
|
if((ret = StripNonExistent(stripped)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
return OptimizeOr(WORD_PERMUTE_PROXIMITY_NO);
|
||
|
}
|
||
|
|
||
|
int WordTreeOr::ContextSaveList(StringList& list) const
|
||
|
{
|
||
|
int ret;
|
||
|
if((ret = WordTreeOperand::ContextSaveList(list)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
{
|
||
|
String* buffer = new String();
|
||
|
permutation.Get(*buffer);
|
||
|
|
||
|
list.Add(buffer);
|
||
|
}
|
||
|
|
||
|
{
|
||
|
String* buffer = new String();
|
||
|
if((ret = WordCursor::ContextSave(*buffer)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
list.Add(buffer);
|
||
|
}
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
int WordTreeOr::ContextRestoreList(StringList& list)
|
||
|
{
|
||
|
int ret;
|
||
|
if((ret = WordTreeOperand::ContextRestoreList(list)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
{
|
||
|
char* buffer = list[0];
|
||
|
if((ret = permutation.Set(buffer)) != OK)
|
||
|
return ret;
|
||
|
list.Remove(0);
|
||
|
}
|
||
|
|
||
|
{
|
||
|
char* buffer = list[0];
|
||
|
if(!buffer) return NOTOK;
|
||
|
WordKey key(buffer);
|
||
|
if((ret = Seek(key)) != OK)
|
||
|
return ret;
|
||
|
cursor_get_flags = DB_NEXT;
|
||
|
|
||
|
list.Remove(0);
|
||
|
}
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
void WordTreeOr::SetInfo()
|
||
|
{
|
||
|
unsigned int i;
|
||
|
for(i = 0; i < cursors_length; i++)
|
||
|
cursors[i]->SetInfo();
|
||
|
|
||
|
info.trunc();
|
||
|
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
|
||
|
if(!permutation.Excluded(i) &&
|
||
|
!cursor.IsAtEnd() &&
|
||
|
key_semantic.DocumentCompare(cursor.GetFound().Key(), GetFound().Key()) == 0) {
|
||
|
info << cursor.info << " ";
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int WordTreeOr::WalkNext()
|
||
|
{
|
||
|
WordKey& constraint = pos;
|
||
|
//
|
||
|
// Set constraint with all 0
|
||
|
//
|
||
|
if(constraint.Empty())
|
||
|
key_semantic.DocumentClear(constraint);
|
||
|
|
||
|
WordKey candidate;
|
||
|
int match_ok;
|
||
|
do {
|
||
|
int ret;
|
||
|
unsigned int i;
|
||
|
candidate.Clear();
|
||
|
//
|
||
|
// Advance cursors so that next call fetches another constraint
|
||
|
//
|
||
|
if(cursor_get_flags == DB_NEXT)
|
||
|
key_semantic.DocumentNext(constraint, uniq);
|
||
|
|
||
|
if((ret = Seek(constraint)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
match_ok = 1;
|
||
|
//
|
||
|
// All non excluded cursors are about to move
|
||
|
// at or beyond constraint. Search for the one (candidate) that
|
||
|
// is located at the lowest location beyond the constraint.
|
||
|
//
|
||
|
for(i = 0; i < cursors_length; i++) {
|
||
|
if(permutation.Excluded(i))
|
||
|
continue;
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
|
||
|
switch((ret = cursor.WalkNext())) {
|
||
|
case WORD_WALK_ATEND:
|
||
|
//
|
||
|
// Constraint is above all matches for this cursor
|
||
|
//
|
||
|
break;
|
||
|
case OK:
|
||
|
//
|
||
|
// If candidate is not set or current cursor is below
|
||
|
// the current candidate, the curent cursor document becomes
|
||
|
// the candidate.
|
||
|
//
|
||
|
if(candidate.Empty() ||
|
||
|
key_semantic.DocumentCompare(candidate, cursor.GetFound().Key()) > 0) {
|
||
|
key_semantic.DocumentSet(cursor.GetDocument(), candidate);
|
||
|
}
|
||
|
break;
|
||
|
default:
|
||
|
return ret;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// No candidate ? It's the end of the match list.
|
||
|
//
|
||
|
if(candidate.Empty())
|
||
|
return WORD_WALK_ATEND;
|
||
|
|
||
|
found.Key().CopyFrom(candidate);
|
||
|
|
||
|
SetInfo();
|
||
|
|
||
|
if(permutation.ExcludedCount() > 0) {
|
||
|
if((ret = Seek(candidate)) != OK)
|
||
|
return ret;
|
||
|
|
||
|
//
|
||
|
// Restart loop if candidate matches an excluded cursor.
|
||
|
//
|
||
|
for(i = 0; i < cursors_length && match_ok; i++) {
|
||
|
if(!permutation.Excluded(i))
|
||
|
continue;
|
||
|
WordTree& cursor = *(cursors[i]);
|
||
|
|
||
|
switch((ret = cursor.WalkNext())) {
|
||
|
case WORD_WALK_ATEND:
|
||
|
//
|
||
|
// This excluded cursor can't match the candidate, fine.
|
||
|
//
|
||
|
break;
|
||
|
case OK:
|
||
|
//
|
||
|
// This excluded cursor matches candidate therefore it's
|
||
|
// not a valid candidate. Restart search with this candidate
|
||
|
// as the constraint.
|
||
|
//
|
||
|
if(key_semantic.DocumentCompare(candidate, cursor.GetFound().Key()) == 0) {
|
||
|
constraint = candidate;
|
||
|
match_ok = 0;
|
||
|
}
|
||
|
break;
|
||
|
default:
|
||
|
return ret;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
cursor_get_flags = DB_NEXT;
|
||
|
|
||
|
} while(!match_ok);
|
||
|
|
||
|
constraint = candidate;
|
||
|
|
||
|
return OK;
|
||
|
}
|
||
|
|
||
|
// ************************* WordTreeAnd implementation ********************
|
||
|
|
||
|
class WordTreeAnd : public WordTreeOptional {
|
||
|
public:
|
||
|
WordTreeAnd(const char* nscope) : WordTreeOptional(nscope) { }
|
||
|
|
||
|
//-
|
||
|
// Return WORD_TREE_AND
|
||
|
//
|
||
|
virtual int IsA() const { return WORD_TREE_AND; }
|
||
|
|
||
|
virtual int UsePermutation() const { return 0; }
|
||
|
|
||
|
virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_NO; }
|
||
|
|
||
|
virtual int AllOrNothing() const { return 1; }
|
||
|
};
|
||
|
|
||
|
// ************************* WordTreeNear implementation ********************
|
||
|
|
||
|
class WordTreeNear : public WordTreeOptional {
|
||
|
public:
|
||
|
WordTreeNear(const char* nscope) : WordTreeOptional(nscope) { }
|
||
|
|
||
|
//-
|
||
|
// Return WORD_TREE_NEAR
|
||
|
//
|
||
|
virtual int IsA() const { return WORD_TREE_NEAR; }
|
||
|
|
||
|
virtual int UsePermutation() const { return 0; }
|
||
|
|
||
|
virtual int UseProximity() const { return WORD_PERMUTE_PROXIMITY_ONLY; }
|
||
|
|
||
|
virtual int AllOrNothing() const { return 1; }
|
||
|
};
|
||
|
|
||
|
// ************************* WordTreeMandatory implementation ***************
|
||
|
|
||
|
class WordTreeMandatory : public WordTreeOperand {
|
||
|
public:
|
||
|
WordTreeMandatory(const char* nscope) : WordTreeOperand(nscope) { }
|
||
|
|
||
|
//-
|
||
|
// Return WORD_TREE_MANDATORY
|
||
|
//
|
||
|
virtual int IsA() const { return WORD_TREE_MANDATORY; }
|
||
|
};
|
||
|
|
||
|
// ************************* WordTreeNot implementation ***************
|
||
|
|
||
|
class WordTreeNot : public WordTreeOperand {
|
||
|
public:
|
||
|
WordTreeNot(const char* nscope) : WordTreeOperand(nscope) { }
|
||
|
|
||
|
//-
|
||
|
// Return WORD_TREE_NOT
|
||
|
//
|
||
|
virtual int IsA() const { return WORD_TREE_NOT; }
|
||
|
};
|
||
|
|
||
|
// ************************* WordMatch implementation ********************
|
||
|
|
||
|
//
|
||
|
// Return value of the Search method, tells us which document
|
||
|
// matched and why.
|
||
|
//
|
||
|
class WordMatch {
|
||
|
public:
|
||
|
|
||
|
//-
|
||
|
// Return a textual representation of the object.
|
||
|
//
|
||
|
String Get() const;
|
||
|
|
||
|
//-
|
||
|
// The document that matched
|
||
|
//
|
||
|
WordKey match;
|
||
|
//-
|
||
|
// An ascii description of why it matched.
|
||
|
//
|
||
|
String info;
|
||
|
};
|
||
|
|
||
|
String WordMatch::Get() const
|
||
|
{
|
||
|
String tmp;
|
||
|
match.Get(tmp);
|
||
|
if(!info.empty())
|
||
|
tmp << "(" << info << ")";
|
||
|
return tmp;
|
||
|
}
|
||
|
|
||
|
// ************************* WordSearch implementation ********************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// Solve a query from a WordTree syntax tree
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordSearch.h>
|
||
|
//
|
||
|
// WordTree* expr = get_query();
|
||
|
// WordSearch search;
|
||
|
// search.limit_count = NUMBER_OF_RESULTS;
|
||
|
// WordMatch* search.Search(expr);
|
||
|
// ...
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// The WordSearch class is a wrapper to query an inverted index
|
||
|
// using a WordTree syntax tree.
|
||
|
//
|
||
|
// END
|
||
|
//
|
||
|
class WordSearch {
|
||
|
public:
|
||
|
WordSearch();
|
||
|
|
||
|
//-
|
||
|
// Perform a search from the <b>expr</b> specifications.
|
||
|
// Restore the context from <i>context_in</i> on <b>expr</b>.
|
||
|
// Then skip (using WalkNext) <i>limit_bottom</i> entries.
|
||
|
// Then collect in a WordMatch array of size <i>limit_count</i>
|
||
|
// each match returned by WalkNext. When finished store
|
||
|
// the context (ContextSave) in <i>context_out</i>.
|
||
|
// It is the responsibility of the caller to free the WordMatch
|
||
|
// array. If no match are found a null pointer is returned.
|
||
|
//
|
||
|
WordMatch *Search();
|
||
|
|
||
|
//
|
||
|
// Search backend, only run the WalkNext loop but does not
|
||
|
// allocate/deallocate data.
|
||
|
//
|
||
|
int SearchLoop(WordTree *expr);
|
||
|
|
||
|
//
|
||
|
// Return a context description string to resume the
|
||
|
// search at a given point.
|
||
|
//
|
||
|
const String& Context() const { return context_out; }
|
||
|
|
||
|
//
|
||
|
// Input
|
||
|
//
|
||
|
unsigned int limit_bottom;
|
||
|
unsigned int limit_count;
|
||
|
String context_in;
|
||
|
WordTree* expr;
|
||
|
|
||
|
//
|
||
|
// Output
|
||
|
//
|
||
|
WordMatch* matches;
|
||
|
unsigned int matches_size;
|
||
|
unsigned int matches_length;
|
||
|
String context_out;
|
||
|
};
|
||
|
|
||
|
WordSearch::WordSearch()
|
||
|
{
|
||
|
//
|
||
|
// Input
|
||
|
//
|
||
|
limit_bottom = 0;
|
||
|
limit_count = 0;
|
||
|
context_in.trunc();
|
||
|
expr = 0;
|
||
|
|
||
|
//
|
||
|
// Output
|
||
|
//
|
||
|
matches = 0;
|
||
|
matches_size = 0;
|
||
|
matches_length = 0;
|
||
|
context_out.trunc();
|
||
|
}
|
||
|
|
||
|
WordMatch *WordSearch::Search()
|
||
|
{
|
||
|
int ret = 0;
|
||
|
|
||
|
if(verbose) fprintf(stderr, "WordSearch::Search: non optimized expression %s\n", (char*)expr->Get());
|
||
|
if(expr->Optimize() != OK)
|
||
|
return 0;
|
||
|
if(verbose) fprintf(stderr, "WordSearch::Search: optimized expression %s\n", (char*)expr->Get());
|
||
|
|
||
|
|
||
|
//
|
||
|
// Build space for results
|
||
|
//
|
||
|
matches_size = limit_count + 1;
|
||
|
matches = new WordMatch[matches_size];
|
||
|
matches_length = 0;
|
||
|
|
||
|
//
|
||
|
// Move to first possible position.
|
||
|
//
|
||
|
if(expr->WalkInit() != OK)
|
||
|
goto end;
|
||
|
|
||
|
if(expr->ContextRestore(context_in) == NOTOK)
|
||
|
goto end;
|
||
|
ret = SearchLoop(expr);
|
||
|
//
|
||
|
// Don't bother saving the context if at end of
|
||
|
// search (WORD_WALK_ATEND) or error (NOTOK)
|
||
|
//
|
||
|
if(ret == OK && expr->ContextSave(context_out) == NOTOK)
|
||
|
goto end;
|
||
|
|
||
|
end:
|
||
|
expr->WalkFinish();
|
||
|
|
||
|
if(ret == NOTOK || matches_length <= 0) {
|
||
|
delete [] matches;
|
||
|
matches = 0;
|
||
|
}
|
||
|
|
||
|
return matches;
|
||
|
}
|
||
|
|
||
|
int WordSearch::SearchLoop(WordTree *expr)
|
||
|
{
|
||
|
int ret = OK;
|
||
|
unsigned int i;
|
||
|
//
|
||
|
// Skip the first <limit_bottom> documents
|
||
|
//
|
||
|
{
|
||
|
for(i = 0; i < limit_bottom; i++) {
|
||
|
if((ret = expr->WalkNext()) != OK)
|
||
|
return ret;
|
||
|
}
|
||
|
}
|
||
|
//
|
||
|
// Get documents up to <limit_count> or exhaustion
|
||
|
//
|
||
|
for(matches_length = 0; matches_length < limit_count; matches_length++) {
|
||
|
if((ret = expr->WalkNext()) != OK) {
|
||
|
break;
|
||
|
} else {
|
||
|
matches[matches_length].match = expr->GetDocument();
|
||
|
if(expr->IsA() != WORD_TREE_LITERAL)
|
||
|
matches[matches_length].info = ((WordTreeOperand*)expr)->GetInfo();
|
||
|
if(verbose) fprintf(stderr, "WordSearch::Search: match %s\n", (char*)matches[matches_length].match.Get());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(ret == WORD_WALK_ATEND)
|
||
|
matches[matches_length].match.Clear();
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
// ************************* WordParser implementation ********************
|
||
|
//
|
||
|
// NAME
|
||
|
//
|
||
|
// Textual query parser for test purpose
|
||
|
//
|
||
|
// SYNOPSIS
|
||
|
//
|
||
|
// #include <WordParser.h>
|
||
|
//
|
||
|
// WordParser parser;
|
||
|
// WordTree* expr = parser.Parse("( or \"scope1\" a query )");
|
||
|
// ...
|
||
|
// delete expr;
|
||
|
//
|
||
|
// DESCRIPTION
|
||
|
//
|
||
|
// The WordParser class implement a lisp-like parser for queries
|
||
|
// implemented by the WordTree derived classes. The syntax is rigid
|
||
|
// and should not be used for human input. The generic syntax of an
|
||
|
// expression is
|
||
|
// <pre>
|
||
|
// ( operator "scope" operand [operand ...] )
|
||
|
// </pre>
|
||
|
// The parenthesis must <b>always</b> be surrounded by white space otherwise
|
||
|
// the parser will be lost. The separator is white space and newline.
|
||
|
// Tabulation may be used in scope to separate key fields.
|
||
|
//
|
||
|
// As a special case a single word is strictly equivalent
|
||
|
// to
|
||
|
// <pre>
|
||
|
// ( literal "" word )
|
||
|
// </pre>
|
||
|
//
|
||
|
// Operators can be lower case or upper case. There is almost no syntax
|
||
|
// checking and it's the responsibility of the caller to associate meaningfull
|
||
|
// operands. For instance ( near ( not foo ) bar ) is meaningless.
|
||
|
//
|
||
|
// OPERATORS
|
||
|
//
|
||
|
// <dl>
|
||
|
//
|
||
|
// <dt> optional
|
||
|
// <dd> WordTreeOptional
|
||
|
//
|
||
|
// <dt> or
|
||
|
// <dd> WordTreeOr
|
||
|
//
|
||
|
// <dt> and
|
||
|
// <dd> WordTreeAnd
|
||
|
//
|
||
|
// <dt> near
|
||
|
// <dd> WordTreeNear
|
||
|
//
|
||
|
// <dt> not,forbiden
|
||
|
// <dd> WordTreeNot
|
||
|
//
|
||
|
// <dt> mandatory
|
||
|
// <dd> WordTreeMandatory
|
||
|
//
|
||
|
// <dt> literal
|
||
|
// <dd> WordTreeLiteral
|
||
|
//
|
||
|
// </dl>
|
||
|
//
|
||
|
//
|
||
|
// END
|
||
|
|
||
|
//
|
||
|
// Possible values of the info argument of ParseOperands
|
||
|
//
|
||
|
#define WORD_TREE_MANY 0x01
|
||
|
#define WORD_TREE_ONE 0x02
|
||
|
#define WORD_TREE_TWO 0x04
|
||
|
|
||
|
class WordParser {
|
||
|
public:
|
||
|
WordTree *Parse(const String& expr);
|
||
|
WordTree *ParseList(StringList& terms);
|
||
|
|
||
|
WordTree *ParseExpr(StringList& terms);
|
||
|
WordTree *ParseUnary(StringList& terms);
|
||
|
WordTree *ParseConj(StringList& terms);
|
||
|
void ParseOperands(StringList& terms, int info, WordTreeOperand* expr);
|
||
|
WordTree *ParseLiteral(StringList& terms);
|
||
|
char *ParseScope(StringList& terms);
|
||
|
|
||
|
void Shift(StringList& terms);
|
||
|
char *Term(StringList& terms);
|
||
|
};
|
||
|
|
||
|
WordTree *WordParser::Parse(const String& expr)
|
||
|
{
|
||
|
StringList terms(expr, " \n");
|
||
|
return ParseList(terms);
|
||
|
}
|
||
|
|
||
|
WordTree *WordParser::ParseList(StringList& terms)
|
||
|
{
|
||
|
WordTree *expr = ParseExpr(terms);
|
||
|
return expr;
|
||
|
}
|
||
|
|
||
|
WordTree *WordParser::ParseExpr(StringList& terms)
|
||
|
{
|
||
|
WordTree *expr = 0;
|
||
|
char* term = strdup(Term(terms));
|
||
|
if(!strcmp(term, "(")) {
|
||
|
Shift(terms);
|
||
|
expr = ParseExpr(terms);
|
||
|
} else if(!strcmp(term, ")")) {
|
||
|
//
|
||
|
// At end of expression, return null
|
||
|
//
|
||
|
} else if(!mystrcasecmp(term, "optional") ||
|
||
|
!mystrcasecmp(term, "or") ||
|
||
|
!mystrcasecmp(term, "and") ||
|
||
|
!mystrcasecmp(term, "near")) {
|
||
|
expr = ParseConj(terms);
|
||
|
} else if(!mystrcasecmp(term, "not") ||
|
||
|
!mystrcasecmp(term, "mandatory") ||
|
||
|
!mystrcasecmp(term, "forbiden")) {
|
||
|
expr = ParseUnary(terms);
|
||
|
} else {
|
||
|
expr = ParseLiteral(terms);
|
||
|
}
|
||
|
free(term);
|
||
|
return expr;
|
||
|
}
|
||
|
|
||
|
WordTree *WordParser::ParseUnary(StringList& terms)
|
||
|
{
|
||
|
int op = 0;
|
||
|
if(!mystrcasecmp(Term(terms), "mandatory"))
|
||
|
op = WORD_TREE_MANDATORY;
|
||
|
else if(!mystrcasecmp(Term(terms), "forbiden") ||
|
||
|
!mystrcasecmp(Term(terms), "not"))
|
||
|
op = WORD_TREE_NOT;
|
||
|
|
||
|
Shift(terms);
|
||
|
char* scope = ParseScope(terms);
|
||
|
WordTreeOperand *expr = 0;
|
||
|
switch(op) {
|
||
|
case WORD_TREE_MANDATORY:
|
||
|
expr = new WordTreeMandatory(scope);
|
||
|
break;
|
||
|
case WORD_TREE_NOT:
|
||
|
expr = new WordTreeNot(scope);
|
||
|
break;
|
||
|
default:
|
||
|
fprintf(stderr, "WordParser::ParseUnary: unexpected operator %d\n", op);
|
||
|
exit(1);
|
||
|
break;
|
||
|
}
|
||
|
free(scope);
|
||
|
ParseOperands(terms, WORD_TREE_ONE, expr);
|
||
|
return expr;
|
||
|
}
|
||
|
|
||
|
WordTree *WordParser::ParseConj(StringList& terms)
|
||
|
{
|
||
|
int op = 0;
|
||
|
if(!mystrcasecmp(Term(terms), "optional"))
|
||
|
op = WORD_TREE_OPTIONAL;
|
||
|
else if(!mystrcasecmp(Term(terms), "or"))
|
||
|
op = WORD_TREE_OR;
|
||
|
else if(!mystrcasecmp(Term(terms), "and"))
|
||
|
op = WORD_TREE_AND;
|
||
|
else if(!mystrcasecmp(Term(terms), "near"))
|
||
|
op = WORD_TREE_NEAR;
|
||
|
|
||
|
Shift(terms);
|
||
|
char* scope = ParseScope(terms);
|
||
|
WordTreeOperand *expr = 0;
|
||
|
switch(op) {
|
||
|
case WORD_TREE_OR:
|
||
|
expr = new WordTreeOr(scope);
|
||
|
break;
|
||
|
case WORD_TREE_OPTIONAL:
|
||
|
expr = new WordTreeOptional(scope);
|
||
|
break;
|
||
|
case WORD_TREE_AND:
|
||
|
expr = new WordTreeAnd(scope);
|
||
|
break;
|
||
|
case WORD_TREE_NEAR:
|
||
|
expr = new WordTreeNear(scope);
|
||
|
break;
|
||
|
default:
|
||
|
fprintf(stderr, "WordParser::ParseOrAnd: unexpected operator %d\n", op);
|
||
|
exit(1);
|
||
|
break;
|
||
|
}
|
||
|
free(scope);
|
||
|
ParseOperands(terms, WORD_TREE_MANY, expr);
|
||
|
return expr;
|
||
|
}
|
||
|
|
||
|
void WordParser::ParseOperands(StringList& terms, int info, WordTreeOperand* expr)
|
||
|
{
|
||
|
unsigned int operands_length = 0;
|
||
|
unsigned int operands_size = 1;
|
||
|
WordTree **operands = (WordTree**)malloc(operands_size * sizeof(WordTree*));
|
||
|
WordTree *subexpr = 0;
|
||
|
while((subexpr = ParseExpr(terms))) {
|
||
|
operands_length++;
|
||
|
if((info & WORD_TREE_ONE) && operands_length > 1) {
|
||
|
fprintf(stderr, "WordParser::ParseOperands: expected only one operands\n");
|
||
|
exit(1);
|
||
|
} else if((info & WORD_TREE_TWO) && operands_length > 2) {
|
||
|
fprintf(stderr, "WordParser::ParseOperands: expected only two operands\n");
|
||
|
exit(1);
|
||
|
}
|
||
|
if(operands_length > operands_size) {
|
||
|
operands_size = operands_length * 2;
|
||
|
operands = (WordTree**)realloc(operands, operands_size * sizeof(WordTree*));
|
||
|
}
|
||
|
operands[operands_length - 1] = subexpr;
|
||
|
}
|
||
|
//
|
||
|
// Discard close parenthesis
|
||
|
//
|
||
|
if(strcmp(Term(terms), ")")) {
|
||
|
fprintf(stderr, "WordParser::ParseOperands: expected close parenthesis\n");
|
||
|
exit(1);
|
||
|
}
|
||
|
Shift(terms);
|
||
|
|
||
|
expr->cursors = operands;
|
||
|
expr->cursors_length = operands_length;
|
||
|
}
|
||
|
|
||
|
WordTree *WordParser::ParseLiteral(StringList& terms)
|
||
|
{
|
||
|
char* term = strdup(Term(terms));
|
||
|
char* scope = 0;
|
||
|
if(!mystrcasecmp(term, "literal")) {
|
||
|
Shift(terms);
|
||
|
scope = ParseScope(terms);
|
||
|
free(term);
|
||
|
term = strdup(Term(terms));
|
||
|
Shift(terms);
|
||
|
} else {
|
||
|
scope = strdup("");
|
||
|
}
|
||
|
WordTreeLiteral *expr = new WordTreeLiteral(term, scope);
|
||
|
Shift(terms);
|
||
|
free(scope);
|
||
|
free(term);
|
||
|
return expr;
|
||
|
}
|
||
|
|
||
|
char *WordParser::ParseScope(StringList& terms)
|
||
|
{
|
||
|
char *scope = Term(terms);
|
||
|
int scope_length = strlen(scope);
|
||
|
|
||
|
//
|
||
|
// Remove surrounding quotes, if any
|
||
|
//
|
||
|
if(scope_length > 0) {
|
||
|
if(scope[scope_length - 1] == '"')
|
||
|
scope[--scope_length] = '\0';
|
||
|
if(scope[0] == '"')
|
||
|
scope++;
|
||
|
}
|
||
|
|
||
|
scope = strdup(scope);
|
||
|
|
||
|
Shift(terms);
|
||
|
|
||
|
return scope;
|
||
|
}
|
||
|
|
||
|
char *WordParser::Term(StringList& terms)
|
||
|
{
|
||
|
char *term = terms[0];
|
||
|
if(!term) {
|
||
|
fprintf(stderr, "WordParser::Term: unexpected end of expression\n");
|
||
|
exit(1);
|
||
|
}
|
||
|
return term;
|
||
|
}
|
||
|
|
||
|
void WordParser::Shift(StringList& terms)
|
||
|
{
|
||
|
terms.Shift(LIST_REMOVE_DESTROY);
|
||
|
}
|
||
|
|
||
|
// ************************* main loop implementation ********************
|
||
|
|
||
|
//
|
||
|
// Store all options from the command line
|
||
|
//
|
||
|
class params_t
|
||
|
{
|
||
|
public:
|
||
|
char* dbfile;
|
||
|
char* find;
|
||
|
unsigned int bottom;
|
||
|
unsigned int count;
|
||
|
char* context;
|
||
|
int uniq_server;
|
||
|
int proximity;
|
||
|
int nop;
|
||
|
int exclude;
|
||
|
};
|
||
|
|
||
|
//
|
||
|
// Explain options
|
||
|
//
|
||
|
static void usage();
|
||
|
//
|
||
|
// Torture WordExclude* classes
|
||
|
//
|
||
|
static void exclude_test();
|
||
|
|
||
|
int main(int ac, char **av)
|
||
|
{
|
||
|
int c;
|
||
|
extern char *optarg;
|
||
|
params_t params;
|
||
|
|
||
|
params.dbfile = strdup("test");
|
||
|
params.find = 0;
|
||
|
params.bottom = 0;
|
||
|
params.count = 10;
|
||
|
params.context = 0;
|
||
|
params.uniq_server = 0;
|
||
|
params.proximity = WORD_SEARCH_DEFAULT_PROXIMITY;
|
||
|
params.nop = 0;
|
||
|
params.exclude = 0;
|
||
|
|
||
|
while ((c = getopt(ac, av, "vB:f:b:c:C:SP:ne")) != -1)
|
||
|
{
|
||
|
switch (c)
|
||
|
{
|
||
|
case 'v':
|
||
|
verbose++;
|
||
|
break;
|
||
|
case 'B':
|
||
|
free(params.dbfile);
|
||
|
params.dbfile = strdup(optarg);
|
||
|
break;
|
||
|
case 'f':
|
||
|
params.find = strdup(optarg);
|
||
|
break;
|
||
|
case 'b':
|
||
|
params.bottom = (unsigned int)atoi(optarg);
|
||
|
break;
|
||
|
case 'c':
|
||
|
params.count = (unsigned int)atoi(optarg);
|
||
|
break;
|
||
|
case 'C':
|
||
|
params.context = strdup(optarg);
|
||
|
break;
|
||
|
case 'P':
|
||
|
params.proximity = atoi(optarg);
|
||
|
break;
|
||
|
case 'S':
|
||
|
params.uniq_server = SERVER;
|
||
|
break;
|
||
|
case 'n':
|
||
|
params.nop = 1;
|
||
|
break;
|
||
|
case 'e':
|
||
|
params.exclude = 1;
|
||
|
break;
|
||
|
case '?':
|
||
|
usage();
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(params.exclude) {
|
||
|
exclude_test();
|
||
|
exit(0);
|
||
|
}
|
||
|
|
||
|
if(!params.find)
|
||
|
usage();
|
||
|
|
||
|
Configuration* config = WordContext::Initialize();
|
||
|
if(!config) {
|
||
|
fprintf(stderr, "search: no config file found\n");
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Forward command line verbosity to htword library.
|
||
|
//
|
||
|
if(verbose > 1) {
|
||
|
String tmp;
|
||
|
tmp << (verbose - 1);
|
||
|
config->Add("wordlist_verbose", tmp);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Prepare the index (-B).
|
||
|
//
|
||
|
WordList words(*config);
|
||
|
words.Open(params.dbfile, O_RDONLY);
|
||
|
|
||
|
//
|
||
|
// Try the query parser alone
|
||
|
//
|
||
|
if(params.nop) {
|
||
|
WordTree* expr = WordParser().Parse(params.find);
|
||
|
printf("%s\n", (char*)expr->Get());
|
||
|
exit(0);
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Build a syntax tree from the expression provided by user
|
||
|
//
|
||
|
WordTree* expr = WordParser().Parse(params.find);
|
||
|
|
||
|
//
|
||
|
// Define the semantic of the key
|
||
|
//
|
||
|
{
|
||
|
#define DOCUMENT_LENGTH 3
|
||
|
static int document[DOCUMENT_LENGTH] = {
|
||
|
TAG,
|
||
|
SERVER,
|
||
|
URL
|
||
|
};
|
||
|
int document_length = DOCUMENT_LENGTH;
|
||
|
int location = LOCATION;
|
||
|
if(expr->Prepare(&words, params.uniq_server, params.proximity, document, document_length, location) != OK)
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
WordSearch* search = new WordSearch();
|
||
|
|
||
|
//
|
||
|
// Forward query options to WordSearch object
|
||
|
//
|
||
|
search->limit_bottom = params.bottom; // -b
|
||
|
search->limit_count = params.count; // -c
|
||
|
if(params.context) // -C
|
||
|
search->context_in.set(params.context, strlen(params.context));
|
||
|
|
||
|
//
|
||
|
// Perform the search (-f)
|
||
|
//
|
||
|
search->expr = expr;
|
||
|
WordMatch* matches = search->Search();
|
||
|
|
||
|
//
|
||
|
// Display results, if any.
|
||
|
//
|
||
|
if(matches) {
|
||
|
int i;
|
||
|
for(i = 0; !matches[i].match.Empty(); i++)
|
||
|
printf("match: %s\n", (char*)matches[i].Get());
|
||
|
const String& context = search->Context();
|
||
|
if(!context.empty())
|
||
|
printf("context: %s\n", (const char*)context);
|
||
|
delete [] matches;
|
||
|
} else {
|
||
|
printf("match: none\n");
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Cleanup
|
||
|
//
|
||
|
delete search;
|
||
|
if(params.context) free(params.context);
|
||
|
if(params.find) free(params.find);
|
||
|
if(params.dbfile) free(params.dbfile);
|
||
|
delete expr;
|
||
|
|
||
|
words.Close();
|
||
|
delete config;
|
||
|
}
|
||
|
|
||
|
static void exclude_test()
|
||
|
{
|
||
|
static unsigned int expected[] = {
|
||
|
0x00000001,
|
||
|
0x00000002,
|
||
|
0x00000004,
|
||
|
0x00000008,
|
||
|
0x00000010,
|
||
|
0x00000003,
|
||
|
0x00000005,
|
||
|
0x00000006,
|
||
|
0x00000009,
|
||
|
0x0000000a,
|
||
|
0x0000000c,
|
||
|
0x00000011,
|
||
|
0x00000012,
|
||
|
0x00000014,
|
||
|
0x00000018,
|
||
|
0x00000007,
|
||
|
0x0000000b,
|
||
|
0x0000000d,
|
||
|
0x0000000e,
|
||
|
0x00000013,
|
||
|
0x00000015,
|
||
|
0x00000016,
|
||
|
0x00000019,
|
||
|
0x0000001a,
|
||
|
0x0000001c,
|
||
|
0x0000000f,
|
||
|
0x00000017,
|
||
|
0x0000001b,
|
||
|
0x0000001d,
|
||
|
0x0000001e,
|
||
|
0x0000001f
|
||
|
};
|
||
|
|
||
|
//
|
||
|
// WordExclude
|
||
|
//
|
||
|
if(verbose) fprintf(stderr, "exclude_test: testing WordExclude\n");
|
||
|
{
|
||
|
WordExclude exclude;
|
||
|
exclude.Initialize(5);
|
||
|
int count = 0;
|
||
|
while(exclude.Next() == WORD_EXCLUDE_OK) {
|
||
|
if(expected[count] != exclude.Mask()) {
|
||
|
fprintf(stderr, "exclude_test: WordExclude iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask());
|
||
|
exit(1);
|
||
|
}
|
||
|
count++;
|
||
|
}
|
||
|
if(count != sizeof(expected)/sizeof(unsigned int)) {
|
||
|
fprintf(stderr, "exclude_test: WordExclude expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count);
|
||
|
exit(1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// WordExcludeMask without ignore bits behaves exactly the same
|
||
|
// as WordExclude.
|
||
|
//
|
||
|
if(verbose) fprintf(stderr, "exclude_test: testing WordExcludeMask behaving like WordExclude\n");
|
||
|
{
|
||
|
WordExcludeMask exclude;
|
||
|
exclude.Initialize(5, 0, 0);
|
||
|
int count = 0;
|
||
|
while(exclude.Next() == WORD_EXCLUDE_OK) {
|
||
|
if(expected[count] != exclude.Mask()) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask 1 iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask());
|
||
|
exit(1);
|
||
|
}
|
||
|
count++;
|
||
|
}
|
||
|
if(count != sizeof(expected)/sizeof(unsigned int)) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask 1 expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count);
|
||
|
exit(1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// WordExcludeMask
|
||
|
//
|
||
|
if(verbose) fprintf(stderr, "exclude_test: testing WordExcludeMask\n");
|
||
|
{
|
||
|
static unsigned int expected[] = {
|
||
|
0x00000102,
|
||
|
0x00000108,
|
||
|
0x00000120,
|
||
|
0x00000180,
|
||
|
0x0000010a,
|
||
|
0x00000122,
|
||
|
0x00000128,
|
||
|
0x00000182,
|
||
|
0x00000188,
|
||
|
0x000001a0,
|
||
|
0x0000012a,
|
||
|
0x0000018a,
|
||
|
0x000001a2,
|
||
|
0x000001a8,
|
||
|
0x000001aa
|
||
|
};
|
||
|
static unsigned int excluded[] = {
|
||
|
1,
|
||
|
0,
|
||
|
0,
|
||
|
0,
|
||
|
1,
|
||
|
1,
|
||
|
0,
|
||
|
1,
|
||
|
0,
|
||
|
0,
|
||
|
1,
|
||
|
1,
|
||
|
1,
|
||
|
0,
|
||
|
1
|
||
|
};
|
||
|
|
||
|
WordExcludeMask exclude;
|
||
|
unsigned int ignore = 0x155;
|
||
|
unsigned int ignore_mask = 0x100;
|
||
|
exclude.Initialize(9, ignore, ignore_mask);
|
||
|
if(verbose) {
|
||
|
fprintf(stderr, "exclude_test: ignore\n");
|
||
|
show_bits(ignore);
|
||
|
fprintf(stderr, "exclude_test: ignore_mask\n");
|
||
|
show_bits(ignore_mask);
|
||
|
}
|
||
|
if(exclude.NotExcludedCount() != 8) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask 2 expected NoExcludedCount = 8 but got %d\n", exclude.NotExcludedCount());
|
||
|
exit(1);
|
||
|
}
|
||
|
int count = 0;
|
||
|
while(exclude.Next() == WORD_EXCLUDE_OK) {
|
||
|
if(expected[count] != exclude.Mask()) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d expected 0x%08x but got 0x%08x\n", count, expected[count], exclude.Mask());
|
||
|
exit(1);
|
||
|
}
|
||
|
//
|
||
|
// Test Excluded() method on ignored bit
|
||
|
// Is bit 5 set ? (9 - 4) = 5 (counting from 1)
|
||
|
//
|
||
|
if(exclude.Excluded(4)) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d bit 5 was set 0x%08x\n", count, exclude.Mask());
|
||
|
exit(1);
|
||
|
}
|
||
|
//
|
||
|
// Test Excluded() method on variable bit
|
||
|
// Is bit 2 set ? (9 - 2) = 7 (counting from 1)
|
||
|
//
|
||
|
if((exclude.Excluded(7) && !excluded[count]) ||
|
||
|
(!exclude.Excluded(7) && excluded[count])) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask 2 iteration %d expected bit 2 %s but was %s in 0x%08x\n", count, (excluded[count] ? "set" : "not set"), (excluded[count] ? "not set" : "set"), expected[count]);
|
||
|
exit(1);
|
||
|
}
|
||
|
count++;
|
||
|
}
|
||
|
if(count != sizeof(expected)/sizeof(unsigned int)) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask 2 expected %d iterations but got %d\n", (int)(sizeof(expected)/sizeof(unsigned int)), count);
|
||
|
exit(1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
{
|
||
|
WordExclude exclude;
|
||
|
String ascii("110101");
|
||
|
String tmp;
|
||
|
exclude.Set(ascii);
|
||
|
exclude.Get(tmp);
|
||
|
if(tmp != ascii) {
|
||
|
fprintf(stderr, "exclude_test: WordExclude::Get/Set expected %s but got %s\n", (char*)ascii, (char*)tmp);
|
||
|
exit(1);
|
||
|
}
|
||
|
if(exclude.Mask() != 0x2b) {
|
||
|
fprintf(stderr, "exclude_test: WordExclude::Mask expected 0x2b but got 0x%02x\n", exclude.Mask());
|
||
|
exit(1);
|
||
|
}
|
||
|
}
|
||
|
{
|
||
|
WordExcludeMask exclude;
|
||
|
String ascii("12031");
|
||
|
String tmp;
|
||
|
exclude.Set(ascii);
|
||
|
exclude.Get(tmp);
|
||
|
if(tmp != ascii) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask::Get/Set expected %s but got %s\n", (char*)ascii, (char*)tmp);
|
||
|
exit(1);
|
||
|
}
|
||
|
if(exclude.Mask() != 0x19) {
|
||
|
fprintf(stderr, "exclude_test: WordExcludeMask::Mask expected 0x19 but got 0x%02x\n", exclude.Mask());
|
||
|
exit(1);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// *****************************************************************************
|
||
|
// void usage()
|
||
|
// Display program usage information
|
||
|
//
|
||
|
static void usage()
|
||
|
{
|
||
|
printf("usage:\tsearch -f words [options]\n");
|
||
|
printf("\tsearch -e\n");
|
||
|
printf("Options:\n");
|
||
|
printf("\t-v\t\tIncreases the verbosity.\n");
|
||
|
printf("\t-B dbfile\tUse <dbfile> as a db file name (default test).\n");
|
||
|
printf("\t-f expr\t\tLisp like search expression.\n");
|
||
|
printf("\t\t\tSee WordParser comments in source for more information.\n");
|
||
|
printf("\t-b number\tSkip number documents before retrieving.\n");
|
||
|
printf("\t-c number\tRetrieve number documents at most.\n");
|
||
|
printf("\t-n\t\tOnly parse the search expression and print it.\n");
|
||
|
printf("\t-P proximity\tUse with near/optional, proximity tolerance is <proximity>\n");
|
||
|
printf("\t\t\tif negative order of terms is not meaningful\n");
|
||
|
printf("\t\t\t(default 1).\n");
|
||
|
printf("\t-C context\tResume search at <context>.\n");
|
||
|
printf("\t-S\t\tReturn at most one match per server.\n");
|
||
|
printf("\n");
|
||
|
printf("\t-e\t\tRun tests on WordExclude and WordExcludeMask.\n");
|
||
|
exit(1);
|
||
|
}
|