You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

583 lines
16 KiB

//
// WordCursor.cc
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: WordCursor.cc,v 1.4 2004/05/28 13:15:26 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include <stdlib.h>
#include "WordCursor.h"
#include "WordStat.h"
#include "WordList.h"
#include <stdio.h>
//
// WordCursor implementation
//
// *****************************************************************************
//
// Record the parameters of a search in this cursor: the word list to
// walk, the key to search for, the optional callback with its data and
// the action flags. Only data members are assigned here; nothing is
// opened or read. Always returns OK.
//
int WordCursor::Initialize(WordList *nwords,
                           const WordKey &nsearchKey,
                           wordlist_walk_callback_t ncallback,
                           Object *ncallback_data,
                           int naction)
{
  words = nwords;
  searchKey = nsearchKey;

  callback = ncallback;
  callback_data = ncallback_data;

  action = naction;

  return OK;
}
// *****************************************************************************
//
// Bring the whole cursor back to a blank state: search parameters,
// results, internal walk state, word list pointer and trace list.
//
void
WordCursor::Clear()
{
  //
  // Search parameters (see Initialize).
  //
  callback_data = 0;
  callback = 0;
  action = 0;
  searchKey.Clear();

  //
  // Results of the previous walk, then the internal walk state.
  //
  ClearResult();
  ClearInternal();

  words = 0;

  //
  // Debugging section.
  //
  traceRes = 0;
}
// *****************************************************************************
//
// Reset the state used internally while walking: close the database
// cursor, empty the packed key/data buffers and the prefix key, and
// make the next cursor read a DB_SET_RANGE lookup.
//
void
WordCursor::ClearInternal()
{
  // The return value of Close() is deliberately not inspected here.
  cursor.Close();

  data.trunc();
  key.trunc();
  prefixKey.Clear();

  searchKeyIsSameAsPrefix = 0;
  cursor_get_flags = DB_SET_RANGE;
}
// *****************************************************************************
//
// Forget the results of a previous walk: the status goes back to OK,
// the last found reference is cleared and the collector list pointer
// is set to null (the list itself is not deleted here).
//
void
WordCursor::ClearResult()
{
  status = OK;
  found.Clear();
  collectRes = 0;
}
int
WordCursor::ContextRestore(const String& buffer)
{
int ret = OK;
if(!buffer.empty()) {
WordKey key(buffer);
if((ret = Seek(key)) != OK)
return ret;
//
// Move to restored position so that next call to
// WalkNext will go above the restored position.
//
if((ret = WalkNext()) != OK)
return ret;
}
return ret;
}
// *****************************************************************************
//
// Walk and collect data from the word database.
//
// If action bit HTDIG_WORDLIST_COLLECTOR is set WordReferences are
// stored in a list (the collectRes data member).
// If action bit HTDIG_WORDLIST_WALKER is set the <callback> function
// is called for each WordReference found and no list is built.
// Walk itself returns a status: OK if the walk ran to the end and the
// cursor was closed properly, a non-OK status otherwise.
//
// The <searchKey> argument may be a fully qualified key, containing precise values for each
// field of the key. It may also contain only some fields of the key. In both cases
// all the word occurrences matching the fields set in the key are retrieved. It may
// be fast if key is a prefix (see WordKey::Prefix for a definition). It may
// be *slow* if key is not a prefix because it forces a complete walk of the
// index.
//
int
WordCursor::Walk()
{
  //
  // Three phases: WalkInit, WalkNext until it stops returning OK,
  // WalkFinish. A failure of WalkInit or WalkFinish is returned as is.
  //
  const int initStatus = WalkInit();
  if(initStatus != OK)
    return initStatus;

  int walkStatus;
  do {
    walkStatus = WalkNext();
  } while(walkStatus == OK);

  const int finishStatus = WalkFinish();
  if(finishStatus != OK)
    return finishStatus;

  //
  // Reaching the end of the matches is the only normal way to leave
  // the loop above; anything else is reported as a failure.
  //
  if(walkStatus == WORD_WALK_ATEND)
    return OK;
  return NOTOK;
}
//
// Prepare the cursor for a walk.
//
// Clears the results and internal state left by a previous walk, opens
// the database cursor on words->db.db, allocates the collector list if
// the HTDIG_WORDLIST_COLLECTOR action bit is set and computes the key
// from which the walk starts.
//
// Returns OK on success, or the non-zero value returned by cursor.Open
// if the cursor could not be opened.
//
int
WordCursor::WalkInit()
{
  ClearResult();
  ClearInternal();

  const int ret = cursor.Open(words->db.db);
  if(ret != 0)
    return ret;

  if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: action = %d, SearchKey = %s\n", action, (char*)searchKey.Get());

  if(action & HTDIG_WORDLIST_COLLECTOR) {
    collectRes = new List;
  }

  const WordReference& last = WordStat::Last();

  WordKey first_key;
  //
  // Move the cursor to start walking and do some sanity checks.
  //
  if(searchKey.Empty()) {
    //
    // Move past the stat data
    //
    if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: at start of keys because search key is empty\n");
    first_key = last.Key();
  } else {
    prefixKey = searchKey;
    //
    // If the key is a prefix, the start key is
    // the longest possible prefix contained in the key. If the
    // key does not contain any prefix, start from the beginning
    // of the file.
    //
    if(prefixKey.PrefixOnly() == NOTOK) {
      if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: at start of keys because search key is not a prefix\n");
      prefixKey.Clear();
      //
      // Move past the stat data
      //
      first_key = last.Key();
    } else {
      if(words->verbose) fprintf(stderr, "WordCursor::WalkInit: go to %s \n", (char*)prefixKey.Get());
      first_key = prefixKey;
    }
  }

  //
  // The packed start key is what the first cursor read looks up.
  //
  first_key.Pack(key);
  //
  // Allow Seek immediately after Init
  //
  found.Key().CopyFrom(first_key);
  status = OK;
  //
  // When the search key and the prefix key are identical, WalkNextStep
  // does not need the second, exact comparison against the search key.
  //
  searchKeyIsSameAsPrefix = searchKey.ExactEqual(prefixKey);
  cursor_get_flags = DB_SET_RANGE;

  return OK;
}
int
WordCursor::WalkRewind()
{
const WordReference& last = WordStat::Last();
WordKey first_key;
//
// Move the cursor to start walking and do some sanity checks.
//
if(searchKey.Empty()) {
first_key = last.Key();
} else {
prefixKey = searchKey;
//
// If the key is a prefix, the start key is
// the longest possible prefix contained in the key. If the
// key does not contain any prefix, start from the beginning
// of the file.
//
if(prefixKey.PrefixOnly() == NOTOK) {
prefixKey.Clear();
//
// Move past the stat data
//
first_key = last.Key();
} else {
first_key = prefixKey;
}
}
first_key.Pack(key);
//
// Allow Seek immediately after Rewind
//
found.Key().CopyFrom(first_key);
status = OK;
searchKeyIsSameAsPrefix = searchKey.ExactEqual(prefixKey);
cursor_get_flags = DB_SET_RANGE;
return OK;
}
//
// Advance to the next matching entry. WalkNextStep is repeated for as
// long as it reports a false match (WORD_WALK_NOMATCH_FAILED); the
// first other status is returned unchanged.
//
int
WordCursor::WalkNext()
{
  for(;;) {
    const int stepStatus = WalkNextStep();
    if(stepStatus != WORD_WALK_NOMATCH_FAILED)
      return stepStatus;

    if(words->verbose > 1) fprintf(stderr, "WordCursor::WalkNext: got false match, retry\n");
  }
}
int
WordCursor::WalkNextStep()
{
status = OK;
{
int error;
if((error = cursor.Get(key, data, cursor_get_flags)) != 0) {
if(error == DB_NOTFOUND) {
if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, no more matches\n", (char*)searchKey.Get());
return (status = WORD_WALK_ATEND);
} else {
return WORD_WALK_GET_FAILED;
}
}
}
//
// Next step operation is always sequential walk
//
cursor_get_flags = DB_NEXT;
found.Unpack(key, data);
if(traceRes) traceRes->Add(new WordReference(found));
if(words->verbose > 1) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, candidate is %s\n", (char*)searchKey.Get(), (char*)found.Get());
//
// Don't bother to compare keys if we want to walk all the entries
//
if(!(searchKey.Empty())) {
// examples
// searchKey: aabc 1 ? ? ?
// prefixKey: aabc 1 ? ? ?
//
// Stop loop if we reach a record whose key does not
// match prefix key requirement, provided we have a valid
// prefix key.
// (ie. stop loop if we're past last possible match...)
//
if(!prefixKey.Empty() &&
!prefixKey.Equal(found.Key())) {
if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, no more matches because found a key that is greater than searchKey\n", (char*)searchKey.Get());
return (status = WORD_WALK_ATEND);
}
//
// Skip entries that do not exactly match the specified key.
//
if(!searchKeyIsSameAsPrefix &&
!searchKey.Equal(found.Key())) {
int ret;
switch((ret = SkipUselessSequentialWalking())) {
case OK:
if(words->verbose > 1) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, false match jump to %s\n", (char*)searchKey.Get(), (char*)found.Get());
return WORD_WALK_NOMATCH_FAILED;
break;
case WORD_WALK_ATEND:
if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, no more matches according to SkipUselessSequentialWalking\n", (char*)searchKey.Get());
return (status = WORD_WALK_ATEND);
break;
default:
fprintf(stderr, "WordCursor::WalkNextStep: SkipUselessSequentialWalking failed %d\n", ret);
return NOTOK;
break;
}
}
}
if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: looking for %s, found %s\n", (char*)searchKey.Get(), (char*)found.Get());
if(collectRes) {
if(words->verbose > 2) fprintf(stderr, "WordCursor::WalkNextStep: collect\n");
collectRes->Add(new WordReference(found));
} else if(callback) {
if(words->verbose > 2) fprintf(stderr, "WordCursor::WalkNextStep: calling callback\n");
int ret = (*callback)(words, cursor, &found, *(callback_data) );
//
// The callback function tells us that something went wrong, might
// as well stop walking.
//
if(ret != OK) {
if(words->verbose) fprintf(stderr, "WordCursor::WalkNextStep: callback returned NOTOK");
return WORD_WALK_CALLBACK_FAILED|(status = WORD_WALK_ATEND);
}
}
return OK;
}
//
// End a walk by closing the database cursor.
// Returns OK if cursor.Close() returned 0, NOTOK otherwise.
//
int
WordCursor::WalkFinish()
{
  if(words->verbose) fprintf(stderr, "WordCursor::WalkFinish\n");

  if(cursor.Close() != 0)
    return NOTOK;
  return OK;
}
// *****************************************************************************
//
// Helper for SkipUselessSequentialWalking.
// Undefine in foundKey all fields defined in searchKey
// so that they are not considered by SetToFollowing.
// It could become a method of WordKey but lacks generalisation and
// from what I see it is a rather specific operation.
//
static inline void complement(WordKey& key, const WordKey& mask)
{
  //
  // Give 'key' the opposite definition flags of 'mask': a field
  // defined in 'mask' becomes undefined in 'key' and a field
  // undefined in 'mask' becomes defined in 'key'.
  //
  const int fieldCount = WordKey::NFields();
  for(int field = 0; field < fieldCount; ++field) {
    if(!mask.IsDefined(field)) {
      key.SetDefined(field);
    } else {
      key.Undefined(field);
    }
  }

  //
  // Same inversion for the word suffix flag. When 'mask' has no
  // word suffix defined (searching for a prefix), the word in 'key'
  // must be allowed to increment, so field 0 is marked defined too.
  //
  if(!mask.IsDefinedWordSuffix()) {
    key.SetDefinedWordSuffix();
    key.SetDefined(0);
  } else {
    key.UndefinedWordSuffix();
  }
}
// *****************************************************************************
//
// Find out if we should better jump to the next possible key (DB_SET_RANGE) instead of
// sequential iterating (DB_NEXT).
// If it is decided that jump is a better move :
// cursor_get_flags = DB_SET_RANGE
// key = calculated next possible key
// Else
// do nothing
// Return values
// OK: skipping successful.
// WORD_WALK_ATEND : no more possible match, reached the maximum
// WORD_WALK_FAILED: general failure, occurs if called and no skipping
// necessary.
//
// Sequential searching can waste time by searching all keys, for example:
// If searching for Key: argh <DEF> <UNDEF> 10
// Under normal circumstances we would do the following
//
// DATA STATUS ACTION
// 1: argh 1 10 match DB_NEXT
// 2: argh 2 11 nomatch DB_NEXT
// 3: argh 2 15 nomatch DB_NEXT
// 4: argh 2 20 nomatch DB_NEXT
// 5: argh 2 30 nomatch DB_NEXT
// 6: argh 5 1 nomatch DB_NEXT
// 7: argh 5 8 nomatch DB_NEXT
// 8: argh 8 6 nomatch DB_NEXT
//
// But the optimal would be
//
// DATA STATUS ACTION
// 1: argh 1 10 match DB_NEXT
// 2: argh 2 11 nomatch DB_SET_RANGE argh 3 10
// 3: argh 2 15
// 4: argh 2 20
// 5: argh 2 30
// 6: argh 5 1 nomatch DB_SET_RANGE argh 5 10
// 7: argh 5 8
// 8: argh 8 6 nomatch DB_SET_RANGE argh 8 10
//
// That saves a lot of unnecessary hits. The underlying logic is a bit
// more complex but you have the idea.
//
int
WordCursor::SkipUselessSequentialWalking()
{
WordKey& foundKey = found.Key();
int nfields = WordKey::NFields();
int i;
//
// Find out how the searchKey and the foundKey differ.
//
int diff_field = 0;
int lower = 0;
if(!foundKey.Diff(searchKey, diff_field, lower)) {
//
// foundKey matches searchKey (no difference), don't
// skip, everything is fine. The caller of SkipUselessSequentialWalking
// is expected to avoid this case for efficiency.
//
return WORD_WALK_FAILED;
}
if(words->verbose > 2) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: looking for %s, candidate is %s\n", (char*)searchKey.Get(), (char*)foundKey.Get());
//
// Undefine in foundKey all fields defined in searchKey
// so that they are not considered by SetToFollowing.
//
complement(foundKey, searchKey);
//
// If the key found is lower than the searched key when
// considering only the fields defined in the search key,
// we only need to enforce the key to get the match.
// Otherwise we need to increment the found key to jump
// properly.
//
if(lower) {
if(words->verbose > 1) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: enforcing the search constraint is enough to jump forward\n");
for(i = diff_field + 1; i < nfields; i++)
if(foundKey.IsDefined(i)) foundKey.Set(i, 0);
} else {
if(words->verbose > 1) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: increment the key to jump forward\n");
//
// diff_field - 1 is not really necessary because diff_field is undefined
// in foundKey and would therefore be ignored by SetToFollowing. We write
// diff_field - 1 to clearly state that incrementing begins just before the
// field for which a difference was found.
//
int ret;
if((ret = foundKey.SetToFollowing(diff_field - 1)) != OK)
return ret;
}
//
// Copy all fields defined in searchKey into foundKey. This will copy
// searchKey in foundKey because all these fields have been
// previously undefined in foundKey.
//
foundKey.Merge(searchKey);
if(words->verbose > 2) fprintf(stderr, "WordCursor::SkipUselessSequentialWalking: looking for %s, jump to %s\n", (char*)searchKey.Get(), (char*)foundKey.Get());
//
// Instruct Next function to jump to the calculated key
//
if(foundKey.Pack(key) == NOTOK) {
return WORD_WALK_FAILED;
}
cursor_get_flags = DB_SET_RANGE;
return OK;
}
// *****************************************************************************
//
// Build a seek position from the search key and the patch, and
// initialize internal state so that WalkNext jumps to
// this key next time it's called.
//
// Technically this means: start from a copy of searchKey and override
// it with the patch field values, starting from the first field set in
// patch up to the last field of the key (fields of that range that are
// not set in patch are set to 0). Pack the result in the key field and
// set cursor_get_flags to DB_SET_RANGE.
//
int
WordCursor::Seek(const WordKey& patch)
{
int nfields = WordKey::NFields();
WordKey pos = searchKey;
if(patch.Empty()) {
fprintf(stderr, "WordCursor::Seek: empty patch is useless\n");
return NOTOK;
}
int i;
//
// Leave the most significant fields untouched
//
for(i = WORD_FIRSTFIELD; i < nfields; i++)
if(patch.IsDefined(i))
break;
//
// From the first value set in the patch to the end
// override.
//
for(; i < nfields; i++) {
if(patch.IsDefined(i))
pos.Set(i, patch.Get(i));
else
pos.Set(i, 0);
}
if(!pos.Filled()) {
fprintf(stderr, "WordCursor::Seek: only make sense if the resulting key is fully defined\n");
return NOTOK;
}
if(words->verbose > 2) fprintf(stderr, "WordCursor::Seek: seek to %s\n", (char*)pos.Get());
//
// Next move will jump to the patched key
//
pos.Pack(key);
cursor_get_flags = DB_SET_RANGE;
return OK;
}
//
// Ask the word list for the number of occurrences of the search key.
// The count is stored in <noccurrence> by WordList::Noccurrence, whose
// return value is passed back. Returns NOTOK if no word list is set.
//
int WordCursor::Noccurrence(unsigned int& noccurrence) const
{
  if(words)
    return words->Noccurrence(searchKey, noccurrence);

  fprintf(stderr, "WordCursor::Noccurrence: words not set (call Prepare first)\n");
  return NOTOK;
}
//
// Convert the whole structure to an ascii string description
//
int WordCursor::Get(String& bufferout) const
{
  //
  // <bufferout> is emptied first, then filled with three parts:
  // the input of the search, its output and the internal state.
  // A single scratch string receives the ascii form of each key in turn.
  // Always returns OK.
  //
  String text;
  const char* collectState = collectRes ? "set" : "not set";

  bufferout.trunc();

  searchKey.Get(text);
  bufferout << "Input: searchKey = " << text
            << ", action = " << action
            << "; Output: collectRes " << collectState;

  found.Get(text);
  bufferout << ", found = " << text
            << ", status = " << status;

  prefixKey.Get(text);
  bufferout << "; Internal State: prefixKey = " << text
            << ", cursor_get_flags = " << cursor_get_flags;

  return OK;
}