You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
602 lines
13 KiB
602 lines
13 KiB
//
|
|
// StringMatch.cc
|
|
//
|
|
// StringMatch: This class provides an interface to a fairly specialized string
|
|
// lookup facility. It is intended to be used as a replacement for any
|
|
// regular expression matching when the pattern string is in the form:
|
|
//
|
|
// <string1>|<string2>|<string3>|...
|
|
//
|
|
// Just like regular expression routines, the pattern needs to be
|
|
// compiled before it can be used. This is done using the Pattern()
|
|
// member function. Once the pattern has been compiled, the member
|
|
// function Find() can be used to search for the pattern in a string.
|
|
// If a string has been found, the "which" and "length" parameters
|
|
// will be set to the string index and string length respectively.
|
|
// (The string index is counted starting from 0) The return value of
|
|
// Find() is the position at which the string was found or -1 if no
|
|
// strings could be found. If a case insensitive match needs to be
|
|
// performed, call the IgnoreCase() member function before calling
|
|
// Pattern(). This function will setup a character translation table
|
|
// which will convert all uppercase characters to lowercase. If some
|
|
// other translation is required, the TranslationTable() member
|
|
// function can be called to provide a custom table. This table needs
|
|
// to be 256 characters.
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 1999-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: StringMatch.cc,v 1.18 2004/05/28 13:15:21 lha Exp $
|
|
//
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "htconfig.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
#include "StringMatch.h"
|
|
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
|
|
#ifdef HAVE_STD
|
|
#include <fstream>
|
|
#ifdef HAVE_NAMESPACES
|
|
using namespace std;
|
|
#endif
|
|
#else
|
|
#include <fstream.h>
|
|
#endif /* HAVE_STD */
|
|
|
|
//
|
|
// Entries in the state table can either be normal or final.
|
|
// Final states have a match index encoded in them. This number
|
|
// is shifted left by INDEX_SHIFT bits.
|
|
//
|
|
#define MATCH_INDEX_MASK 0xffff0000
|
|
#define STATE_MASK 0x0000ffff
|
|
#define INDEX_SHIFT 16
|
|
|
|
//*****************************************************************************
|
|
// StringMatch::StringMatch()
|
|
//
|
|
StringMatch::StringMatch()
{
    //
    // A fresh matcher has no compiled pattern: every one of the 256
    // per-character rows of the state table starts out empty.
    //
    for (int row = 0; row < 256; ++row)
    {
        table[row] = 0;
    }

    //
    // No translation table is installed yet, so there is nothing this
    // object would have to release later.
    //
    trans = 0;
    local_alloc = 0;
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// StringMatch::~StringMatch()
|
|
//
|
|
StringMatch::~StringMatch()
{
    //
    // The translation table is released only when this object allocated
    // it itself (local_alloc set); a table handed in through
    // TranslationTable() is left alone.
    //
    if (local_alloc)
    {
        delete [] trans;
    }

    //
    // Release every row of the state table. Rows that were never
    // allocated are still 0, which delete [] accepts.
    //
    for (int row = 0; row < 256; ++row)
    {
        delete [] table[row];
    }
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void StringMatch::Pattern(char *pattern, char sep)
|
|
// Compile the given pattern into a state transition table
|
|
//
|
|
void
|
|
StringMatch::Pattern(char *pattern, char sep)
|
|
{
|
|
if (!pattern || !*pattern)
|
|
{
|
|
//
|
|
// No pattern to compile...
|
|
//
|
|
return;
|
|
}
|
|
|
|
//
|
|
// Allocate enough space in the state table to hold the worst case
|
|
// patterns...
|
|
//
|
|
int n = strlen(pattern);
|
|
|
|
// ...but since the state table does not need an extra state
|
|
// for each string in the pattern, we can subtract the number
|
|
// of separators. Wins for small but numerous strings in
|
|
// the pattern.
|
|
char *tmpstr;
|
|
for (tmpstr = pattern;
|
|
(tmpstr = strchr(tmpstr, sep)) != NULL;
|
|
tmpstr++) // Pass the separator.
|
|
n--;
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < 256; i++)
|
|
{
|
|
table[i] = new int[n];
|
|
memset((unsigned char *) table[i], 0, n * sizeof(int));
|
|
}
|
|
for (i = 0; i < n; i++)
|
|
table[0][i] = i; // "no-op" states for null char, to be ignored
|
|
|
|
//
|
|
// Set up a standard case translation table if needed.
|
|
//
|
|
if (!trans)
|
|
{
|
|
trans = new unsigned char[256];
|
|
for (i = 0; i < 256; i++)
|
|
{
|
|
trans[i] = (unsigned char)i;
|
|
}
|
|
local_alloc = 1;
|
|
}
|
|
|
|
//
|
|
// Go though each of the patterns and build entries in the table.
|
|
//
|
|
int state = 0;
|
|
int totalStates = 0;
|
|
unsigned char previous = 0;
|
|
int previousState = 0;
|
|
int previousValue = 0;
|
|
int index = 1;
|
|
unsigned char chr;
|
|
|
|
while ((unsigned char)*pattern)
|
|
{
|
|
#if 0
|
|
if (totalStates > n)
|
|
{
|
|
cerr << "Fatal! Miscalculation of number of states"
|
|
<< endl;
|
|
exit (2);
|
|
}
|
|
#endif
|
|
|
|
chr = trans[(unsigned char)*pattern];
|
|
if (chr == 0)
|
|
{
|
|
pattern++;
|
|
continue;
|
|
}
|
|
if (chr == sep)
|
|
{
|
|
//
|
|
// Next pattern
|
|
//
|
|
table[previous][previousState] =
|
|
previousValue | (index << INDEX_SHIFT);
|
|
index++;
|
|
state = 0;
|
|
// totalStates--;
|
|
}
|
|
else
|
|
{
|
|
previousValue = table[chr][state];
|
|
previousState = state;
|
|
if (previousValue)
|
|
{
|
|
if (previousValue & MATCH_INDEX_MASK)
|
|
{
|
|
if (previousValue & STATE_MASK)
|
|
{
|
|
state = previousValue & STATE_MASK;
|
|
}
|
|
else
|
|
{
|
|
table[chr][state] |= ++totalStates;
|
|
state = totalStates;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
state = previousValue & STATE_MASK;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
table[chr][state] = ++totalStates;
|
|
state = totalStates;
|
|
}
|
|
}
|
|
previous = chr;
|
|
pattern++;
|
|
}
|
|
table[previous][previousState] =
|
|
previousValue | (index << INDEX_SHIFT);
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::FindFirst(const char *string, int &which, int &length)
|
|
// Attempt to find the first occurrence of the previously compiled patterns.
|
|
//
|
|
int StringMatch::FindFirst(const char *string, int &which, int &length)
|
|
{
|
|
which = -1;
|
|
length = -1;
|
|
|
|
if (!table[0])
|
|
return 0;
|
|
|
|
int state = 0, new_state = 0;
|
|
int pos = 0;
|
|
int start_pos = 0;
|
|
|
|
while ((unsigned char)string[pos])
|
|
{
|
|
new_state = table[trans[(unsigned char)string[pos] & 0xff]][state];
|
|
if (new_state)
|
|
{
|
|
if (state == 0)
|
|
{
|
|
//
|
|
// Keep track of where we started comparing so that we can
|
|
// come back to this point later if we didn't match anything
|
|
//
|
|
start_pos = pos;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// We came back to 0 state. This means we didn't match anything.
|
|
//
|
|
if (state)
|
|
{
|
|
// But we may already have a match, and are just being greedy.
|
|
if (which != -1)
|
|
return start_pos;
|
|
|
|
pos = start_pos + 1;
|
|
}
|
|
else
|
|
pos++;
|
|
state = 0;
|
|
continue;
|
|
}
|
|
state = new_state;
|
|
if (state & MATCH_INDEX_MASK)
|
|
{
|
|
//
|
|
// Matched one of the patterns.
|
|
// Determine which and return.
|
|
//
|
|
which = ((unsigned int) (state & MATCH_INDEX_MASK)
|
|
>> INDEX_SHIFT) - 1;
|
|
length = pos - start_pos + 1;
|
|
state &= STATE_MASK;
|
|
|
|
// Continue to find the longest, if there is one.
|
|
if (state == 0)
|
|
return start_pos;
|
|
}
|
|
pos++;
|
|
}
|
|
|
|
// Maybe we were too greedy.
|
|
if (which != -1)
|
|
return start_pos;
|
|
|
|
return -1;
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::Compare(const char *string, int &which, int &length)
|
|
//
|
|
int StringMatch::Compare(const char *string, int &which, int &length)
|
|
{
|
|
which = -1;
|
|
length = -1;
|
|
|
|
if (!table[0])
|
|
return 0;
|
|
|
|
int state = 0, new_state = 0;
|
|
int pos = 0;
|
|
int start_pos = 0;
|
|
|
|
//
|
|
// Skip to at least the start of a word.
|
|
//
|
|
while ((unsigned char)string[pos])
|
|
{
|
|
new_state = table[trans[string[pos]]][state];
|
|
if (new_state)
|
|
{
|
|
if (state == 0)
|
|
{
|
|
start_pos = pos;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// We may already have a match, and are just being greedy.
|
|
if (which != -1)
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
state = new_state;
|
|
if (state & MATCH_INDEX_MASK)
|
|
{
|
|
//
|
|
// Matched one of the patterns.
|
|
//
|
|
which = ((unsigned int) (state & MATCH_INDEX_MASK)
|
|
>> INDEX_SHIFT) - 1;
|
|
length = pos - start_pos + 1;
|
|
|
|
// Continue to find the longest, if there is one.
|
|
state &= STATE_MASK;
|
|
if (state == 0)
|
|
return 1;
|
|
}
|
|
pos++;
|
|
}
|
|
|
|
// Maybe we were too greedy.
|
|
if (which != -1)
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::FindFirstWord(char *string)
|
|
//
|
|
int StringMatch::FindFirstWord(const char *string)
|
|
{
|
|
int dummy;
|
|
return FindFirstWord(string, dummy, dummy);
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::CompareWord(const char *string)
|
|
//
|
|
int StringMatch::CompareWord(const char *string)
|
|
{
|
|
int dummy;
|
|
return CompareWord(string, dummy, dummy);
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::FindFirstWord(char *string, int &which, int &length)
|
|
// Attempt to find the first occurrence of the previously compiled patterns.
|
|
//
|
|
int StringMatch::FindFirstWord(const char *string, int &which, int &length)
|
|
{
|
|
which = -1;
|
|
length = -1;
|
|
|
|
int state = 0, new_state = 0;
|
|
int pos = 0;
|
|
int start_pos = 0;
|
|
int is_word = 1;
|
|
|
|
//
|
|
// Skip to at least the start of a word.
|
|
//
|
|
while ((unsigned char)string[pos])
|
|
{
|
|
new_state = table[trans[(unsigned char)string[pos]]][state];
|
|
if (new_state)
|
|
{
|
|
if (state == 0)
|
|
{
|
|
start_pos = pos;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// We came back to 0 state. This means we didn't match anything.
|
|
//
|
|
if (state)
|
|
{
|
|
pos = start_pos + 1;
|
|
}
|
|
else
|
|
pos++;
|
|
state = 0;
|
|
continue;
|
|
}
|
|
state = new_state;
|
|
|
|
if (state & MATCH_INDEX_MASK)
|
|
{
|
|
//
|
|
// Matched one of the patterns.
|
|
//
|
|
is_word = 1;
|
|
if (start_pos != 0)
|
|
{
|
|
if (HtIsStrictWordChar((unsigned char)string[start_pos - 1]))
|
|
is_word = 0;
|
|
}
|
|
if (HtIsStrictWordChar((unsigned char)string[pos + 1]))
|
|
is_word = 0;
|
|
if (is_word)
|
|
{
|
|
//
|
|
// Determine which and return.
|
|
//
|
|
which = ((unsigned int) (state & MATCH_INDEX_MASK)
|
|
>> INDEX_SHIFT) - 1;
|
|
length = pos - start_pos + 1;
|
|
return start_pos;
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Not at the end of word. Continue searching.
|
|
//
|
|
if (state & STATE_MASK)
|
|
{
|
|
state &= STATE_MASK;
|
|
}
|
|
else
|
|
{
|
|
pos = start_pos + 1;
|
|
state = 0;
|
|
}
|
|
}
|
|
}
|
|
pos++;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::CompareWord(const char *string, int &which, int &length)
|
|
//
|
|
int StringMatch::CompareWord(const char *string, int &which, int &length)
|
|
{
|
|
which = -1;
|
|
length = -1;
|
|
|
|
if (!table[0])
|
|
return 0;
|
|
|
|
int state = 0;
|
|
int position = 0;
|
|
|
|
//
|
|
// Skip to at least the start of a word.
|
|
//
|
|
while ((unsigned char)string[position])
|
|
{
|
|
state = table[trans[(unsigned char)string[position]]][state];
|
|
if (state == 0)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
if (state & MATCH_INDEX_MASK)
|
|
{
|
|
//
|
|
// Matched one of the patterns. See if it is a word.
|
|
//
|
|
int isWord = 1;
|
|
|
|
if ((unsigned char)string[position + 1])
|
|
{
|
|
if (HtIsStrictWordChar((unsigned char)string[position + 1]))
|
|
isWord = 0;
|
|
}
|
|
|
|
if (isWord)
|
|
{
|
|
which = ((unsigned int) (state & MATCH_INDEX_MASK)
|
|
>> INDEX_SHIFT) - 1;
|
|
length = position + 1;
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Not at the end of a word. Continue searching.
|
|
//
|
|
if ((state & STATE_MASK) != 0)
|
|
{
|
|
state &= STATE_MASK;
|
|
}
|
|
else
|
|
{
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
position++;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void StringMatch::TranslationTable(char *table)
|
|
//
|
|
void StringMatch::TranslationTable(char *table)
|
|
{
|
|
if (local_alloc)
|
|
delete [] trans;
|
|
trans = (unsigned char *) table;
|
|
local_alloc = 0;
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void StringMatch::IgnoreCase()
|
|
// Set up the case translation table to convert uppercase to lowercase
|
|
//
|
|
void StringMatch::IgnoreCase()
|
|
{
|
|
if (!local_alloc || !trans)
|
|
{
|
|
trans = new unsigned char[256];
|
|
for (int i = 0; i < 256; i++)
|
|
trans[i] = (unsigned char)i;
|
|
local_alloc = 1;
|
|
}
|
|
for (int i = 0; i < 256; i++)
|
|
if (isupper((unsigned char)i))
|
|
trans[i] = tolower((unsigned char)i);
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// void StringMatch::IgnorePunct(char *punct)
|
|
// Set up the character translation table to ignore punctuation
|
|
//
|
|
void StringMatch::IgnorePunct(char *punct)
|
|
{
|
|
if (!local_alloc || !trans)
|
|
{
|
|
trans = new unsigned char[256];
|
|
for (int i = 0; i < 256; i++)
|
|
trans[i] = (unsigned char)i;
|
|
local_alloc = 1;
|
|
}
|
|
if (punct)
|
|
for (int i = 0; punct[i]; i++)
|
|
trans[(unsigned char)punct[i]] = 0;
|
|
else
|
|
for (int i = 0; i < 256; i++)
|
|
if (HtIsWordChar(i) && !HtIsStrictWordChar(i))
|
|
trans[i] = 0;
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::FindFirst(const char *source)
|
|
//
|
|
int StringMatch::FindFirst(const char *source)
|
|
{
|
|
int dummy;
|
|
return FindFirst(source, dummy, dummy);
|
|
}
|
|
|
|
|
|
//*****************************************************************************
|
|
// int StringMatch::Compare(const char *source)
|
|
//
|
|
int StringMatch::Compare(const char *source)
|
|
{
|
|
int dummy;
|
|
return Compare(source, dummy, dummy);
|
|
}
|