You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
573 lines
13 KiB
573 lines
13 KiB
15 years ago
|
/***************************************************************************
|
||
|
* Copyright (C) 2004-2007 by Georgy Yunaev, gyunaev@ulduzsoft.com *
|
||
|
* Portions Copyright (C) 2000-2005 Trolltech AS. *
|
||
|
* Please do not use email address above for bug reports; see *
|
||
|
* the README file *
|
||
|
* *
|
||
|
* This program is free software; you can redistribute it and/or modify *
|
||
|
* it under the terms of the GNU General Public License as published by *
|
||
|
* the Free Software Foundation; either version 2 of the License, or *
|
||
|
* (at your option) any later version. *
|
||
|
* *
|
||
|
* This program is distributed in the hope that it will be useful, *
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||
|
* GNU General Public License for more details. *
|
||
|
* *
|
||
|
* You should have received a copy of the GNU General Public License *
|
||
|
* along with this program; if not, write to the *
|
||
|
* Free Software Foundation, Inc., *
|
||
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
|
||
|
***************************************************************************/
|
||
|
|
||
|
|
||
|
#include <qfile.h>
|
||
|
#include <qdir.h>
|
||
|
#include <qstringlist.h>
|
||
|
#include <qdict.h>
|
||
|
#include <qapplication.h>
|
||
|
|
||
|
#include <ctype.h>
|
||
|
|
||
|
#include "kchmsearchengine_impl.h"
|
||
|
#include "kchmmainwindow.h"
|
||
|
#include "libchmfileimpl.h"
|
||
|
|
||
|
|
||
|
namespace QtAs {
|
||
|
|
||
|
// Those characters are splitters (i.e. split the word), but added themselves into dictionary too.
|
||
|
// This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both
|
||
|
// search for "print" and "->print" will find it.
|
||
|
static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`";
|
||
|
|
||
|
// Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug.
|
||
|
static const char WORD_CHARACTERS[] = "$_";
|
||
|
|
||
|
|
||
|
int TermList::compareItems( QPtrCollection::Item i1, QPtrCollection::Item i2 )
|
||
|
{
|
||
|
if( ( (Term*)i1 )->frequency == ( (Term*)i2 )->frequency )
|
||
|
return 0;
|
||
|
if( ( (Term*)i1 )->frequency < ( (Term*)i2 )->frequency )
|
||
|
return -1;
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
QDataStream &operator>>( QDataStream &s, Document &l )
|
||
|
{
|
||
|
s >> l.docNumber;
|
||
|
s >> l.frequency;
|
||
|
return s;
|
||
|
}
|
||
|
|
||
|
QDataStream &operator<<( QDataStream &s, const Document &l )
|
||
|
{
|
||
|
s << (Q_INT16)l.docNumber;
|
||
|
s << (Q_INT16)l.frequency;
|
||
|
return s;
|
||
|
}
|
||
|
|
||
|
Index::Index( const QString &dp, const QString & )
|
||
|
: QObject( 0, 0 ), dict( 8999 ), docPath( dp )
|
||
|
{
|
||
|
lastWindowClosed = false;
|
||
|
connect( qApp, SIGNAL( lastWindowClosed() ),
|
||
|
this, SLOT( setLastWinClosed() ) );
|
||
|
}
|
||
|
|
||
|
Index::Index( const QStringList &dl, const QString & )
|
||
|
: QObject( 0, 0 ), dict( 20011 )
|
||
|
{
|
||
|
docList = dl;
|
||
|
lastWindowClosed = false;
|
||
|
connect( qApp, SIGNAL( lastWindowClosed() ),
|
||
|
this, SLOT( setLastWinClosed() ) );
|
||
|
}
|
||
|
|
||
|
void Index::setLastWinClosed()
|
||
|
{
|
||
|
lastWindowClosed = true;
|
||
|
}
|
||
|
|
||
|
void Index::setDictionaryFile( const QString &f )
|
||
|
{
|
||
|
dictFile = f;
|
||
|
}
|
||
|
|
||
|
void Index::setDocListFile( const QString &f )
|
||
|
{
|
||
|
docListFile = f;
|
||
|
}
|
||
|
|
||
|
void Index::setDocList( const QStringList &lst )
|
||
|
{
|
||
|
docList = lst;
|
||
|
}
|
||
|
|
||
|
bool Index::makeIndex()
|
||
|
{
|
||
|
if ( docList.isEmpty() )
|
||
|
return false;
|
||
|
|
||
|
QStringList::Iterator it = docList.begin();
|
||
|
int steps = docList.count() / 100;
|
||
|
|
||
|
if ( !steps )
|
||
|
steps++;
|
||
|
|
||
|
int prog = 0;
|
||
|
|
||
|
for ( int i = 0; it != docList.end(); ++it, ++i )
|
||
|
{
|
||
|
if ( lastWindowClosed )
|
||
|
return false;
|
||
|
|
||
|
parseDocument( *it, i );
|
||
|
|
||
|
if ( i%steps == 0 )
|
||
|
{
|
||
|
prog++;
|
||
|
emit indexingProgress( prog );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
|
||
|
void Index::insertInDict( const QString &str, int docNum )
|
||
|
{
|
||
|
Entry *e = 0;
|
||
|
if ( dict.count() )
|
||
|
e = dict[ str ];
|
||
|
|
||
|
if ( e )
|
||
|
{
|
||
|
if ( e->documents.first().docNumber != docNum )
|
||
|
e->documents.prepend( Document( docNum, 1 ) );
|
||
|
else
|
||
|
e->documents.first().frequency++;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
dict.insert( str, new Entry( docNum ) );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
bool Index::parseDocumentToStringlist( const QString & filename, QStringList & tokenlist )
|
||
|
{
|
||
|
QString parsedbuf, parseentity;
|
||
|
QString text;
|
||
|
|
||
|
if ( !::mainWindow->chmFile()->getFileContentAsString( &text, filename ) )
|
||
|
{
|
||
|
qWarning( "Index::parseDocument: Could not retrieve the content at %s", filename.ascii() );
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
if ( text.isNull() )
|
||
|
{
|
||
|
qWarning( "Index::parseDocument: Retrieved content for %s is empty", filename.ascii() );
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
m_charssplit = SPLIT_CHARACTERS;
|
||
|
m_charsword = WORD_CHARACTERS;
|
||
|
|
||
|
tokenlist.clear();
|
||
|
|
||
|
// State machine states
|
||
|
enum state_t
|
||
|
{
|
||
|
STATE_OUTSIDE_TAGS, // outside HTML tags; parse text
|
||
|
STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag
|
||
|
STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar)
|
||
|
STATE_IN_HTML_ENTITY, // inside HTML entity; parse the entity
|
||
|
};
|
||
|
|
||
|
state_t state = STATE_OUTSIDE_TAGS;
|
||
|
QChar QuoteChar; // used in STATE_IN_QUOTES
|
||
|
|
||
|
for ( unsigned int j = 0; j < text.length(); j++ )
|
||
|
{
|
||
|
QChar ch = text[j];
|
||
|
|
||
|
if ( (j % 20000) == 0 )
|
||
|
qApp->processEvents( QEventLoop::ExcludeUserInput );
|
||
|
|
||
|
if ( state == STATE_IN_HTML_TAG )
|
||
|
{
|
||
|
// We are inside HTML tag.
|
||
|
// Ignore everything until we see '>' (end of HTML tag) or quote char (quote start)
|
||
|
if ( ch == '"' || ch == '\'' )
|
||
|
{
|
||
|
state = STATE_IN_QUOTES;
|
||
|
QuoteChar = ch;
|
||
|
}
|
||
|
else if ( ch == '>' )
|
||
|
state = STATE_OUTSIDE_TAGS;
|
||
|
|
||
|
continue;
|
||
|
}
|
||
|
else if ( state == STATE_IN_QUOTES )
|
||
|
{
|
||
|
// We are inside quoted text inside HTML tag.
|
||
|
// Ignore everything until we see the quote character again
|
||
|
if ( ch == QuoteChar )
|
||
|
state = STATE_IN_HTML_TAG;
|
||
|
|
||
|
continue;
|
||
|
}
|
||
|
else if ( state == STATE_IN_HTML_ENTITY )
|
||
|
{
|
||
|
// We are inside encoded HTML entity (like ).
|
||
|
// Collect to parsedbuf everything until we see ;
|
||
|
if ( ch.isLetterOrNumber() )
|
||
|
{
|
||
|
// get next character of this entity
|
||
|
parseentity.append( ch );
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// The entity ended
|
||
|
state = STATE_OUTSIDE_TAGS;
|
||
|
|
||
|
// Some shitty HTML does not terminate entities correctly. Screw it.
|
||
|
if ( ch != ';' && ch != '<' )
|
||
|
{
|
||
|
if ( parseentity.isEmpty() )
|
||
|
{
|
||
|
// straight '&' symbol. Add and continue.
|
||
|
parsedbuf += "&";
|
||
|
}
|
||
|
else
|
||
|
qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", parseentity.ascii(), ch.latin1() );
|
||
|
|
||
|
j--; // parse this character again, but in different state
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// Don't we have a space?
|
||
|
if ( parseentity.lower() != "nbsp" )
|
||
|
{
|
||
|
QString entity = ::mainWindow->chmFile()->impl()->decodeEntity( parseentity );
|
||
|
|
||
|
if ( entity.isNull() )
|
||
|
{
|
||
|
// decodeEntity() already printed error message
|
||
|
//qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() );
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
parsedbuf += entity;
|
||
|
continue;
|
||
|
}
|
||
|
else
|
||
|
ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Now process STATE_OUTSIDE_TAGS
|
||
|
//
|
||
|
|
||
|
// Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is
|
||
|
if ( ch == '<' )
|
||
|
{
|
||
|
state = STATE_IN_HTML_TAG;
|
||
|
goto tokenize_buf;
|
||
|
}
|
||
|
|
||
|
// Check for start of HTML entity
|
||
|
if ( ch == '&' )
|
||
|
{
|
||
|
state = STATE_IN_HTML_ENTITY;
|
||
|
parseentity = QString::null;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// Replace quote by ' - quotes are used in search window to set the phrase
|
||
|
if ( ch == '"' )
|
||
|
ch = '\'';
|
||
|
|
||
|
// Ok, we have a valid character outside HTML tags, and probably some in buffer already.
|
||
|
// If it is char or letter, add it and continue
|
||
|
if ( ch.isLetterOrNumber() || m_charsword.find( ch ) != -1 )
|
||
|
{
|
||
|
parsedbuf.append( ch );
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// If it is a split char, add the word to the dictionary, and then add the char itself.
|
||
|
if ( m_charssplit.find( ch ) != -1 )
|
||
|
{
|
||
|
if ( !parsedbuf.isEmpty() )
|
||
|
tokenlist.push_back( parsedbuf.lower() );
|
||
|
|
||
|
tokenlist.push_back( ch.lower() );
|
||
|
parsedbuf = QString::null;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
tokenize_buf:
|
||
|
// Just add the word; it is most likely a space or terminated by tokenizer.
|
||
|
if ( !parsedbuf.isEmpty() )
|
||
|
{
|
||
|
tokenlist.push_back( parsedbuf.lower() );
|
||
|
parsedbuf = QString::null;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Add the last word if still here - for broken htmls.
|
||
|
if ( !parsedbuf.isEmpty() )
|
||
|
tokenlist.push_back( parsedbuf.lower() );
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
|
||
|
void Index::parseDocument( const QString &filename, int docNum )
|
||
|
{
|
||
|
QStringList terms;
|
||
|
|
||
|
if ( !parseDocumentToStringlist( filename, terms ) )
|
||
|
return;
|
||
|
|
||
|
for ( QStringList::Iterator it = terms.begin(); it != terms.end(); ++it )
|
||
|
insertInDict( *it, docNum );
|
||
|
}
|
||
|
|
||
|
|
||
|
void Index::writeDict()
|
||
|
{
|
||
|
QDictIterator<Entry> it( dict );
|
||
|
QFile f( dictFile );
|
||
|
|
||
|
if ( !f.open( IO_WriteOnly ) )
|
||
|
{
|
||
|
qWarning( "Index::writeDict: could not write dictionary file %s", dictFile.ascii() );
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
QDataStream s( &f );
|
||
|
s << (int) 1; // version
|
||
|
s << m_charssplit;
|
||
|
s << m_charsword;
|
||
|
|
||
|
for( ; it.current(); ++it )
|
||
|
{
|
||
|
Entry *e = it.current();
|
||
|
s << it.currentKey();
|
||
|
s << e->documents;
|
||
|
}
|
||
|
|
||
|
f.close();
|
||
|
writeDocumentList();
|
||
|
}
|
||
|
|
||
|
void Index::writeDocumentList()
|
||
|
{
|
||
|
QFile f( docListFile );
|
||
|
if ( !f.open( IO_WriteOnly ) )
|
||
|
{
|
||
|
qWarning( "Index::writeDocumentList: could not write dictionary file %s", docListFile.ascii() );
|
||
|
return;
|
||
|
}
|
||
|
QDataStream s( &f );
|
||
|
s << docList;
|
||
|
}
|
||
|
|
||
|
bool Index::readDict()
|
||
|
{
|
||
|
QFile f( dictFile );
|
||
|
if ( !f.open( IO_ReadOnly ) )
|
||
|
return false;
|
||
|
|
||
|
dict.clear();
|
||
|
QDataStream s( &f );
|
||
|
QString key;
|
||
|
int version;
|
||
|
QValueList<Document> docs;
|
||
|
|
||
|
s >> version;
|
||
|
s >> m_charssplit;
|
||
|
s >> m_charsword;
|
||
|
|
||
|
while ( !s.atEnd() )
|
||
|
{
|
||
|
s >> key;
|
||
|
s >> docs;
|
||
|
dict.insert( key, new Entry( docs ) );
|
||
|
}
|
||
|
|
||
|
f.close();
|
||
|
return dict.size() > 0 && readDocumentList();
|
||
|
}
|
||
|
|
||
|
bool Index::readDocumentList()
|
||
|
{
|
||
|
QFile f( docListFile );
|
||
|
if ( !f.open( IO_ReadOnly ) )
|
||
|
return false;
|
||
|
QDataStream s( &f );
|
||
|
s >> docList;
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
|
||
|
{
|
||
|
TermList termList;
|
||
|
|
||
|
QStringList::ConstIterator it = terms.begin();
|
||
|
for ( it = terms.begin(); it != terms.end(); ++it )
|
||
|
{
|
||
|
Entry *e = 0;
|
||
|
|
||
|
if ( dict[ *it ] )
|
||
|
{
|
||
|
e = dict[ *it ];
|
||
|
termList.append( new Term( *it, e->documents.count(), e->documents ) );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
return QStringList();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
termList.sort();
|
||
|
|
||
|
Term *minTerm = termList.first();
|
||
|
|
||
|
if ( !termList.count() )
|
||
|
return QStringList();
|
||
|
|
||
|
termList.removeFirst();
|
||
|
|
||
|
QValueList<Document> minDocs = minTerm->documents;
|
||
|
QValueList<Document>::iterator C;
|
||
|
QValueList<Document>::ConstIterator It;
|
||
|
Term *t = termList.first();
|
||
|
|
||
|
for ( ; t; t = termList.next() )
|
||
|
{
|
||
|
QValueList<Document> docs = t->documents;
|
||
|
C = minDocs.begin();
|
||
|
|
||
|
while ( C != minDocs.end() )
|
||
|
{
|
||
|
bool found = false;
|
||
|
|
||
|
for ( It = docs.begin(); It != docs.end(); ++It )
|
||
|
{
|
||
|
if ( (*C).docNumber == (*It).docNumber )
|
||
|
{
|
||
|
(*C).frequency += (*It).frequency;
|
||
|
found = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ( !found )
|
||
|
C = minDocs.remove( C );
|
||
|
else
|
||
|
++C;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
QStringList results;
|
||
|
qHeapSort( minDocs );
|
||
|
|
||
|
if ( termSeq.isEmpty() )
|
||
|
{
|
||
|
for ( C = minDocs.begin(); C != minDocs.end(); ++C )
|
||
|
results << docList[ (int)(*C).docNumber ];
|
||
|
|
||
|
return results;
|
||
|
}
|
||
|
|
||
|
QString fileName;
|
||
|
|
||
|
for ( C = minDocs.begin(); C != minDocs.end(); ++C )
|
||
|
{
|
||
|
fileName = docList[ (int)(*C).docNumber ];
|
||
|
|
||
|
if ( searchForPhrases( termSeq, seqWords, fileName ) )
|
||
|
results << fileName;
|
||
|
}
|
||
|
|
||
|
return results;
|
||
|
}
|
||
|
|
||
|
|
||
|
bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QString &filename )
|
||
|
{
|
||
|
QStringList parsed_document;
|
||
|
|
||
|
if ( !parseDocumentToStringlist( filename, parsed_document ) )
|
||
|
return false;
|
||
|
|
||
|
miniDict.clear();
|
||
|
|
||
|
// Initialize the dictionary with the words in phrase(s)
|
||
|
for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt )
|
||
|
miniDict.insert( *cIt, new PosEntry( 0 ) );
|
||
|
|
||
|
// Fill the dictionary with the words from the document
|
||
|
unsigned int word_offset = 3;
|
||
|
for ( QStringList::ConstIterator it = parsed_document.begin(); it != parsed_document.end(); it++, word_offset++ )
|
||
|
{
|
||
|
PosEntry * entry = miniDict[ *it ];
|
||
|
|
||
|
if ( entry )
|
||
|
entry->positions.append( word_offset );
|
||
|
}
|
||
|
|
||
|
// Dump it
|
||
|
/*
|
||
|
QDictIterator<PosEntry> it( miniDict );
|
||
|
for( ; it.current(); ++it )
|
||
|
{
|
||
|
QString text( it.currentKey() );
|
||
|
QValueList<uint> pos = miniDict[text]->positions;
|
||
|
for ( unsigned int i = 1; i < pos.size(); i++ )
|
||
|
text += " " + QString::number( pos[i] );
|
||
|
|
||
|
qDebug( "%s", text.ascii());
|
||
|
}
|
||
|
*/
|
||
|
|
||
|
QValueList<uint> first_word_positions;
|
||
|
|
||
|
for ( QStringList::ConstIterator phrase_it = phrases.begin(); phrase_it != phrases.end(); phrase_it++ )
|
||
|
{
|
||
|
QStringList phrasewords = QStringList::split( ' ', *phrase_it );
|
||
|
first_word_positions = miniDict[ phrasewords[0] ]->positions;
|
||
|
|
||
|
for ( unsigned int j = 1; j < phrasewords.count(); ++j )
|
||
|
{
|
||
|
QValueList<uint> next_word_it = miniDict[ phrasewords[j] ]->positions;
|
||
|
QValueList<uint>::iterator dict_it = first_word_positions.begin();
|
||
|
|
||
|
while ( dict_it != first_word_positions.end() )
|
||
|
{
|
||
|
if ( next_word_it.find( *dict_it + 1 ) != next_word_it.end() )
|
||
|
{
|
||
|
(*dict_it)++;
|
||
|
++dict_it;
|
||
|
}
|
||
|
else
|
||
|
dict_it = first_word_positions.remove( dict_it );
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ( first_word_positions.count() )
|
||
|
return true;
|
||
|
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
|
||
|
};
|