You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
426 lines
13 KiB
426 lines
13 KiB
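//
// C++ Implementation: algorithms
//
// Description: Search algorithms for the translation database (exact,
//              alpha, sentence archive, chunk-by-chunk, fuzzy chunk and
//              correlation) plus the generic chain that runs several of
//              them in sequence.
//
//
// Author: Andrea Rizzi <rizzi@kde.org>, (C) 2003
//
// Copyright: See COPYING file that comes with this distribution
//
//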
#include "algorithms.h"
|
|
#include <tqstringlist.h>
|
|
#include <kdebug.h>
|
|
|
|
//FIXME: remove
// Local stand-in for i18n(): hands its argument back as a const char*
// without translating it. Function-like, so only call sites of the form
// i18n(...) are rewritten, and fully parenthesised so the cast applies to
// the whole argument.
#define i18n(text) ((const char*)(text))
|
|
|
|
/**
 * Fetches the entry di->get(query,0) returns for @p query and reports each
 * of its translations.
 *
 * Every translation is emitted through newResult() and appended to the
 * returned list, paired with the entry's original string and scored with
 * settings->scoreExact.
 *
 * @param query the original string to look up
 * @return one QueryResult per translation of the fetched entry
 */
DataBaseInterface::ResultList ExactSearchAlgorithm::exec(const TQString& query )
{
    DataBaseInterface::ResultList res;
    DataBaseInterface::MainEntry e=di->get(query,0);

    TQStringList trs=e.second.getTranslations();

    for(TQStringList::iterator it=trs.begin();it!=trs.end();++it)
    {
        // Build the result once so the emitted result and the one stored in
        // the returned list carry the same original string and score.
        QueryResult r(*it,e.first.getString(),settings->scoreExact);
        emit newResult(r);
        res.push_back(r);
    }
    kdDebug(0) <<"Exact algo found " << res.count() << "entries" << endl;
    return res;
}
|
|
|
|
|
|
/**
 * Runs the chained algorithms in insertion order and concatenates their
 * results.
 *
 * While an algorithm is running, its newResult() signal is forwarded to this
 * object's newResult() signal. The chain stops as soon as the accumulated
 * number of results reaches maxResults.
 *
 * @param query the original string to search for
 * @return the concatenated results of every algorithm that was run
 */
DataBaseInterface::ResultList GenericSearchAlgorithm::exec(const TQString& query )
{
    typedef TQValueList<AbstractSearchAlgorithm *>::iterator ChainIterator;

    DataBaseInterface::ResultList collected;
    uint found=0;

    ChainIterator step=algoChain.begin();
    while(step!=algoChain.end() && found < maxResults)
    {
        AbstractSearchAlgorithm *current=*step;

        // Forward the algorithm's results only for the duration of its run.
        connect(current,TQT_SIGNAL(newResult(QueryResult)),this,TQT_SIGNAL(newResult(QueryResult)));
        kdDebug(0) << "Algo pointer" << current << endl;

        collected+=current->exec(query);
        found=collected.count();
        kdDebug(0) << "Count = " << found << endl;

        disconnect(current,TQT_SIGNAL(newResult(QueryResult)),this,TQT_SIGNAL(newResult(QueryResult)));
        ++step;
    }
    return collected;
}
|
|
|
|
/**
 * Adds @p algo at the end of the chain that exec() walks.
 *
 * @param algo the algorithm to append; only the pointer is stored
 */
void GenericSearchAlgorithm::addAlgorithm( AbstractSearchAlgorithm * algo )
{
    algoChain.push_back(algo);
}
|
|
|
|
/**
 * Reports the translations of every entry listed by di->getAlpha(query).
 *
 * Each translation is passed through di->simple() and di->format(), scored
 * with settings->scoreAlpha, emitted through newResult() and appended to the
 * returned list. Both loops stop early once di->stopNow() becomes true.
 *
 * @param query the original string to search for
 * @return the results gathered before the search finished or was stopped
 */
DataBaseInterface::ResultList AlphaSearchAlgorithm::exec( const TQString & query )
{
    DataBaseInterface::ResultList matches;
    DBItemMultiIndex::IndexList indexes=di->getAlpha(query);

    DBItemMultiIndex::IndexList::iterator idx=indexes.begin();
    while(idx!=indexes.end() && !di->stopNow())
    {
        DataBaseInterface::MainEntry entry=di->getFromIndex(*idx);
        TQStringList translations=entry.second.getTranslations();

        TQStringList::iterator tr=translations.begin();
        while(tr!=translations.end() && !di->stopNow())
        {
            QueryResult hit(di->format(di->simple(*tr,true),query),entry.first.getString(),settings->scoreAlpha);
            emit newResult(hit);
            matches.push_back(hit);
            ++tr;
        }
        ++idx;
    }

    kdDebug(0) <<"Alpha algo found " << matches.count() << "entries" << endl;
    return matches;
}
|
|
|
|
/**
 * Reports the translations of the entry di->getSentence(query) returns.
 *
 * Each translation is passed through di->simple() and di->format(), scored
 * with settings->scoreSentence, emitted through newResult() and appended to
 * the returned list.
 *
 * @param query the original string to search for
 * @return one QueryResult per translation of the fetched entry
 */
DataBaseInterface::ResultList SentenceArchiveSearchAlgorithm::exec( const TQString & query )
{
    DataBaseInterface::MainEntry entry = di->getSentence(query);
    TQStringList translations=entry.second.getTranslations();
    kdDebug(0) << "Count in sentence archive " << translations.count()<< endl;

    DataBaseInterface::ResultList matches;
    TQStringList::iterator tr=translations.begin();
    while(tr!=translations.end())
    {
        QueryResult hit(di->format(di->simple(*tr,true),query),entry.first.getString(),settings->scoreSentence);
        emit newResult(hit);
        matches.push_back(hit);
        ++tr;
    }

    kdDebug(0) <<"Sentence algo found " << matches.count() << "entries" << endl;
    return matches;
}
|
|
|
|
/**
 * Translates @p query chunk by chunk and glues the pieces back together.
 *
 * The chunk factory splits the query. For every chunk that has results,
 * each distinct result string is appended (followed by the query separator
 * at that chunk's position) to every partial translation built so far, and
 * the score of the chunk's first result is added to the final score. A chunk
 * without results subtracts 10 from the final score and adds no text.
 *
 * An empty list is returned when no factory is set, when the query yields at
 * most one chunk, or when the accumulated score is below 50.
 *
 * @param query the original string to translate
 * @return the combined translations, all carrying the accumulated score
 */
DataBaseInterface::ResultList ChunkByChunkSearchAlgorithm::exec( const TQString & query )
{
    ResultList res;

    // The constructor leaves factory at 0 until a chunk factory is set.
    if(!factory) return res;

    factory->setQuery(query);
    TQPtrList<AbstractChunk> chunks=factory->chunks();
    kdDebug(0) << "Number of chunks " << chunks.count() << endl;
    chunks.setAutoDelete(true); //I should delete the chunks myself
    TQStringList querySeparators=factory->separators();

    //This prevents recursive loop.
    if (chunks.count()<=1) return res;

    TQStringList translations,tmpTranslations;

    translations.push_back(""); //FIXME this is needed to start , but is not good
    int finalscore=0;
    int i=0; // position of the current chunk, used to pick its separator
    TQMap<TQString,bool> translationUsed;

    //Loop on all chunk
    for(AbstractChunk *it=chunks.first();it && !di->stopNow(); it=chunks.next())
    {
        kdDebug(0) << "Process next chunk" << endl;
        int chunkscore=0;
        TQValueList<QueryResult> r=it->translations();
        kdDebug(0) << "Number of results for this chunk " << r.count() << endl;

        if(r.count()<1)
        {
            // Nothing found for this chunk: penalise, add no text.
            chunkscore=-10;
        }
        else
        {
            //FIXME: check this, why 0? it is the best one?
            chunkscore=r[0].score();
            kdDebug(0) << "ChunkScore " << chunkscore << endl;
            tmpTranslations.clear();

            //Loop on results
            translationUsed.clear();
            for(ResultList::iterator it1=r.begin();it1!=r.end() &&!di->stopNow(); ++it1)
            {
                TQString chunkTranslation= (*it1).result();
                // Use each distinct translation of this chunk only once.
                if(!translationUsed.contains(chunkTranslation))
                {
                    translationUsed[chunkTranslation]=true;
                    kdDebug(0) << "a translation is: " << chunkTranslation << endl;
                    for(TQStringList::iterator it2=translations.begin();it2!=translations.end() && !di->stopNow() ; ++it2)
                    {
                        TQString prevTranslation=*it2;
                        tmpTranslations.push_back(prevTranslation+chunkTranslation+querySeparators[i]);
                        kdDebug(0) << "..appending it to " << prevTranslation << endl;
                    }
                }
            }

            translations=tmpTranslations;
        }

        i++;
        finalscore+=chunkscore;

        // kdDebug output has to be terminated, hence the endl.
        kdDebug(0) << "partial score " << finalscore << endl;
    }
    kdDebug(0) << "this is finishd" << endl;
    if(settings->scoreChunkByChunk==0)
        settings->scoreChunkByChunk=1;
    // FIXME:fix the score system
    // finalscore/=(i*100*100/settings->scoreChunkByChunk); //change 100 to 120(?) to lower this result (done)

    if (finalscore<50) return res;

    for(TQStringList::iterator it2=translations.begin();it2!=translations.end() && !di->stopNow() ; ++it2)
    {
        TQString theTranslation=*it2;
        QueryResult qr(di->format(theTranslation,query),i18n("CHUNK BY CHUNK"),finalscore);
        qr.setRichOriginal(i18n("<h3>Chunk by chunk</h3>CHANGE THIS TEXT!!!!This translation is"
            "obtained translating the sentences and using a"
            "fuzzy sentence translation database.<br>"
            " <b>Do not rely on it</b>. Translations may be fuzzy.<br>"));
        qr.setRichResult("<font color=#800000>"+theTranslation+"</font>") ;
        emit newResult(qr);
        res.push_back(qr);
    }

    return res;
}
|
|
|
|
/**
 * Creates the algorithm with no chunk factory attached (factory is 0).
 *
 * @param dbi  database interface handed to AbstractSearchAlgorithm
 * @param sets settings handed to AbstractSearchAlgorithm
 */
ChunkByChunkSearchAlgorithm::ChunkByChunkSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets),
      factory(0)
{
}
|
|
|
|
|
|
/**
 * Forwards both arguments to AbstractSearchAlgorithm; no further set-up.
 *
 * @param dbi  database interface handed to AbstractSearchAlgorithm
 * @param sets settings handed to AbstractSearchAlgorithm
 */
SentenceArchiveSearchAlgorithm::SentenceArchiveSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets)
{
}
|
|
|
|
/**
 * Creates the algorithm with no chunk factory attached.
 *
 * @param dbi  database interface handed to AbstractSearchAlgorithm
 * @param sets settings handed to AbstractSearchAlgorithm
 */
FuzzyChunkSearchAlgorithm::FuzzyChunkSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets ) : AbstractSearchAlgorithm(dbi,sets)
{
    // exec() dereferences factory, so start from a well-defined "not set"
    // value instead of an indeterminate pointer, as the chunk-by-chunk
    // algorithm's constructor does.
    factory = 0;
}
|
|
|
|
|
|
/**
 * Finds database entries that share chunks with @p query.
 *
 * Steps:
 *  1. Split the query into chunks and collect, per chunk string, the list of
 *     entry references returned by locationReferences().
 *  2. Repeatedly take the smallest reference at the head of any list, count
 *     in how many lists it is the head, and remove it from all lists. A
 *     reference matched by at least half of the found chunks is stored in
 *     il, grouped by its count (countpos[] marks the group boundaries).
 *  3. Walk the groups from the highest count down, score every translation
 *     of each entry and report those scoring above 40% of the best score
 *     seen so far.
 *
 * The search stops early once di->stopNow() becomes true.
 *
 * @param query the original string to search for
 * @return the results gathered before the search finished or was stopped
 */
DataBaseInterface::ResultList FuzzyChunkSearchAlgorithm::exec( const TQString & query )
{
    //FIXME: this code is shit too
    ResultList res;

    // Nothing can be chunked without a chunk factory.
    if(!factory) return res;

    factory->setQuery(query);
    TQPtrList<AbstractChunk> querychunks = factory->chunks();
    querychunks.setAutoDelete(true);

    typedef TQMap<TQString,TQValueList<unsigned int> > ResultMap;
    ResultMap rmap; //result of words index query
    unsigned int notfound=0,frequent=0,nchunks = querychunks.count();

    //Get index list for each word
    for(AbstractChunk *it=querychunks.first(); it &&!di->stopNow() ; it=querychunks.next() )
    {
        TQValueList<uint> locations = (*it).locationReferences();

        if(locations.count()>0)
        {
            rmap[(*it).chunkString()] = locations;

            if(locations.count()>1000) //FIXME NORMALIZE THIS!!!
            {
                frequent++;
                kdDebug(0) << "\""<<(*it).chunkString() << "\" is frequent" <<endl;
            }
        }
        else
            notfound++;
    }

    //Now we have a map (rmap) "word in query->list of occurency"

    // NOTE(review): variable-length array, a compiler extension rather than
    // standard C++; kept to avoid pulling in a new container header.
    TQValueList<unsigned int>::iterator countpos[nchunks+1];

    TQValueList<unsigned int> il;
    // Every slot must be initialised, whether or not the search is being
    // stopped: the result loop below compares neighbouring slots.
    for(unsigned int i = 0;i<=nchunks;i++)
        countpos[i]=il.end();

    // The stopNow() check is required here: the inner loops do nothing once
    // the search is stopped, so rmap would never become empty.
    while(!rmap.isEmpty() && !di->stopNow())
    {
        unsigned int ref,count;
        ref=(unsigned int)-1;
        count=0;

        //This will find the min head and count how many times it occurs
        // (assumes each location list is sorted ascending - TODO confirm)
        for(ResultMap::iterator it = rmap.begin();it!=rmap.end()&&!di->stopNow();++it)
        {
            unsigned int thisref=it.data().first();
            if(thisref<ref)
            {
                ref=thisref;
                count=0;
            }
            if(thisref==ref)
            {
                count++;
            }
        }

        for(ResultMap::iterator it = rmap.begin();it!=rmap.end()&&!di->stopNow();)
        {
            it.data().remove(ref);

            //FIXME: I think the frequent word check is not in the right place
            if(it.data().isEmpty() || (((frequent+notfound)<nchunks) && (it.data().count()>1000)))
            //very dirty hack...
            {
                // Advance before removing so the loop iterator stays valid.
                ResultMap::iterator it2=it;
                ++it;
                rmap.remove(it2);
            }
            else ++it;
        }

        //This should be configurable or optimized:
        if(count>=(nchunks-notfound)*0.50 && count!=0)
        {
            il.insert(countpos[count],ref);
            // Move every boundary that sat at the insertion point back onto
            // the new element. countpos[count] itself is handled last
            // because the other slots are compared against it.
            for(unsigned int i = nchunks;i>=count;i--)
                if(countpos[i]==countpos[count])
                    countpos[i]--;
        }
    }

    //loop on number of words found
    int bestscore=0;

    for(unsigned int wf=nchunks;wf>0;wf-- ){
        for(TQValueList<unsigned int>::iterator entryIt=countpos[wf];entryIt!=countpos[wf-1] ;++entryIt)
        { //loop on entries with same number of word found
            DataBaseInterface::MainEntry e;
            e=di->getFromIndex(*entryIt);
            TQStringList trs=e.second.getTranslations();
            for(TQStringList::iterator trIt=trs.begin();trIt!=trs.end()&&!di->stopNow();++trIt)
            {
                // NOTE(review): the chunks in this temporary list are never
                // deleted here; confirm who owns them.
                unsigned int cinr=factory->chunks(*trIt).count(); //chunk in result

                //compute a score, lets kbabel sort now, it should be fast...
                // Take the absolute difference in a way that cannot wrap:
                // both counts are unsigned, so (nchunks-cinr) is never < 0.
                const int chunkDiff=(nchunks>cinr)?static_cast<int>(nchunks-cinr):static_cast<int>(cinr-nchunks);
                int score=static_cast<int>(90*wf/nchunks)-90*chunkDiff/static_cast<int>(nchunks*10);
                if(score>bestscore) bestscore=score;
                if(score>bestscore*0.40)
                {
                    // FIXME: format better the richtext
                    TQString ori=e.first.getString();
                    TQString re=di->format(di->simple(*trIt,true),query);
                    QueryResult r(re,ori,score);
                    // Highlight every query chunk inside the original text.
                    // The chunk text is escaped so it is matched literally.
                    for(TQPtrListIterator<AbstractChunk> chunkIt(querychunks); chunkIt.current() && !di->stopNow() ; ++chunkIt){
                        ori=ori.replace(TQRegExp(TQRegExp::escape((*chunkIt)->chunkString()),false),"<font color=#000080><u><b>"+(*chunkIt)->chunkString()+"</b></u></font>");
                    }
                    r.setRichOriginal(ori);
                    if(!di->stopNow())
                        emit newResult(r);
                    res.push_back(r);
                }
            }
        }
    }
    return res;
}
|
|
|
|
/**
 * Suggests up to two translations for a single-word @p query from the
 * correlation map returned by di->correlation(query,0,false).
 *
 * The two keys with the highest positive values are reported, best first,
 * each scored as 0.01 * value * settings->scoreDynamic. Queries for which
 * di->words() reports more than one word yield an empty list.
 *
 * @param query the original string to search for
 * @return zero, one or two results
 */
DataBaseInterface::ResultList CorrelationSearchAlgorithm::exec( const TQString & query )
{
    //FIXME, this code is shit.
    DataBaseInterface::ResultList res;
    if(di->words(query).count()>1) return res;
    TQMap<TQString,float> corRes = di->correlation(query,0,false);

    float max=0,max1=0;
    TQString best,best1;

    for(TQMap<TQString,float>::iterator it = corRes.begin(); it !=corRes.end(); ++it)
    {
        const float value=it.data();
        if(value>max)
        {
            // New best: the previous best becomes the runner-up.
            max1=max;
            best1=best;
            max=value;
            best=it.key();
        }
        else if(value>max1)
        {
            // Not better than the best but better than the runner-up; the
            // runner-up must be updated here too or it would only ever hold
            // a former best.
            max1=value;
            best1=it.key();
        }
    }

    // Best candidate first, runner-up second; they differ only in colour.
    const TQString candidates[2]={best,best1};
    const float values[2]={max,max1};
    const char *const openTags[2]={"<font size=+2 color=#A00000>","<font size=+2 color=#800000>"};

    for(int n=0;n<2;n++)
    {
        if(candidates[n].isEmpty()) continue;

        double myscore=0.01*values[n]*settings->scoreDynamic;
        QueryResult r(di->format(candidates[n],query),i18n("DYNAMIC DICT:"),myscore);
        r.setRichOriginal(i18n("<h3>Dynamic Dictionary</h3>This is a dynamic dictionary created"
            " looking for correlation of original and translated words.<br>"
            " <b>Do not rely on it</b>. Translations may be fuzzy.<br>"));
        r.setRichResult(openTags[n]+di->format(candidates[n],query)+"</font>") ;
        res.push_back(r);
        if(!di->stopNow())
            emit newResult(r);
    }

    // kdDebug output has to be terminated, hence the endl.
    kdDebug(0) << "Correlation algorithm found" << res.count() << "results" << endl;
    return res;
}
|
|
|
|
/**
 * Creates an empty algorithm chain that stops after 5 results.
 *
 * @param dbi  database interface handed to AbstractSearchAlgorithm
 * @param sets settings handed to AbstractSearchAlgorithm
 */
GenericSearchAlgorithm::GenericSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets)
{
    //FIXME use as default somthing from DBSESettings
    this->maxResults = 5;
}
|
|
|
|
/**
 * Builds the single-word chain: exact, alpha, sentence, chunk, correlation.
 *
 * @param dbi  database interface handed to every member algorithm and to the
 *             chunk factory
 * @param sets settings handed to every member algorithm
 */
SingleWordSearchAlgorithm::SingleWordSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : GenericSearchAlgorithm(dbi,sets),
      exact(dbi,sets),
      alpha(dbi,sets),
      sentence(dbi,sets),
      corr(dbi,sets),
      chunk(dbi,sets),
      casefactory(dbi)
{
    // Give the chunk algorithm its chunk factory.
    chunk.setChunkFactory(&casefactory);

    // The order of these calls is the order in which exec() runs them.
    addAlgorithm(&exact);
    addAlgorithm(&alpha);
    addAlgorithm(&sentence);
    addAlgorithm(&chunk);
    addAlgorithm(&corr);
}
|
|
|
|
/**
 * Runs the chain only when di->words() reports at most one word in
 * @p query; otherwise returns an empty list.
 *
 * @param query the original string to search for
 * @return the chain's results, or an empty list for multi-word queries
 */
DataBaseInterface::ResultList SingleWordSearchAlgorithm::exec( const TQString & query )
{
    const bool multiWord = di->words(query).count()>1;
    if(!multiWord)
        return GenericSearchAlgorithm::exec(query);
    return ResultList();
}
|
|
|
|
|
|
//#include "algorithms.moc"
|