You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdesdk/kbabel/kbabeldict/modules/dbsearchengine2/algorithms.cpp

426 lines
13 KiB

//
// C++ Implementation: algorithms
//
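// Description: Search algorithms of the dbsearchengine2 module (exact, alpha,
// sentence archive, chunk by chunk, fuzzy chunk, correlation) and their chains.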
//
//
// Author: Andrea Rizzi <rizzi@kde.org>, (C) 2003
//
// Copyright: See COPYING file that comes with this distribution
//
//
#include "algorithms.h"
#include <vector>
#include <tqstringlist.h>
#include <kdebug.h>
//FIXME: remove
#define i18n (const char*)
// Looks up `query` through di->get() and reports every translation stored in
// the returned entry.
//
// Each translation becomes one QueryResult holding the translation, the
// original string of the entry and settings->scoreExact. The very same object
// is both emitted through newResult() and appended to the returned list, so
// callers that consume the return value see the original and the score too
// (previously the returned results carried only the translation text).
//
// Returns the list of results; empty when the entry has no translations.
DataBaseInterface::ResultList ExactSearchAlgorithm::exec(const TQString& query )
{
    DataBaseInterface::ResultList res;
    DataBaseInterface::MainEntry e=di->get(query,0);
    TQStringList trs=e.second.getTranslations();
    for(TQStringList::iterator it=trs.begin();it!=trs.end();++it)
    {
        // Build the result once so that the signal and the return value agree.
        QueryResult r(*it,e.first.getString(),settings->scoreExact);
        emit newResult(r);
        res.push_back(r);
    }
    kdDebug(0) <<"Exact algo found " << res.count() << "entries" << endl;
    return res;
}
// Runs the chained algorithms one after another on `query`, in the order in
// which they were added, and concatenates what they return.
//
// While an algorithm is running, its newResult() signal is forwarded to this
// object's newResult() signal; the forwarding is removed again afterwards.
// The chain is abandoned as soon as the accumulated number of results reaches
// maxResults (checked before each algorithm starts).
DataBaseInterface::ResultList GenericSearchAlgorithm::exec(const TQString& query )
{
    DataBaseInterface::ResultList collected;
    uint total=0;

    TQValueList<AbstractSearchAlgorithm *>::iterator current = algoChain.begin();
    while(current!=algoChain.end() && total < maxResults)
    {
        // Relay the results of the running algorithm to our own listeners.
        connect(*current,TQT_SIGNAL(newResult(QueryResult)),this,TQT_SIGNAL(newResult(QueryResult)));

        kdDebug(0) << "Algo pointer" << (*current) << endl;
        collected+=(*current)->exec(query);
        total=collected.count();
        kdDebug(0) << "Count = " << total << endl;

        disconnect(*current,TQT_SIGNAL(newResult(QueryResult)),this,TQT_SIGNAL(newResult(QueryResult)));
        ++current;
    }

    return collected;
}
// Appends `algo` to the end of the chain walked by exec(). Only the pointer is
// stored here; nothing in this function takes ownership of the object.
void GenericSearchAlgorithm::addAlgorithm( AbstractSearchAlgorithm * algo )
{
    algoChain.push_back(algo);
}
// Asks the database interface for the index list matching `query` via
// getAlpha() and turns every translation of every referenced entry into a
// result scored with settings->scoreAlpha.
//
// Translations are passed through di->simple() and di->format() before being
// reported. Both loops stop early once di->stopNow() becomes true. Each result
// is emitted through newResult() and also appended to the returned list.
DataBaseInterface::ResultList AlphaSearchAlgorithm::exec( const TQString & query )
{
    DataBaseInterface::ResultList found;
    DBItemMultiIndex::IndexList indexes=di->getAlpha(query);

    DBItemMultiIndex::IndexList::iterator idx=indexes.begin();
    while(idx!=indexes.end() && !di->stopNow())
    {
        DataBaseInterface::MainEntry entry=di->getFromIndex(*idx);
        TQStringList translations=entry.second.getTranslations();

        for(TQStringList::iterator tr=translations.begin(); !(tr==translations.end() || di->stopNow()); ++tr)
        {
            QueryResult hit(di->format(di->simple(*tr,true),query),entry.first.getString(),settings->scoreAlpha);
            emit newResult(hit);
            found.push_back(hit);
        }
        ++idx;
    }

    kdDebug(0) <<"Alpha algo found " << found.count() << "entries" << endl;
    return found;
}
// Fetches the entry returned by di->getSentence() for `query` and reports each
// of its translations, scored with settings->scoreSentence.
//
// Translations are passed through di->simple() and di->format() first. Every
// result is emitted through newResult() and appended to the returned list.
DataBaseInterface::ResultList SentenceArchiveSearchAlgorithm::exec( const TQString & query )
{
    DataBaseInterface::ResultList found;
    DataBaseInterface::MainEntry entry = di->getSentence(query);
    TQStringList translations=entry.second.getTranslations();
    kdDebug(0) << "Count in sentence archive " << translations.count()<< endl;

    TQStringList::iterator tr=translations.begin();
    while(tr!=translations.end())
    {
        QueryResult hit(di->format(di->simple(*tr,true),query),entry.first.getString(),settings->scoreSentence);
        emit newResult(hit);
        found.push_back(hit);
        ++tr;
    }

    kdDebug(0) <<"Sentence algo found " << found.count() << "entries" << endl;
    return found;
}
// Builds candidate translations for `query` by splitting it into chunks
// (through the chunk factory), translating every chunk on its own and gluing
// the pieces back together with the separators reported by the factory.
//
// For each chunk, every distinct translation is appended to every partial
// translation built so far, so the candidate set is the cross product of the
// per-chunk alternatives. The score of the first result of each chunk is
// summed; a chunk without any result costs 10 points and contributes no text.
// Candidates are only reported when the summed score reaches 50.
//
// Returns an empty list when no factory is set, when the query yields at most
// one chunk (the comment in the code says this prevents a recursive loop),
// or when the score is too low. Every reported candidate is emitted through
// newResult() and appended to the returned list.
//
// Side effect: settings->scoreChunkByChunk is forced to 1 when it is 0.
DataBaseInterface::ResultList ChunkByChunkSearchAlgorithm::exec( const TQString & query )
{
    ResultList res;
    // The constructor leaves the factory null until one is set; without it
    // the query cannot be split.
    if(!factory) return res;
    factory->setQuery(query);
    TQPtrList<AbstractChunk> chunks=factory->chunks();
    kdDebug(0) << "Number of chunks " << chunks.count() << endl;
    chunks.setAutoDelete(true); //I should delete the chunks myself
    TQStringList querySeparators=factory->separators();
    const uint separatorCount=querySeparators.count();
    //This prevents recursive loop.
    if (chunks.count()<=1) return res;
    TQStringList translations,tmpTranslations;
    translations.push_back(""); //FIXME this is needed to start , but is not good
    int finalscore=0;
    int i=0; // index of the chunk being processed, also indexes querySeparators
    TQMap<TQString,bool> translationUsed;
    //Loop on all chunk
    for(AbstractChunk *it=chunks.first();it && !di->stopNow(); it=chunks.next())
    {
        kdDebug(0) << "Process next chunk" << endl;
        int chunkscore=0;
        TQValueList<QueryResult> r=it->translations();
        kdDebug(0) << "Number of results for this chunk " << r.count() << endl;
        if(r.count()<1) {
            // kdDebug(0) << "Nothing found for:" << it->translations() << endl;
            chunkscore=-10;
        }
        else
        {
            //FIXME: check this, why 0? it is the best one?
            chunkscore=r[0].score();
            kdDebug(0) << "ChunkScore " << chunkscore << endl;
            // Separator that follows this chunk. Stay inside the list: the
            // factory is not guaranteed here to return one separator per
            // chunk, and indexing past the end is not valid.
            TQString separator;
            if(static_cast<uint>(i)<separatorCount)
                separator=querySeparators[i];
            tmpTranslations.clear();
            //Loop on results
            translationUsed.clear();
            for(ResultList::iterator it1=r.begin();it1!=r.end() &&!di->stopNow(); ++it1)
            {
                TQString chunkTranslation= (*it1).result();
                // Use each distinct translation of the chunk only once.
                if(!translationUsed.contains(chunkTranslation))
                {
                    translationUsed[chunkTranslation]=true;
                    kdDebug(0) << "a translation is: " << chunkTranslation << endl;
                    for(TQStringList::iterator it2=translations.begin();it2!=translations.end() && !di->stopNow() ; ++it2)
                    {
                        TQString prevTranslation=*it2;
                        tmpTranslations.push_back(prevTranslation+chunkTranslation+separator);
                        kdDebug(0) << "..appending it to " << prevTranslation << endl;
                    }
                }
            }
            translations=tmpTranslations;
        }
        //kdDebug(0) << it-> << r[0].result() << "#" << querySeparators[i] << endl;
        i++;
        finalscore+=chunkscore;
        kdDebug(0) << "partial score " << finalscore << endl;
    }
    kdDebug(0) << "this is finishd" << endl;
    // Keeps the (currently disabled) normalisation below from dividing by zero.
    if(settings->scoreChunkByChunk==0)
        settings->scoreChunkByChunk=1;
    // FIXME:fix the score system
    // finalscore/=(i*100*100/settings->scoreChunkByChunk); //change 100 to 120(?) to lower this result (done)
    if (finalscore<50) return res;
    for(TQStringList::iterator it2=translations.begin();it2!=translations.end() && !di->stopNow() ; ++it2)
    {
        TQString theTranslation=*it2;
        QueryResult qr(di->format(theTranslation,query),i18n("CHUNK BY CHUNK"),finalscore);
        // Adjacent literals are concatenated verbatim, so each fragment needs
        // its own trailing space to keep the words apart.
        qr.setRichOriginal(i18n("<h3>Chunk by chunk</h3>CHANGE THIS TEXT!!!!This translation is "
        "obtained translating the sentences and using a "
        "fuzzy sentence translation database.<br>"
        " <b>Do not rely on it</b>. Translations may be fuzzy.<br>"));
        qr.setRichResult("<font color=#800000>"+theTranslation+"</font>") ;
        emit newResult(qr);
        res.push_back(qr);
    }
    return res;
}
// Forwards the database interface and the settings to the base class and
// starts without a chunk factory (null pointer).
ChunkByChunkSearchAlgorithm::ChunkByChunkSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets),
      factory(0)
{
}
// Forwards the database interface and the settings to the base class; the
// algorithm needs no further set-up here.
SentenceArchiveSearchAlgorithm::SentenceArchiveSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets)
{
}
// Forwards the database interface and the settings to the base class and
// starts without a chunk factory.
//
// The factory pointer is explicitly nulled, as the ChunkByChunkSearchAlgorithm
// constructor does: exec() dereferences it, so it must never hold an
// indeterminate value.
FuzzyChunkSearchAlgorithm::FuzzyChunkSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets),
      factory(0)
{
}
// Fuzzy search: reports database entries that share enough chunks with
// `query`.
//
// Steps:
//  1. The query is split into chunks by the chunk factory; for every chunk the
//     list of location references is collected (chunks with more than 1000
//     locations are counted as "frequent", chunks with none as "not found").
//  2. The location lists are merged: repeatedly the smallest head value over
//     all lists is taken together with the number of lists that start with
//     it. (This presumably relies on every list being sorted ascending -
//     TODO confirm.) A location hit by at least half of the found chunks is
//     kept in `il`.
//  3. `il` is kept ordered by decreasing hit count; countpos[c] points at the
//     first kept location with exactly c hits, so [countpos[c], countpos[c-1])
//     is the group with c hits.
//  4. Every translation of every kept entry gets a score derived from the hit
//     count and from the difference in chunk count between query and
//     translation; results scoring above 40% of the best score seen so far
//     are reported, with the query chunks highlighted in the rich original.
//
// Returns the reported results; they are also emitted through newResult()
// unless di->stopNow() is set. Returns an empty list when no factory is set
// or when a stop was requested before scoring starts.
DataBaseInterface::ResultList FuzzyChunkSearchAlgorithm::exec( const TQString & query )
{
    ResultList res;
    // Without a chunk factory the query cannot be split.
    if(!factory) return res;
    factory->setQuery(query);
    TQPtrList<AbstractChunk> querychunks = factory->chunks();
    querychunks.setAutoDelete(true);
    typedef TQValueList<unsigned int> IndexList;
    typedef TQMap<TQString,IndexList> ResultMap;
    ResultMap rmap; //result of words index query
    unsigned int notfound=0,frequent=0;
    const unsigned int nchunks = querychunks.count();
    //Get index list for each word
    for(AbstractChunk *it=querychunks.first(); it &&!di->stopNow() ; it=querychunks.next() )
    {
        TQValueList<uint> locations = (*it).locationReferences();
        if(locations.count()>0)
        {
            rmap[(*it).chunkString()] = locations;
            if(locations.count()>1000) //FIXME NORMALIZE THIS!!!
            {
                frequent++;
                kdDebug(0) << "\""<<(*it).chunkString() << "\" is frequent" <<endl;
            }
        }
        else
            notfound++;
    }
    //Now we have a map (rmap) "word in query->list of occurency"
    IndexList il;
    // One group boundary per possible hit count (0..nchunks), all starting at
    // the end of the still empty list. A std::vector replaces the former
    // variable-length array (not standard C++) and guarantees that every
    // slot is initialised even if a stop is requested meanwhile.
    std::vector<IndexList::iterator> countpos(nchunks+1,il.end());
    // The stop check is required here: the inner loops do nothing once a stop
    // is requested, so rmap would never shrink and the loop would never end.
    while(!rmap.isEmpty() && !di->stopNow())
    {
        unsigned int ref,count;
        ref=(unsigned int)-1;
        count=0;
        //This will find the min head and count how many times it occurs
        for(ResultMap::iterator it = rmap.begin();it!=rmap.end()&&!di->stopNow();++it)
        {
            unsigned int thisref=it.data().first();
            if(thisref<ref)
            {
                ref=thisref;
                count=0;
            }
            if(thisref==ref)
            {
                count++;
            }
        }
        // Consume `ref` from every list and drop lists that are exhausted.
        for(ResultMap::iterator it = rmap.begin();it!=rmap.end()&&!di->stopNow();)
        {
            it.data().remove(ref);
            //kdDebug(0)<< ((frequent<(nwords-notfound)) && (it.data().count()>350)) <<endl;
            //FIXME: I think the frequent word check is not in the right place
            if(it.data().isEmpty() || (((frequent+notfound)<nchunks) && (it.data().count()>1000)))
            //very dirty hack...
            {
                // Advance first: removing the element invalidates its iterator.
                ResultMap::iterator it2=it;
                ++it;
                rmap.remove(it2);
            }
            else ++it;
        }
        //This should be configurable or optimized:
        if(count>=(nchunks-notfound)*0.50 && count!=0)
        {
            // Insert in front of the group with the same hit count, then move
            // every boundary that shared that position onto the new element.
            il.insert(countpos[count],ref);
            for(unsigned int i = nchunks;i>=count;i--)
                if(countpos[i]==countpos[count])
                    countpos[i]--;
        }
    }
    // A stop request leaves the merge incomplete; nothing would be reported
    // below anyway, so skip the database lookups.
    if(di->stopNow()) return res;
    //loop on number of words found
    int bestscore=0;
    for(unsigned int wf=nchunks;wf>0;wf-- ){
        for(IndexList::iterator entryIt=countpos[wf];entryIt!=countpos[wf-1] ;++entryIt)
        { //loop on entries with same number of word found
            DataBaseInterface::MainEntry e=di->getFromIndex(*entryIt);
            TQStringList trs=e.second.getTranslations();
            for(TQStringList::iterator trIt=trs.begin();trIt!=trs.end()&&!di->stopNow();++trIt)
            {
                // NOTE(review): the list returned here is a temporary; whether
                // its chunks are released depends on the factory - confirm.
                unsigned int cinr=factory->chunks(*trIt).count(); //chunk in result
                // Absolute difference computed without unsigned wrap-around
                // (nchunks-cinr is huge, not negative, when cinr>nchunks).
                const unsigned int chunkDiff=(nchunks>cinr)?(nchunks-cinr):(cinr-nchunks);
                //compute a score, lets kbabel sort now, it should be fast...
                int score=static_cast<int>(90*wf/nchunks)-static_cast<int>(90*chunkDiff/(nchunks*10));
                if(score>bestscore) bestscore=score;
                if(score>bestscore*0.40)
                {
                    // FIXME: format better the richtext
                    TQString ori=e.first.getString();
                    TQString re=di->format(di->simple(*trIt,true),query);
                    QueryResult r(re,ori,score);
                    // Highlight the query chunks in the original text. The loop
                    // must run while NO stop is requested (the test used to be
                    // inverted, so nothing was ever highlighted).
                    // NOTE(review): markup inserted for one chunk can itself
                    // be matched by a later chunk such as "font" or "b".
                    for(TQPtrListIterator<AbstractChunk> chunkIt(querychunks); chunkIt.current() && !di->stopNow() ; ++chunkIt){
                        const TQString chunkText=(*chunkIt)->chunkString();
                        // An empty pattern would match at every position.
                        if(chunkText.isEmpty()) continue;
                        // The chunk is literal text, not a pattern: escape it.
                        ori=ori.replace(TQRegExp(TQRegExp::escape(chunkText),false),"<font color=#000080><u><b>"+chunkText+"</b></u></font>");
                    }
                    r.setRichOriginal(ori);
                    if(!di->stopNow())
                        emit newResult(r);
                    res.push_back(r);
                }
            }
        }
    }
    return res;
}
// "Dynamic dictionary": proposes the two candidates with the highest
// correlation value returned by di->correlation() for `query`.
//
// Only queries for which di->words() reports at most one word are handled;
// anything else yields an empty list. Candidates need a correlation value
// greater than 0 and a non-empty key. The score of a result is
// 0.01 * correlation * settings->scoreDynamic. The best candidate is reported
// first, then the runner-up; both are appended to the returned list and
// emitted through newResult() unless di->stopNow() is set.
DataBaseInterface::ResultList CorrelationSearchAlgorithm::exec( const TQString & query )
{
    DataBaseInterface::ResultList res;
    if(di->words(query).count()>1) return res;
    TQMap<TQString,float> corRes = di->correlation(query,0,false);
    float max=0,max1=0;
    TQString best,best1;
    for(TQMap<TQString,float>::iterator it = corRes.begin(); it !=corRes.end(); ++it)
    {
        if(it.data()>max)
        {
            // New maximum: the previous best becomes the runner-up.
            max1=max;
            best1=best;
            best = it.key();
            max=it.data();
        }
        else if(it.data()>max1)
        {
            // Not a new maximum but better than the current runner-up. Without
            // this branch a second-best value met after the maximum was lost.
            max1=it.data();
            best1=it.key();
        }
    }
    // Both candidates are reported the same way; only the colour differs.
    const TQString candidates[2]={best,best1};
    const float correlations[2]={max,max1};
    const char * const colors[2]={"#A00000","#800000"};
    for(int k=0;k<2;++k)
    {
        if(candidates[k].isEmpty()) continue;
        double myscore=0.01*correlations[k]*settings->scoreDynamic;
        QueryResult r(di->format(candidates[k],query),i18n("DYNAMIC DICT:"),myscore);
        r.setRichOriginal(i18n("<h3>Dynamic Dictionary</h3>This is a dynamic dictionary created"
        " looking for correlation of original and translated words.<br>"
        " <b>Do not rely on it</b>. Translations may be fuzzy.<br>"));
        r.setRichResult(TQString("<font size=+2 color=")+colors[k]+">"+di->format(candidates[k],query)+"</font>") ;
        res.push_back(r);
        if(!di->stopNow())
            emit newResult(r);
    }
    kdDebug(0) << "Correlation algorithm found" << res.count() << "results" << endl;
    return res;
}
// Forwards the database interface and the settings to the base class and sets
// the result limit checked by exec() to 5.
GenericSearchAlgorithm::GenericSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets)
{
    //FIXME use as default something from DBSESettings
    this->maxResults = 5;
}
// Builds the chain used for single-word queries. All member algorithms share
// the same database interface and settings. They are chained in this order:
// exact, alpha, sentence, chunk, corr. The chunk algorithm gets the
// `casefactory` member as its chunk factory before it is added.
SingleWordSearchAlgorithm::SingleWordSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : GenericSearchAlgorithm(dbi,sets),
      exact(dbi,sets),
      alpha(dbi,sets),
      sentence(dbi,sets),
      corr(dbi,sets),
      chunk(dbi,sets),
      casefactory(dbi)
{
    this->addAlgorithm(&exact);
    this->addAlgorithm(&alpha);
    this->addAlgorithm(&sentence);

    chunk.setChunkFactory(&casefactory);
    this->addAlgorithm(&chunk);

    this->addAlgorithm(&corr);
}
// Runs the algorithm chain only when di->words() reports at most one word for
// `query`; any other query yields an empty result list.
DataBaseInterface::ResultList SingleWordSearchAlgorithm::exec( const TQString & query )
{
    const bool singleWord = !(di->words(query).count()>1);
    if(singleWord)
        return GenericSearchAlgorithm::exec(query);

    return ResultList();
}
//#include "algorithms.moc"