tdesdk/kbabel/kbabeldict/modules/dbsearchengine2/algorithms.cpp

//
// C++ Implementation: algorithms
//
// Description:
//
//
// Author: Andrea Rizzi <rizzi@kde.org>, (C) 2003
//
// Copyright: See COPYING file that comes with this distribution
//
//
#include "algorithms.h"
#include <tqstringlist.h>
#include <kdebug.h>

//FIXME: remove
// Temporary stand-in for a real translation call: the macro expands to a
// C-style cast, so i18n("text") becomes (const char*)("text") and the string
// is passed through untranslated.
#define i18n (const char*)

/**
 * Looks up the query as an exact database key.
 *
 * Every translation stored for the matching entry is reported through the
 * newResult() signal and appended to the returned list, scored with
 * settings->scoreExact.
 *
 * @param query the original string to search for
 * @return one result per translation of the matching entry
 */
DataBaseInterface::ResultList ExactSearchAlgorithm::exec(const TQString& query )
{
    DataBaseInterface::ResultList res;
    DataBaseInterface::MainEntry e=di->get(query,0);

    TQStringList trs=e.second.getTranslations();

    for(TQStringList::iterator it=trs.begin();it!=trs.end();++it)
    {
	// Build the result once so the emitted and the returned object agree;
	// previously the returned one carried neither original nor score.
	QueryResult r(*it,e.first.getString(),settings->scoreExact);
	emit newResult(r);
	res.push_back(r);
    }
    kdDebug(0) <<"Exact algo found " << res.count() << "entries" << endl;
    return res;
}


/**
 * Runs the chained algorithms in the order they were added.
 *
 * While an algorithm is running its newResult() signal is forwarded through
 * this object's newResult() signal. The chain stops as soon as the number of
 * collected results reaches maxResults or no algorithm is left.
 *
 * @param query the original string to search for
 * @return the concatenated results of every algorithm that was run
 */
DataBaseInterface::ResultList GenericSearchAlgorithm::exec(const TQString& query )
{
    DataBaseInterface::ResultList collected;
    uint found=0;

    TQValueList<AbstractSearchAlgorithm *>::iterator step = algoChain.begin();
    while(step!=algoChain.end() && found < maxResults)
    {
	AbstractSearchAlgorithm *algo = *step;

	// Relay the results of this algorithm only while it is executing.
	connect(algo,TQT_SIGNAL(newResult(QueryResult)),this,TQT_SIGNAL(newResult(QueryResult)));
	kdDebug(0) << "Algo pointer" << algo << endl;

	collected+=algo->exec(query);
	found=collected.count();
	kdDebug(0) << "Count = " << found << endl;

	disconnect(algo,TQT_SIGNAL(newResult(QueryResult)),this,TQT_SIGNAL(newResult(QueryResult)));
	++step;
    }

    return collected;
}

/**
 * Appends an algorithm to the end of the chain processed by exec().
 *
 * Only the pointer is stored; nothing in this file deletes it.
 *
 * @param algo the algorithm to add
 */
void GenericSearchAlgorithm::addAlgorithm( AbstractSearchAlgorithm * algo )
{
    algoChain.push_back(algo);
}

/**
 * Searches through the index list returned by getAlpha() for the query.
 *
 * Each translation of each indexed entry is simplified, formatted against the
 * query, emitted through newResult() and collected with
 * settings->scoreAlpha. Both loops end early once di->stopNow() is true.
 *
 * @param query the original string to search for
 * @return the results gathered before the search finished or was stopped
 */
DataBaseInterface::ResultList AlphaSearchAlgorithm::exec( const TQString & query )
{
    DataBaseInterface::ResultList res;
    DBItemMultiIndex::IndexList indexes=di->getAlpha(query);

    for(DBItemMultiIndex::IndexList::iterator idx=indexes.begin();idx!=indexes.end()&&!di->stopNow();++idx)
    {
	DataBaseInterface::MainEntry entry=di->getFromIndex(*idx);
	TQStringList trs=entry.second.getTranslations();

	for(TQStringList::iterator tr=trs.begin();tr!=trs.end() && !di->stopNow();++tr)
	{
	    QueryResult r(di->format(di->simple(*tr,true),query),
			  entry.first.getString(),
			  settings->scoreAlpha);
	    emit newResult(r);
	    res.push_back(r);
	}
    }

    kdDebug(0) <<"Alpha algo found " << res.count() << "entries" << endl;
    return res;
}

/**
 * Looks the query up through getSentence().
 *
 * Each translation of the entry found is simplified, formatted against the
 * query, emitted through newResult() and collected with
 * settings->scoreSentence.
 *
 * @param query the original string to search for
 * @return one result per translation of the entry found
 */
DataBaseInterface::ResultList SentenceArchiveSearchAlgorithm::exec( const TQString & query )
{
    DataBaseInterface::ResultList res;

    DataBaseInterface::MainEntry entry = di->getSentence(query);
    TQStringList trs=entry.second.getTranslations();

    kdDebug(0) << "Count in sentence archive " << trs.count()<< endl;

    TQStringList::iterator tr=trs.begin();
    while(tr!=trs.end())
    {
	QueryResult r(di->format(di->simple(*tr,true),query),
		      entry.first.getString(),
		      settings->scoreSentence);
	emit newResult(r);
	res.push_back(r);
	++tr;
    }

    kdDebug(0) <<"Sentence algo found " << res.count() << "entries" << endl;
    return res;
}

/**
 * Builds translations of the query by translating it chunk by chunk.
 *
 * The chunk factory splits the query into chunks and separators. For every
 * chunk that has translations, each distinct translation is appended
 * (followed by the separator at the chunk's position) to every candidate
 * built so far. Chunks without translations contribute nothing to the text
 * and lower the score by 10. The candidates are only reported when the
 * summed score reaches 50.
 *
 * Returns an empty list when no chunk factory is set or when the query
 * yields at most one chunk.
 *
 * Side effect: a settings->scoreChunkByChunk of 0 is replaced by 1.
 *
 * @param query the original string to search for
 * @return the assembled candidate translations, possibly empty
 */
DataBaseInterface::ResultList ChunkByChunkSearchAlgorithm::exec( const TQString & query )
{
    ResultList res;

    // The constructor initialises factory to 0; without a factory there is
    // nothing to split the query with.
    if(!factory)
	return res;

    factory->setQuery(query);
    TQPtrList<AbstractChunk> chunks=factory->chunks();
    kdDebug(0) << "Number of chunks " << chunks.count() << endl;
    chunks.setAutoDelete(true); //I should delete the chunks myself
    TQStringList querySeparators=factory->separators();

    //This prevents recursive loop.
    if (chunks.count()<=1) return res;

    TQStringList translations,tmpTranslations;

    translations.push_back("");   //FIXME this is needed to start  , but is not good
    int finalscore=0;
    int i=0;
    TQMap<TQString,bool> translationUsed;

    //Loop on all chunk
    for(AbstractChunk *it=chunks.first();it && !di->stopNow(); it=chunks.next())
    {
	kdDebug(0) << "Process next chunk" << endl;
	int chunkscore=0;
	TQValueList<QueryResult> r=it->translations();
	kdDebug(0) << "Number of results for this chunk " << r.count() << endl;

	if(r.count()<1) {
	    // kdDebug(0) << "Nothing found for:" << it->translations() << endl;
	    chunkscore=-10;
	}
	else
	{
	    //FIXME: check this, why 0? it is the best one?
	    chunkscore=r[0].score();
	    kdDebug(0) << "ChunkScore " << chunkscore << endl;
	    tmpTranslations.clear();

	    // Separator following this chunk. Guard the index so a separator
	    // list shorter than the chunk list is never read out of range.
	    TQString separator;
	    if(static_cast<uint>(i)<querySeparators.count())
		separator=querySeparators[i];

	    //Loop on results
	    translationUsed.clear();
	    for(ResultList::iterator it1=r.begin();it1!=r.end() &&!di->stopNow(); ++it1)
	    {
		TQString chunkTranslation= (*it1).result();
		// Each distinct translation of the chunk is used only once.
		if(!translationUsed.contains(chunkTranslation))
		{
		    translationUsed[chunkTranslation]=true;
		    kdDebug(0) << "a translation is: " << chunkTranslation << endl;
		    for(TQStringList::iterator it2=translations.begin();it2!=translations.end() && !di->stopNow() ; ++it2)
		    {
			TQString prevTranslation=*it2;
			tmpTranslations.push_back(prevTranslation+chunkTranslation+separator);
			kdDebug(0) << "..appending it to " << prevTranslation << endl;
		    }
		}
	    }

	    translations=tmpTranslations;
	}

	//kdDebug(0) << it-> << r[0].result() << "#" << querySeparators[i] << endl;
	i++;
	finalscore+=chunkscore;

	// Terminate the line with endl like every other debug statement here.
	kdDebug(0) << "partial score " << finalscore << endl;
    }
    kdDebug(0) << "this is finishd" << endl;
    if(settings->scoreChunkByChunk==0)
	settings->scoreChunkByChunk=1;
// FIXME:fix the score system
//    finalscore/=(i*100*100/settings->scoreChunkByChunk);  //change 100 to 120(?) to lower this result (done)

    if (finalscore<50) return res;

    for(TQStringList::iterator it2=translations.begin();it2!=translations.end() && !di->stopNow() ; ++it2)
    {
	TQString theTranslation=*it2;
	QueryResult qr(di->format(theTranslation,query),i18n("CHUNK BY CHUNK"),finalscore);
	// The adjacent literals are concatenated by the compiler, so each
	// piece needs its own trailing space.
	qr.setRichOriginal(i18n("<h3>Chunk by chunk</h3>CHANGE THIS TEXT!!!!This translation is "
				"obtained translating the  sentences and using a "
				"fuzzy sentence translation database.<br>"
				" <b>Do not rely on it</b>. Translations may be fuzzy.<br>"));
	qr.setRichResult("<font color=#800000>"+theTranslation+"</font>") ;
	emit newResult(qr);
	res.push_back(qr);
    }

    return res;
}

/**
 * Creates the algorithm with no chunk factory set (factory is 0).
 *
 * @param dbi  database interface, passed on to AbstractSearchAlgorithm
 * @param sets search engine settings, passed on to AbstractSearchAlgorithm
 */
ChunkByChunkSearchAlgorithm::ChunkByChunkSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets),
      factory(0)
{
}


/**
 * Creates the sentence archive algorithm.
 *
 * @param dbi  database interface, passed on to AbstractSearchAlgorithm
 * @param sets search engine settings, passed on to AbstractSearchAlgorithm
 */
SentenceArchiveSearchAlgorithm::SentenceArchiveSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets)
{
}

/**
 * Creates the fuzzy chunk algorithm with no chunk factory set.
 *
 * factory is initialised to 0, as ChunkByChunkSearchAlgorithm does, so that
 * it never holds an indeterminate pointer before a factory is assigned.
 *
 * @param dbi  database interface, passed on to AbstractSearchAlgorithm
 * @param sets search engine settings, passed on to AbstractSearchAlgorithm
 */
FuzzyChunkSearchAlgorithm::FuzzyChunkSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets ) : AbstractSearchAlgorithm(dbi,sets) , factory(0)
{

}


/**
 * Fuzzy search based on the chunks of the query.
 *
 * The chunk factory splits the query into chunks and every chunk reports the
 * database locations it refers to. A location is kept when it is hit by at
 * least half of the chunks that have any location at all. The kept locations
 * are grouped by hit count, and the translations stored at them are scored
 * from the hit count and from the difference in chunk count between the
 * query and the translation. Results scoring above 40% of the best score
 * seen so far are emitted through newResult() (unless a stop was requested)
 * and returned.
 *
 * All loops end early once di->stopNow() becomes true.
 *
 * @param query the original string to search for
 * @return the results gathered before the search finished or was stopped;
 *         empty when no chunk factory is set
 */
DataBaseInterface::ResultList FuzzyChunkSearchAlgorithm::exec( const TQString & query )
{
    //FIXME: this implementation needs a cleanup
    ResultList res;

    // Nothing can be split without a chunk factory.
    if(!factory)
	return res;

    factory->setQuery(query);
    TQPtrList<AbstractChunk> querychunks = factory->chunks();
    querychunks.setAutoDelete(true);

    typedef TQMap<TQString,TQValueList<unsigned int> > ResultMap;
    ResultMap rmap;  //result of words index query
    unsigned int notfound=0,frequent=0;
    const unsigned int nchunks = querychunks.count();

    //Get index list for each word
    for(AbstractChunk *chunk=querychunks.first(); chunk &&!di->stopNow() ;  chunk=querychunks.next()  )
    {
	TQValueList<uint> locations = chunk->locationReferences();

	if(locations.count()>0)
	{
	    rmap[chunk->chunkString()] = locations;

	    if(locations.count()>1000)  //FIXME NORMALIZE THIS!!!
	    {
		frequent++;
		kdDebug(0) << "\""<<chunk->chunkString()  << "\" is frequent" <<endl;
	    }
	}
	else
	    notfound++;
    }

    //Now we have a map (rmap)  "word in query->list of occurency"

    // il holds the accepted locations ordered by descending hit count.
    // countpos[k] points at the first entry of il whose hit count is <= k;
    // countpos[0] always stays at il.end().
    // NOTE(review): a runtime-sized array of class type is a compiler
    // extension, not standard C++ - consider a TQValueVector here.
    TQValueList<unsigned int>::iterator countpos[nchunks+1];

    TQValueList<unsigned int> il;
    // Initialise every slot unconditionally: the slots are read further down
    // and must never be left as default-constructed iterators.
    for(unsigned int i = 0;i<=nchunks;i++)
	countpos[i]=il.end();

    // The stop check is required here: once stopNow() is true the inner loops
    // no longer drain rmap, so the loop would otherwise never terminate.
    while(!rmap.isEmpty() && !di->stopNow())
    {
	unsigned int ref,count;
	ref=(unsigned int)-1;
	count=0;

	//This will find the min head and count how many times it occurs
	// Only the list heads are compared, so this presumably relies on each
	// location list being sorted ascending - TODO confirm.
	for(ResultMap::iterator mit = rmap.begin();mit!=rmap.end()&&!di->stopNow();++mit)
	{
	    unsigned int thisref=mit.data().first();
	    if(thisref<ref)
	    {
		ref=thisref;
		count=0;
	    }
	    if(thisref==ref)
	    {
		count++;
	    }
	}

	// Drop the handled location from every list and discard lists that
	// are exhausted (or still too long, see the hack below).
	for(ResultMap::iterator mit = rmap.begin();mit!=rmap.end()&&!di->stopNow();)
	{
	    mit.data().remove(ref);

	    //kdDebug(0)<< ((frequent<(nwords-notfound)) && (it.data().count()>350)) <<endl;
	    //FIXME: I think the frequent word check is not in the right place
	    if(mit.data().isEmpty() || (((frequent+notfound)<nchunks) && (mit.data().count()>1000)))
		//very dirty hack...
	    {
		// Advance before removing so the loop iterator stays valid.
		ResultMap::iterator dead=mit;
		++mit;
		rmap.remove(dead);
	    }
	    else ++mit;
	}

	//This should be configurable or optimized:
	if(count>=(nchunks-notfound)*0.50 && count!=0)
	{
	    il.insert(countpos[count],ref);
	    // Every marker that shared the insertion point now has to point
	    // at the newly inserted entry. count is >= 1 here, so the
	    // unsigned countdown cannot wrap.
	    for(unsigned int i = nchunks;i>=count;i--)
		if(countpos[i]==countpos[count])
		    countpos[i]--;
	}
    }

    //loop on number of words found
    int bestscore=0;

    for(unsigned int wf=nchunks;wf>0 && !di->stopNow();wf-- ){
	for(TQValueList<unsigned int>::iterator loc=countpos[wf];loc!=countpos[wf-1] && !di->stopNow();++loc)
	{ //loop on entries with same number of word found
	    DataBaseInterface::MainEntry e;
	    e=di->getFromIndex(*loc);
	    TQStringList trs=e.second.getTranslations();
	    for(TQStringList::iterator tr=trs.begin();tr!=trs.end()&&!di->stopNow();++tr)
	    {
		// NOTE(review): the chunks returned here are not deleted; if
		// the factory hands over ownership this leaks - confirm.
		const unsigned int cinr=factory->chunks(*tr).count(); //chunk in result

		// Absolute difference in chunk count. Both operands are
		// unsigned, so the larger one must be picked explicitly;
		// subtracting first would wrap around.
		const unsigned int chunkdiff=(cinr>nchunks)?(cinr-nchunks):(nchunks-cinr);

		//compute a score, lets kbabel sort now, it should be fast...
		const int score=static_cast<int>(90*wf/nchunks)-static_cast<int>(90*chunkdiff/(nchunks*10));
		if(score>bestscore) bestscore=score;
		if(score>bestscore*0.40)
		{
		    // FIXME: format better the richtext
		    TQString ori=e.first.getString();
		    TQString re=di->format(di->simple(*tr,true),query);
		    QueryResult r(re,ori,score);

		    // Highlight every query chunk inside the original text.
		    // The loop must run while NO stop was requested.
		    // NOTE(review): a later chunk can also match inside the
		    // markup inserted for an earlier one.
		    for(TQPtrListIterator<AbstractChunk> cit(querychunks); cit.current() && !di->stopNow() ; ++cit){
			const TQString chunkText=(*cit)->chunkString();
			// An empty pattern would match at every position.
			if(chunkText.isEmpty())
			    continue;
			// Escape the chunk so it is matched literally rather
			// than interpreted as a regular expression.
			ori=ori.replace(TQRegExp(TQRegExp::escape(chunkText),false),"<font color=#000080><u><b>"+chunkText+"</b></u></font>");
		    }
		    r.setRichOriginal(ori);
		    if(!di->stopNow())
			emit newResult(r);
		    res.push_back(r);
		}
	    }
	}
    }
    return res;

}

/**
 * Proposes translations for a single-word query from the correlation data.
 *
 * Queries of more than one word return an empty list. Otherwise the two
 * candidates with the highest positive correlation are turned into results
 * scored as 0.01 * correlation * settings->scoreDynamic. Each result is
 * appended to the returned list and emitted through newResult() unless a
 * stop was requested.
 *
 * @param query the original string to search for
 * @return at most two results, best first
 */
DataBaseInterface::ResultList CorrelationSearchAlgorithm::exec( const TQString & query )
{
    //FIXME: this implementation needs a cleanup
    DataBaseInterface::ResultList res;
    if(di->words(query).count()>1) return res;
    TQMap<TQString,float> corRes = di->correlation(query,0,false);

    float max=0,max1=0;
    TQString best,best1;

    for(TQMap<TQString,float>::iterator it = corRes.begin(); it !=corRes.end(); ++it)
    {
	if(it.data()>max)
	{
	    // New best: the previous best becomes the runner-up.
	    max1=max;
	    best1=best;
	    max=it.data();
	    best=it.key();
	}
	else if(it.data()>max1)
	{
	    // Not a new best but better than the current runner-up. Without
	    // this branch the runner-up depended on the map's key order.
	    max1=it.data();
	    best1=it.key();
	}
    }

    // Best candidate first, runner-up second; they differ only in colour.
    const TQString candidates[2] = { best, best1 };
    const float weights[2] = { max, max1 };
    const char * const openTags[2] = { "<font size=+2 color=#A00000>",
				       "<font size=+2 color=#800000>" };

    for(int k=0;k<2;++k)
    {
	if(candidates[k].isEmpty())
	    continue;

	double myscore=0.01*weights[k]*settings->scoreDynamic;
	QueryResult r(di->format(candidates[k],query),i18n("DYNAMIC DICT:"),myscore);
	r.setRichOriginal(i18n("<h3>Dynamic Dictionary</h3>This is a dynamic dictionary created"
			       " looking for correlation of original and translated words.<br>"
			       " <b>Do not rely on it</b>. Translations may be fuzzy.<br>"));
	r.setRichResult(openTags[k]+di->format(candidates[k],query)+"</font>") ;
	res.push_back(r);
	if(!di->stopNow())
	    emit newResult(r);
    }

    // Terminate the line with endl like every other debug statement here.
    kdDebug(0) << "Correlation algorithm found" << res.count() << "results" << endl;
    return res;

}

/**
 * Creates an empty algorithm chain that stops after five results.
 *
 * @param dbi  database interface, passed on to AbstractSearchAlgorithm
 * @param sets search engine settings, passed on to AbstractSearchAlgorithm
 */
GenericSearchAlgorithm::GenericSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : AbstractSearchAlgorithm(dbi,sets)
{
    //FIXME use as default somthing from DBSESettings
    this->maxResults = 5;
}

/**
 * Builds the chain used for single-word queries.
 *
 * The chunk algorithm gets casefactory as its chunk factory; the chain order
 * is exact, alpha, sentence, chunk, correlation.
 *
 * @param dbi  database interface, passed to the base class, every member
 *             algorithm and casefactory
 * @param sets search engine settings, passed to the base class and every
 *             member algorithm
 */
SingleWordSearchAlgorithm::SingleWordSearchAlgorithm( DataBaseInterface * dbi, DBSESettings * sets )
    : GenericSearchAlgorithm(dbi,sets),
      exact(dbi,sets),
      alpha(dbi,sets),
      sentence(dbi,sets),
      corr(dbi,sets),
      chunk(dbi,sets),
      casefactory(dbi)
{
    chunk.setChunkFactory(&casefactory);

    // The order of this table is the order in which exec() tries them.
    AbstractSearchAlgorithm *chain[] = { &exact, &alpha, &sentence, &chunk, &corr };
    const unsigned int chainLength = sizeof(chain)/sizeof(chain[0]);
    for(unsigned int k=0;k<chainLength;++k)
	addAlgorithm(chain[k]);
}

/**
 * Runs the chain only for queries of at most one word.
 *
 * @param query the original string to search for
 * @return the chain's results, or an empty list for multi-word queries
 */
DataBaseInterface::ResultList SingleWordSearchAlgorithm::exec( const TQString & query )
{
    if(di->words(query).count()<=1)
    {
	return GenericSearchAlgorithm::exec(query);
    }

    // More than one word: this algorithm does not apply.
    return ResultList();
}


//#include "algorithms.moc"