You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdesdk/kbabel/kbabeldict/modules/dbsearchengine2/database.cpp

753 lines
17 KiB

/*
DBSE 3
(c) 2000-2003 Andrea Rizzi
License: GPLv2
*/
#include <math.h>
#include "database.h"
#include <tqregexp.h>
#include <tqdict.h>
#include <tdeapplication.h>
#include <kdebug.h>
#include <tdemessagebox.h>
// In this file i18n is redefined as a bare cast: i18n("x") expands to
// (const char*)("x"), so the strings wrapped with it below are passed on
// unchanged as plain C strings.
#define i18n (const char*)
/*
 * Prepares a Berkeley DB handle (error codes instead of exceptions).
 * The file used later by open() is "<dbpath>.<dblang>.db"; dbname is
 * remembered as the name of the database inside that file.
 */
DataBase::DataBase(TQString dbpath,TQString dbname, TQString dblang) : Db(0,DB_CXX_NO_EXCEPTIONS)
{
    const TQString file = dbpath + "." + dblang + ".db";
    filename = file;
    database = dbname;
}
/*
 * Opens the database stored under the name chosen in the constructor.
 *
 * type  - access method; it is remembered in mytype whatever the outcome.
 * flags - passed straight to Db::open().
 * Returns the status reported by Db::open() (0 on success).
 */
int DataBase::open(DBTYPE type,unsigned int flags)
{
    // Keep the 8-bit encoded names alive for the duration of the call.
    const TQCString file8 = filename.local8Bit();
    const TQCString name8 = database.local8Bit();
    const int status = Db::open(
#if DB_VERSION_MINOR > 0
        // extra leading argument passed when DB_VERSION_MINOR > 0
        NULL,
#endif
        (const char*)file8,(const char *)name8,type,flags,0644);
    mytype = type;
    return status;
}
/*
 * Returns the record number of the last record of a DB_RECNO database.
 *
 * Returns 0 when the database is not of type DB_RECNO, when no cursor can
 * be obtained, or when the cursor cannot be positioned on a last record.
 * The cursor is always closed before returning.
 */
unsigned int DataBase::getLast()
{
    if(mytype!=DB_RECNO)
        return 0;

    Dbc *cur=0;
    if(cursor(0,&cur,0)!=0 || cur==0)
        return 0;

    DBItemNum index;
    DBItemMainKey key;
    const int status=cur->get(&index,&key,DB_LAST);
    // Release the cursor in every case; it was previously left open.
    cur->close();

    if(status!=0)
        return 0;
    return index.getNum();
}
/* Builds a result that carries only the result text r. */
QueryResult::QueryResult(TQString r)
{
    this->res = r;
}
/*
 * Builds a complete result: r is stored as both the plain and the rich
 * result text, o as both the plain and the rich original text, s as the
 * score.
 */
QueryResult::QueryResult(TQString r,TQString o,int s)
{
    this->sco = s;

    // original text, plain and rich variants
    this->orig = o;
    this->richo = o;

    // result text, plain and rich variants
    this->res = r;
    this->richr = r;
}
/* Builds a result whose result text is the empty string. */
QueryResult::QueryResult()
{
    this->res = "";
}
/*
 * Opens the seven databases living under dir and stores the settings.
 *
 * Every database file is named "<dir>/test<letter>.it.db"; the language
 * code "it" is hard-coded. openMyDataBase() may ask the user whether
 * missing databases should be created.
 */
DataBaseInterface::DataBaseInterface(TQString dir, DBSESettings *sets)
{
    //FIXME Better db names!!
    const TQString stem = dir + "/test";
    const TQString lang("it");

    main       = openMyDataBase(stem + "m", "main",       lang, DB_BTREE);
    alpha      = openMyDataBase(stem + "a", "alpha",      lang, DB_BTREE);
    numindex   = openMyDataBase(stem + "n", "numindex",   lang, DB_RECNO);
    wordsindex = openMyDataBase(stem + "w", "wordsindex", lang, DB_BTREE);
    sentence   = openMyDataBase(stem + "s", "sentence",   lang, DB_BTREE);
    corr       = openMyDataBase(stem + "c", "corr",       lang, DB_BTREE);
    transword  = openMyDataBase(stem + "t", "transword",  lang, DB_RECNO);

    settings = sets;
    _stopNow = false;
}
/*
 * Closes and destroys every database handle obtained in the constructor.
 * Handles that could not be opened are null and are skipped. corr and
 * transword are released as well; they were previously left open.
 */
DataBaseInterface::~DataBaseInterface()
{
    if(main){
        main->close(0);
        delete main;
    }
    if(numindex){
        numindex->close(0);
        delete numindex;
    }
    if(alpha){
        alpha->close(0);
        delete alpha;
    }
    if(wordsindex){
        wordsindex->close(0);
        delete wordsindex;
    }
    if(sentence){
        sentence->close(0);
        delete sentence;
    }
    if(corr){
        corr->close(0);
        delete corr;
    }
    if(transword){
        transword->close(0);
        delete transword;
    }
}
/*
 * Opens the database "<prefix>.<l>.db" / name with access method tt.
 *
 * When the plain open fails the user is asked, once per process (the
 * answer is kept in a function-local static), whether missing databases
 * should be created; if so the open is retried with DB_CREATE.
 *
 * Returns the handle on success, 0 when creation was attempted and
 * failed (the handle is destroyed in that case instead of being leaked).
 * When the user declines creation the unopened handle is returned, as
 * before.
 */
DataBase *DataBaseInterface::openMyDataBase(const TQString& prefix,const TQString& name,const TQString& l,DBTYPE tt)
{
    DataBase *aDb = new DataBase(prefix,name,l);
    if(aDb==0)
        return 0;

    if(aDb->open(tt)==0)
        return aDb;

    kdDebug(0) << "Database '"<< name <<"'do not exist, I try to create it.." << endl;
    //ask only the first time.
    static bool create=( KMessageBox::questionYesNo(0,"Database do not exist. Do you want to create it now?",
        i18n("Create Database"), i18n("Create"), i18n("Do Not Create"))==KMessageBox::Yes);
    if(!create)
        return aDb;

    // NOTE(review): the same handle is reused after a failed open();
    // confirm against the Berkeley DB documentation that this is allowed.
    if(aDb->open(tt,DB_CREATE)!=0)
    {
        kdDebug(0) << "...cannot create!!"<< endl;
        delete aDb;
        return 0;
    }

    kdDebug(0) << "...done!" << endl;
    return aDb;
}
/*
* query functions.
*
*/
/*
 * Looks up query in the main database and returns the (key, data) pair.
 * The status of the lookup is not inspected, so the data part is whatever
 * the lookup left in a default-constructed DBItemMainData.
 * filter is not used. On every fifth call pending GUI events are
 * processed.
 */
DataBaseInterface::MainEntry DataBaseInterface::get(const TQString& query,SearchFilter *filter)
{
    static int counter=1;

    DBItemMainKey key(query);
    DBItemMainData data;
    main->get(&key,&data);

    ++counter;
    if(counter%5==0)
        kapp->processEvents(100);

    return tqMakePair(key,data);
}
/*
* put functions
*
*/
/*
 * Stores the pair (original, translated) coming from the catalog
 * described by info.
 *
 * Steps, in order:
 *  1. If the main database has no record for original, a new record number
 *     is taken from numindex and the entry is registered in the alpha and
 *     words indexes.
 *  2. If both strings split into the same number (>1) of sentences, the
 *     sentence database is updated pair by pair.
 *  3. If translated is not empty, the translation is attached to the main
 *     record, replacing any previous translation with the same reference.
 *  4. The main record is written.
 *
 * info is dereferenced without a null check.
 * Only the status of the final main->put() is reported: the function
 * returns true when that call returned 0. The status of all other put()
 * calls is ignored.
 */
bool DataBaseInterface::addEntry(TQString original,TQString translated,InputInfo *info)
{
DBItemMainKey mk(original);
DBItemMainData md;
// Per-word difference between the translations before and after the update;
// filled in below but not used yet (see the FIXME near the end).
TQMap<TQString, int> correlationDiff;
bool newentry=false;
// Try to fetch an existing record for this original string.
kdDebug(0) << "Inserting the pair:" << endl;
kdDebug(0) << "ORIGINAL:" << original << endl;
kdDebug(0) << "TRANSLATED:" << translated << endl;
// Only DB_NOTFOUND counts as "new"; every other status, errors included,
// takes the "It exists!" branch.
if(main->get(&mk,&md)==DB_NOTFOUND)
{
kdDebug(0) << "new entry" << endl;
newentry=true;
//This is a new entry, create index entry
DBItemNum *nind;
// Next free record number: last record number of numindex plus one.
int newid=numindex->getLast()+1;
nind=new DBItemNum(newid);
numindex->put(nind,&mk);
delete nind;
md.clear();
md.setIndexnumber(newid);
//Update secondary index alpha
DBItemMainKey ka(simple(original));
DBItemMultiIndex in;
if(alpha->get(&ka,&in)==DB_NOTFOUND) in.clear() ;
//alpha->get(&ka,&in);
in.addEntry(newid);
alpha->put(&ka,&in);
kdDebug(0) << "Updating the word index " << endl;
//Update words index
TQStringList ws=words(original);
for(TQStringList::iterator it = ws.begin(); it!=ws.end(); ++it)
{
DBItemMainKey word(*it);
DBItemMultiIndex win;
if(wordsindex->get(&word,&win)==DB_NOTFOUND) win.clear();
win.addEntry(newid);
wordsindex->put(&word,&win);
}
kdDebug(0) << "new entry preparation DONE" << endl;
}
else
{
kdDebug(0) << "It exists!" <<endl;
}
//Update sentence index
TQStringList so=sentences(original);
TQStringList st=sentences(translated);
if(so.count()==st.count() && st.count() >1 ) //we already have a database for single strings.
{
kdDebug(0) << "inside sentence loop" << endl;
for(int i=0; i< so.count() ; i++)
{
DBItemMainKey sk(so[i]);
DBItemMainData sd;
// Only the warning below depends on this test; the statements that
// follow it run for every sentence.
if(sentence->get(&sk,&sd)==DB_NOTFOUND&&!newentry)
kdDebug(0) << "Warning: new sentence for old entry, do we changed sentence definition? " << endl;
kdDebug(0) << "here alive" << endl;
// if(clean)
sd.removeRef(info->ref());
kdDebug(0) << "now alive" << endl;
sd.addTranslation(st[i],info->ref());
kdDebug(0) << "still alive" << endl;
sentence->put(&sk,&sd);
}
}
kdDebug(0) << "Fuzzy sentence archive updated" << endl;
//Add that translation, link to ref for information on that translation
if(!translated.isEmpty())
{
// Subtract one for every word of every translation stored so far ...
TQStringList tmpTranslations=md.getTranslations();
for(TQStringList::iterator otIt=tmpTranslations.begin(); otIt!=tmpTranslations.end();++otIt)
{
TQStringList wt=words(*otIt);
for(TQStringList::iterator it = wt.begin(); it!=wt.end(); ++it)
{
if(correlationDiff.contains(*it))
correlationDiff[*it]--;
else
correlationDiff[*it]=-1;
}
}
//clean so that we have only one translation per catalog.
md.removeRef(info->ref());
md.addTranslation(translated,info->ref());
// ... and add one for every word of every translation stored after the update.
tmpTranslations=md.getTranslations();
for(TQStringList::iterator otIt=tmpTranslations.begin(); otIt!=tmpTranslations.end();++otIt)
{
TQStringList wt=words(*otIt);
for(TQStringList::iterator it = wt.begin(); it!=wt.end(); ++it)
{
if(correlationDiff.contains(*it))
correlationDiff[*it]++;
else
correlationDiff[*it]=1;
}
}
//FIXME: use the correlationDiff map somehow
}
// Finally write the main record; this happens even when translated is empty.
return (main->put(&mk,&md)==0);
}
/*
 * Placeholder for removing the entry stored for original.
 *
 * Not implemented: the main database is queried, but nothing is modified
 * and the function always returns false. The commented-out body below is
 * a copy of the insertion code of addEntry(), not removal code.
 */
bool DataBaseInterface::removeEntry(TQString original)
{
DBItemMainKey mk(original);
DBItemMainData md;
//FIXME implement remove
// Try to fetch the record; the result is currently not acted upon.
if(main->get(&mk,&md)==DB_NOTFOUND)
{
/* //This is a new entry, create index entry
DBItemNum *nind;
int newid=numindex->getLast()+1;
nind=new DBItemNum(newid);
numindex->put(nind,&mk);
delete nind;
md.clear();
md.setIndexnumber(newid);
//Update secondary index alpha
DBItemMainKey ka(simple(original));
DBItemMultiIndex in;
if(alpha->get(&ka,&in)==DB_NOTFOUND) in.clear() ;
//alpha->get(&ka,&in);
in.addEntry(newid);
alpha->put(&ka,&in);
//Update words index
TQStringList ws=words(original);
for(TQStringList::iterator it = ws.begin(); it!=ws.end(); it++)
{
DBItemMainKey word(*it);
DBItemMultiIndex win;
if(wordsindex->get(&word,&win)==DB_NOTFOUND) win.clear();
win.addEntry(newid);
wordsindex->put(&word,&win);
}
//Update sentence index
TQStringList so=sentences(original);
TQStringList st=sentences(translated);
if(so.count()==st.count() && st.count() >1 ) //we already hav a database for single string.
{
for(int i=0; i< so.count() ; i++)
{
DBItemMainKey sk(so[i]);
DBItemMainKey sd(st[i]); //should be a list i.e. main data?
sentence->put(&sk,&sd);
}
}
*/
}
// Nothing is ever removed, so failure is always reported.
return false;
}
/*
 * Scores the words that appear in translations of entries containing word.
 *
 * word    - word to analyse; it is normalised with simple() first.
 * filter  - handed on to get() for every entry that is looked up.
 * notify  - not used.
 * minSign - only words whose score is greater than this value are returned.
 *
 * For every entry listed under the word in the words index, each
 * translation contributes a weight of 100000/sqrt(10*wordCount+1) to those
 * of its words that occur exactly as often as the searched word occurs in
 * the original. The accumulated weight is divided by a term derived from
 * the accumulated "background" weight.
 *
 * Returns a map word -> score; empty when the word is not indexed.
 *
 * The key/data objects now live on the stack and both dictionaries own
 * their counters (auto-delete), so nothing allocated here is leaked.
 */
TQMap<TQString,float> DataBaseInterface::correlation(TQString word,SearchFilter *filter,bool notify, float minSign)
{
    TQDict<unsigned int> res;
    res.setAutoDelete(true);
    TQMap<TQString, float> final;
    unsigned int background=0;

    const TQString sword=simple(word);
    DBItemMainKey k(sword);
    DBItemMultiIndex d = DBItemMultiIndex();
    if(wordsindex->get(&k,&d)==DB_NOTFOUND)
        return final;

    DBItemMultiIndex::IndexList il=d.getList();
    kdDebug(0) << il.count()<<endl;

    for(TQValueList<unsigned int>::iterator it=il.begin();it!=il.end();++it)
    {
        // Resolve the record number to the original string, then to its entry.
        numindex->get(*it,&k);
        const TQString source=k.getString();
        MainEntry e=get(source,filter);
        TQStringList trad=e.second.getTranslations();
        // How many times the searched word occurs in the original.
        const unsigned int nocck=words(source).contains(sword);

        for(TQStringList::iterator it2=trad.begin();it2!=trad.end();++it2)
        {
            TQStringList w=words(*it2);
            unsigned int numWords = w.count()*10+1;
            unsigned int wei=100000/sqrt(numWords); //weight (is the best one?)
            background+=(numWords-nocck)*wei;

            // Occurrences of each word inside this translation.
            TQDict<uint> count;
            count.setAutoDelete(true);
            for(TQStringList::iterator it1=w.begin();it1!=w.end();++it1)
            {
                uint *ip=count[*it1];
                if(!ip)
                    count.insert(*it1,new uint(1));
                else
                    (*ip)++;
            }

            for(TQStringList::iterator it1=w.begin();it1!=w.end();++it1)
            {
                //add only if same number of entry (it cuts articles)
                if(*(count[*it1])!=nocck)
                    continue;
                uint *ip=res[*it1];
                if(!ip)
                    res.insert(*it1,new uint(wei));
                else
                    (*ip)+=wei;
            }
        }
    }

    unsigned int sqrBG=sqrt((1.0*background+1)/10000);
    for(TQDictIterator<uint> it(res) ; it.current(); ++it)
    {
        float sign=1.0*(*(it.current()))/(10000.0*sqrBG);
        if(sign >minSign){
            final[it.currentKey()]=sign;
            kdDebug(0) << it.currentKey() <<" Score:" << 1.0*(*(it.current()))/10000 << "/" <<sqrBG << " = " <<sign << endl;
        }
    }
    kdDebug(0) << "final count " <<final.count()<< endl;
    return final;
}
/*
 * Splits s into words: the string is first normalised with simple() and
 * then cut at every whitespace character. Empty pieces are dropped.
 */
TQStringList DataBaseInterface::words(TQString s)
{
    const TQRegExp blank("\\s");
    TQString rest=simple(s);
    TQStringList result;
    int cut;
    do {
        cut=rest.find(blank);
        // With cut == -1 left() yields the whole remaining string.
        const TQString head=rest.left(cut);
        if(!head.isEmpty())
            result.append(head);
        rest.remove(0,cut+1);
    } while(!rest.isEmpty() && cut != -1);
    return result;
}
/*
 * Reduces str to a simplified form used as index key.
 *
 * str - text to simplify.
 * ck  - "case keep": when false the text is lower-cased first.
 *
 * Enclosing tags matching <x>...</x> are removed, letters, digits and '%'
 * are kept, and each run of whitespace, '-', '\'' or '_' that follows a
 * kept character becomes a single blank. Every other character is dropped.
 * The result never starts or ends with a blank.
 *
 * The trailing blank used to be tested at an index taken from the input
 * length rather than from the result, so it survived whenever any
 * character had been dropped.
 * NOTE(review): keys written to existing databases by the old code may
 * still carry such a trailing blank - confirm whether that matters.
 */
TQString DataBaseInterface::simple(TQString str,bool ck)
{
    TQString res;
    if(ck)
        res=str; //case keep
    else
        res=str.lower(); //lowercase

    //FIXME: check the speed of this regexp
    res.replace(TQRegExp("(<(.*)>)(.*)(</\\2>)"),"\\3"); //remove enclosing tags

    // Hand-written filter instead of further regexps.
    TQString r;
    TQChar c;
    bool wasSpace=true; // starts true so that no leading blank is produced
    const uint len=res.length();
    for(uint i=0; i<len; i++)
    {
        c=res[i];
        if(c.isLetterOrNumber() || c=='%')
        {
            r+=c;
            wasSpace=false;
        }
        else if(!wasSpace && (c.isSpace() || c=='-' || c=='\'' || c=='_'))
        {
            r+=' ';
            wasSpace=true;
        }
    }

    // At most one blank can be pending at the end; drop it.
    if(!r.isEmpty() && r[r.length()-1].isSpace())
        r.truncate(r.length()-1);
    return r;
}
/*
 * Splits s into sentences. A sentence ends at one of . ; ? ! : followed by
 * a blank, the end of the string, or the sequence matched by "\\\\n\\n" in
 * the pattern. Each piece is stripped of surrounding whitespace; empty
 * pieces are dropped. Every piece is also written to the debug output.
 */
TQStringList DataBaseInterface::sentences(TQString s)
{
    TQRegExp terminator("((\\.|;|\\?|\\!|:)( |$|\\\\n\\n))");
    TQString rest=s;
    TQStringList result;
    int at;
    do {
        at=terminator.search(rest);
        // With at == -1 left() yields the whole remaining string.
        const TQString chunk=rest.left(at);
        if(!chunk.isEmpty())
            result.append(chunk.stripWhiteSpace());
        kdDebug(0) << chunk << endl;
        rest.remove(0,at+terminator.cap(1).length());
    } while(!rest.isEmpty() && at != -1);
    return result;
}
/*
 * Returns, in order of appearance, the separators found in s, i.e. every
 * match of one of . : ? ! ; together with what follows it according to
 * the pattern (a blank, the end of the string, or the "\\\\n\\n" sequence).
 */
TQStringList DataBaseInterface::sentencesSeparator(TQString s)
{
    TQRegExp sep("([.:?!;]( |$|\\\\n\\n))");
    TQString rest=s;
    TQStringList found;
    int at;
    do {
        at=sep.search(rest);
        const TQString separator=sep.cap(1);
        if(at!=-1)
            found.append(separator);
        // Continue right after the punctuation character.
        rest.remove(0,at+1);
    } while(!rest.isEmpty() && at != -1);
    return found;
}
/* True when upper-casing s leaves it unchanged. */
bool DataBaseInterface::isUpper(TQChar s)
{
    const TQChar upperForm=s.upper();
    return upperForm==s;
}
/* True when lower-casing s leaves it unchanged. */
bool DataBaseInterface::isLower(TQChar s)
{
    const TQChar lowerForm=s.lower();
    return lowerForm==s;
}
/*
 * Shapes the string _s after the template t and returns the result.
 *
 * - The first character of the result is upper- or lower-cased like the
 *   first word character of t.
 * - If t is entirely upper case, so is the result.
 * - If t contains an accelerator ("&x"), an '&' is inserted in front of
 *   the first occurrence of x in the result, ignoring case, or at the
 *   very beginning when x does not occur.
 * - Trailing punctuation and enclosing tags of t are carried over through
 *   formatRegExp().
 *
 * Fixes compared to the previous version:
 * - the accelerator search passed `false` as the start index, so it was
 *   case-sensitive; it now really ignores case;
 * - an empty _s is no longer grown by one character when its first
 *   character is re-cased.
 */
TQString DataBaseInterface::format(TQString _s,TQString t)
{
    //FIXME use settings
    //FIXME use regexp
    TQString s=_s;

    // TQString::replace() works in place: from here on t itself no longer
    // has its enclosing tags.
    // NOTE(review): the second formatRegExp() template below expects tags
    // in t, so it looks like it can hardly ever match - confirm the intent.
    t.replace(TQRegExp("(<(.*)>)(.*)(</\\2>)"),"\\3");

    // When t has no word character the index is -1 and the character read
    // is the null character, for which isUpper() is true.
    const TQChar first=t[t.find(TQRegExp("\\w"))];
    const bool firstCapital=isUpper(first);
    const bool allupper=(t.upper()==t);

    if(!s.isEmpty())
    {
        if(firstCapital)
            s[0]=s[0].upper();
        else
            s[0]=s[0].lower();
    }
    if(allupper)
        s=s.upper();

    const int ampPos=t.find('&');
    if(ampPos>=0)
    {
        const TQChar accel=t[ampPos+1];
        if(accel!='&') // "&&" is not an accelerator
        {
            int pos=s.find(accel,0,false);
            if(pos<0)
                pos=0;
            s.insert(pos,"&");
        }
    }

    // Copy trailing "...", ":", ">>", "<<", "." or "?" from the template.
    s=formatRegExp(s,t,".*(\\.\\.\\.|:|>>|<<|\\.|\\?)$",
                   "^(.*)$",
                   "\\1@CAP1@");
    // Copy enclosing tags (and punctuation inside them) from the template.
    s=formatRegExp(s,t,"(<(.*)>).*(\\.\\.\\.|:|>>|<<|\\.|\\?)*(</\\2>)$",
                   "^(.*)$",
                   "@CAP1@\\1@CAP3@@CAP4@");
    return s;
}
/*
 * Applies a search/replace to _s that is parameterised by the template t.
 *
 * tre           - regular expression that must match the whole of t.
 * stringSearch  - search pattern applied to _s.
 * stringReplace - replacement; \1, \2 ... refer to captures of
 *                 stringSearch.
 *
 * In both stringSearch and stringReplace every "@CAPn@" is replaced by
 * the n-th captured text of tre (n = 0 is the whole match). When tre does
 * not match t exactly, _s is returned unchanged.
 */
TQString DataBaseInterface::formatRegExp(TQString _s, TQString t, TQString tre,TQString stringSearch,TQString stringReplace)
{
    TQRegExp templateRegExp(tre);
    if(!templateRegExp.exactMatch(t))
        return _s;

    const TQStringList caps=templateRegExp.capturedTexts();
    int capNo=0;
    for(TQStringList::const_iterator cap=caps.begin(); cap!=caps.end(); ++cap, ++capNo)
    {
        // Placeholder for capture number capNo.
        const TQRegExp placeholder("(?!\\\\)@CAP"+TQString::number(capNo)+"@");
        stringReplace.replace(placeholder,*cap);
        stringSearch.replace(placeholder,*cap);
    }

    TQString s=_s;
    s.replace(TQRegExp(stringSearch),stringReplace);
    return s;
}
/*
 * Returns the list of record numbers stored in the alpha index under the
 * simplified form of query. The status of the lookup is not inspected.
 *
 * Key and data objects live on the stack; they used to be allocated with
 * new and never freed.
 */
DBItemMultiIndex::IndexList DataBaseInterface::getAlpha( const TQString & query )
{
    DBItemMainKey k(simple(query));
    DBItemMultiIndex d = DBItemMultiIndex();
    alpha->get(&k,&d);
    return d.getList();
}
/*
 * Returns the main entry whose record number is i: the number is resolved
 * to a key through numindex and the key is then looked up with get().
 */
DataBaseInterface::MainEntry DataBaseInterface::getFromIndex( uint i )
{
    DBItemMainKey key;
    numindex->get(i,&key);
    //FIXME: this is a BUG right now but the filter should be removed
    const TQString original=key.getString();
    return get(original,0);
}
/*
 * Looks up query in the sentence database and returns the (key, data)
 * pair; the status of the lookup is not inspected. On every fifth call
 * pending GUI events are processed.
 */
DataBaseInterface::MainEntry DataBaseInterface::getSentence( const TQString & query )
{
    static int counter=1;

    DBItemMainKey key(query);
    DBItemMainData data;
    sentence->get(&key,&data);

    ++counter;
    if(counter%5==0)
        kapp->processEvents(100);

    return tqMakePair(key,data);
}
/*
 * Returns the list of record numbers stored in the words index under
 * query, or an empty list when the word is not indexed.
 */
DBItemMultiIndex::IndexList DataBaseInterface::getWordIndex( const TQString & query )
{
    DBItemMainKey key = DBItemMainKey(query);
    DBItemMultiIndex hits = DBItemMultiIndex();
    if(wordsindex->get(&key,&hits)==DB_NOTFOUND)
    {
        TQValueList<unsigned int> none;
        return none;
    }
    return hits.getList();
}
//#include "database.moc.cpp"