|
|
|
/***************************************************************************
|
|
|
|
* Copyright (C) 2008 by Jacob Kanev <j_kanev@arcor.de>, *
|
|
|
|
* Thomas Fischer <fischer@unix-ag.uni-kl.de> *
|
|
|
|
* *
|
|
|
|
* This program is free software; you can redistribute it and/or modify *
|
|
|
|
* it under the terms of the GNU General Public License as published by *
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or *
|
|
|
|
* (at your option) any later version. *
|
|
|
|
* *
|
|
|
|
* This program is distributed in the hope that it will be useful, *
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
|
|
* GNU General Public License for more details. *
|
|
|
|
* *
|
|
|
|
* You should have received a copy of the GNU General Public License *
|
|
|
|
* along with this program; if not, write to the *
|
|
|
|
* Free Software Foundation, Inc., *
|
|
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
|
|
|
|
***************************************************************************/
|
|
|
|
#include <tqfile.h>
|
|
|
|
#include <tqregexp.h>
|
|
|
|
#include <tqbuffer.h>
|
|
|
|
#include <tqspinbox.h>
|
|
|
|
|
|
|
|
#include <tdelocale.h>
|
|
|
|
#include <klineedit.h>
|
|
|
|
#include <tdemessagebox.h>
|
|
|
|
#include <kurl.h>
|
|
|
|
#include <kdebug.h>
|
|
|
|
|
|
|
|
#include "fileimporterbibtex.h"
|
|
|
|
#include "encoderxml.h"
|
|
|
|
#include "settings.h"
|
|
|
|
#include "webqueryciteseerx.h"
|
|
|
|
|
|
|
|
using BibTeX::Value;
|
|
|
|
using BibTeX::Entry;
|
|
|
|
using BibTeX::EntryField;
|
|
|
|
|
|
|
|
namespace KBibTeX
|
|
|
|
{
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// Construct widget
|
|
|
|
|
|
|
|
// Sets up the CiteSeerX query widget and pre-fills its query line edit with
// the value the settings object returns for the key "CiteSeerX".
WebQueryCiteSeerXWidget::WebQueryCiteSeerXWidget( TQWidget *parent, const char *name )
        : WebQueryWidget( parent, name )
{
    init();

    // a null string coming back from the settings is replaced by an empty one
    TQString storedQuery = Settings::self()->getWebQueryDefault( "CiteSeerX" );
    if ( storedQuery == TQString::null )
        storedQuery = "";

    lineEditQuery->setText( storedQuery );
    slotTextChanged( storedQuery, true );
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// Construct
|
|
|
|
|
|
|
|
// Creates the CiteSeerX query object together with its GUI widget.
// parent is handed both to the WebQuery base class and to the widget.
WebQueryCiteSeerX::WebQueryCiteSeerX( TQWidget* parent )
        : WebQuery( parent ), m_citeSeerXServer( "citeseerx.ist.psu.edu" )
{
    // Give the hit counters a defined start value: within this file nothing
    // assigns m_receivedHits before parseSummaryPage() pre-increments it.
    m_desiredHits = 0;
    m_receivedHits = 0;

    m_widget = new WebQueryCiteSeerXWidget( parent );
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// Destroy
|
|
|
|
|
|
|
|
// Frees the widget that was allocated in the constructor.
WebQueryCiteSeerX::~WebQueryCiteSeerX()
{
    delete m_widget;
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// GUI string
|
|
|
|
|
|
|
|
// Returns the translated display name of this web query.
TQString WebQueryCiteSeerX::title()
{
    TQString caption = i18n( "CiteSeerX" );
    return caption;
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// GUI info
|
|
|
|
|
|
|
|
// Returns the translated text for the disclaimer; the matching address is
// provided by disclaimerURL().
TQString WebQueryCiteSeerX::disclaimer()
{
    TQString linkText = i18n( "About CiteSeerX" );
    return linkText;
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// URL for disclaimer
|
|
|
|
|
|
|
|
// Returns the address of the CiteSeerX "about" page.
TQString WebQueryCiteSeerX::disclaimerURL()
{
    return TQString( "http://citeseerx.ist.psu.edu/about/site" );
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// return pointer to widget
|
|
|
|
|
|
|
|
// Gives access to the widget created in the constructor; the pointer is
// returned as-is, nothing is allocated here.
WebQueryWidget *WebQueryCiteSeerX::widget()
{
    WebQueryWidget *queryWidget = m_widget;
    return queryWidget;
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// user has pressed "Cancel"
|
|
|
|
|
|
|
|
void WebQueryCiteSeerX::cancelQuery()
|
|
|
|
{
|
|
|
|
m_queryQueue.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// main function -- collects all queries for one search
|
|
|
|
|
|
|
|
void WebQueryCiteSeerX::query()
|
|
|
|
{
|
|
|
|
// store CiteSeerX as future default
|
|
|
|
WebQuery::query();
|
|
|
|
Settings *settings = Settings::self();
|
|
|
|
settings->setWebQueryDefault( "CiteSeerX", m_widget->lineEditQuery->text() );
|
|
|
|
|
|
|
|
// read number of desired results from GUI
|
|
|
|
m_queryQueue.clear();
|
|
|
|
m_desiredHits = m_widget->spinBoxMaxHits->value();
|
|
|
|
// one for each entry, and one for each page of 10 links
|
|
|
|
setNumStages( m_desiredHits + ( m_desiredHits / 10 + 1 ) );
|
|
|
|
|
|
|
|
// prepare search term
|
|
|
|
TQString searchTerm = m_widget->lineEditQuery->text().stripWhiteSpace().replace( '$', "" );
|
|
|
|
TQStringList queryWords = TQStringList::split( TQRegExp( "\\s+" ), searchTerm );
|
|
|
|
|
|
|
|
if ( searchTerm.isEmpty() || queryWords.size() == 0 )
|
|
|
|
{
|
|
|
|
setEndSearch( WebQuery::statusInvalidQuery );
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// build query from search term
|
|
|
|
TQString query;
|
|
|
|
|
|
|
|
for ( uint i = 0; i < queryWords.size(); ++i )
|
|
|
|
{
|
|
|
|
if ( i ) query += " AND ";
|
|
|
|
|
|
|
|
query += queryWords[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
query = query.replace( "%", "%25" ).replace( "+", "%2B" ).replace( " ", "%20" ).replace( "#", "%23" ).replace( "&", "%26" ).replace( "?", "%3F" );
|
|
|
|
|
|
|
|
// schedule jobs
|
|
|
|
DataRequest dr;
|
|
|
|
dr.url = KURL( TQString( "http://citeseerx.ist.psu.edu/search?q=" ).append( query ).append( "&submit=Search&sort=rel" ) );
|
|
|
|
dr.parser = &WebQueryCiteSeerX::parseSummaryPage;
|
|
|
|
m_queryQueue.push_back( dr );
|
|
|
|
|
|
|
|
// start job queue
|
|
|
|
nextJob();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// process results from current job
|
|
|
|
|
|
|
|
void WebQueryCiteSeerX::parseSummaryPage( const TQString& data )
|
|
|
|
{
|
|
|
|
// regexp. for finding paper entries (example: href="/viewdoc/summary;jsessionid=12345ABCD?doi=10.1.1.108.9937")
|
|
|
|
TQRegExp paperXpr( "href=\"(/viewdoc/summary[^?]*\\?doi=[^\"]+)\"" );
|
|
|
|
|
|
|
|
// count paper results and schedule single paper URLs
|
|
|
|
|
|
|
|
for ( int p = paperXpr.search( data ); p >= 0; p = paperXpr.search( data, p + paperXpr.matchedLength() ) )
|
|
|
|
{
|
|
|
|
if ( ++m_receivedHits > m_desiredHits )
|
|
|
|
break;
|
|
|
|
|
|
|
|
DataRequest dr;
|
|
|
|
|
|
|
|
dr.url = KURL( TQString( "http://" ) + m_citeSeerXServer + paperXpr.cap( 1 ) );
|
|
|
|
|
|
|
|
dr.parser = &WebQueryCiteSeerX::parsePaperPage;
|
|
|
|
|
|
|
|
m_queryQueue.push_back( dr );
|
|
|
|
}
|
|
|
|
|
|
|
|
// if we haven't reached the desired number of hits, schedule the next summary page
|
|
|
|
TQRegExp nextSummaryXpr( "<a href=\"([^\"]+)\">Next 10" );
|
|
|
|
|
|
|
|
if ( m_receivedHits < m_desiredHits )
|
|
|
|
if ( nextSummaryXpr.search( data ) >= 0 )
|
|
|
|
{
|
|
|
|
DataRequest dr;
|
|
|
|
dr.url = KURL( TQString( "http://" ) + m_citeSeerXServer + nextSummaryXpr.cap( 1 ).replace( "&", "&" ) );
|
|
|
|
dr.parser = &WebQueryCiteSeerX::parseSummaryPage;
|
|
|
|
m_queryQueue.push_back( dr );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// process the result of one single paper link
|
|
|
|
|
|
|
|
void WebQueryCiteSeerX::parsePaperPage( const TQString& data )
|
|
|
|
{
|
|
|
|
// find type and id: @XXX{ YYY
|
|
|
|
TQRegExp typeIdXpr( "@(.*)\\{(.*)," );
|
|
|
|
typeIdXpr.setMinimal( true );
|
|
|
|
typeIdXpr.search( data );
|
|
|
|
TQString typeStr = typeIdXpr.cap( 1 );
|
|
|
|
TQString id = typeIdXpr.cap( 2 );
|
|
|
|
|
|
|
|
// create entry
|
|
|
|
Entry *entry = new BibTeX::Entry( typeIdXpr.cap( 1 ), typeIdXpr.cap( 2 ) );
|
|
|
|
|
|
|
|
// find abstract: <..>Abstract:</..> <..> XXX </..>
|
|
|
|
parseForSingleExpression( "<[^<]+>Abstract:</[^<]+>\\s*<[^<]+>([^<]+)</[^<]+>", data, entry, BibTeX::EntryField::ftAbstract );
|
|
|
|
|
|
|
|
// find title: title = {XXX}
|
|
|
|
parseForSingleExpression( "title = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftTitle );
|
|
|
|
|
|
|
|
// find author: author = {XXX}
|
|
|
|
parseForSingleExpression( "author = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftAuthor );
|
|
|
|
|
|
|
|
// find year: year = {XXX}
|
|
|
|
parseForSingleExpression( "year = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftYear );
|
|
|
|
|
|
|
|
// find journal: journal = {XXX}
|
|
|
|
parseForSingleExpression( "journal = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftJournal );
|
|
|
|
|
|
|
|
// find pages: pages = {XXX}
|
|
|
|
parseForSingleExpression( "pages = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftPages );
|
|
|
|
|
|
|
|
// publish what we've found
|
|
|
|
emit foundEntry( entry, false );
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// find single bibtex field in html page and add to entry
|
|
|
|
|
|
|
|
// Looks for one field in the page text and, if present, attaches it to entry.
//   description  regular expression; its first capture group is the field text
//   data         text to search in
//   entry        entry receiving the new field
//   type         field type of the new field
// Nothing is added when the expression does not match.
void WebQueryCiteSeerX::parseForSingleExpression( TQString description, const TQString &data, Entry *entry, BibTeX::EntryField::FieldType type )
{
    TQRegExp fieldXpr( description );

    // a negative search result means "no match": leave the entry untouched
    if ( fieldXpr.search( data ) < 0 )
        return;

    EntryField *newField = new EntryField( type );
    Value *fieldValue = new Value( fieldXpr.cap( 1 ), false );
    newField->setValue( fieldValue );
    entry->addField( newField );
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// read data from the job and start the current parser
|
|
|
|
|
|
|
|
// Slot connected in nextJob() to the result signal of the running job.
// Advances the progress display, passes the downloaded text to the parser
// selected for this job and then continues with the next queued request.
// Jobs that failed, were aborted or are not stored transfer jobs are skipped
// without parsing.
void WebQueryCiteSeerX::getData( TDEIO::Job *job )
{
    // advance GUI progress bar
    enterNextStage();

    if ( job && !job->error() && !m_aborted )
    {
        // only a stored transfer job carries the downloaded data; check the
        // cast instead of dereferencing a possible null pointer
        TDEIO::StoredTransferJob *storedJob = dynamic_cast<TDEIO::StoredTransferJob*>( job );

        if ( storedJob )
        {
            // read data: copy the raw bytes into a buffer and read them back as text
            TQBuffer data;
            data.open( IO_WriteOnly );
            data.writeBlock( storedJob->data() );
            data.close();
            data.open( IO_ReadOnly );
            TQTextStream ts( &data );
            TQString result = ts.read();
            data.close();

            // hand the read data over to the parser
            ( this->*m_currentParser )( result );
        }
    }

    // proceed
    nextJob();
}
|
|
|
|
|
|
|
|
|
|
|
|
//_______________________________________________________________________________________________________________
|
|
|
|
// call the next job
|
|
|
|
|
|
|
|
void WebQueryCiteSeerX::nextJob()
|
|
|
|
{
|
|
|
|
// no more requests: finished
|
|
|
|
if ( !m_queryQueue.size() )
|
|
|
|
{
|
|
|
|
setEndSearch( WebQuery::statusSuccess );
|
|
|
|
m_receivedHits = 0;
|
|
|
|
}
|
|
|
|
// else: take the next request from queue and start it
|
|
|
|
else if ( !m_aborted )
|
|
|
|
{
|
|
|
|
m_currentParser = m_queryQueue.front().parser;
|
|
|
|
TDEIO::Job *job = TDEIO::storedGet( m_queryQueue.front().url, FALSE, FALSE );
|
|
|
|
m_queryQueue.pop_front();
|
|
|
|
connect( job, SIGNAL( result( TDEIO::Job * ) ), this, SLOT( getData( TDEIO::Job * ) ) );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#include "webqueryciteseerx.moc"
|