/***************************************************************************
    copyright            : (C) 2007 by Robby Stephenson
    email                : robby@periapsis.org
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of version 2 of the GNU General Public License as  *
 *   published by the Free Software Foundation;                            *
 *                                                                         *
 ***************************************************************************/

#include "pdfimporter.h"
#include "tellicoimporter.h"
#include "xslthandler.h"
#include "../collections/bibtexcollection.h"
#include "../xmphandler.h"
#include "../filehandler.h"
#include "../imagefactory.h"
#include "../tellico_kernel.h"
#include "../fetch/fetchmanager.h"
#include "../fetch/crossreffetcher.h"
#include "../tellico_utils.h"
#include "../progressmanager.h"
#include "../core/netaccess.h"
#include "../tellico_debug.h"

// NOTE(review): the <...> targets of the four system includes below were
// missing from the copy under review. The names here are reconstructed from
// the symbols this file uses (::locate, KMessageBox, HAVE_POPPLER, Poppler::*)
// and must be confirmed against the build tree before merging.
#include <kstandarddirs.h>
#include <tdemessagebox.h>

#include <config.h>
#ifdef HAVE_POPPLER
#include <poppler-tqt.h>
#endif

namespace {
  // Size passed to NetAccess::filePreview() when creating the cover image.
  static const int PDF_FILE_PREVIEW_SIZE = 196;
}

using Tellico::Import::PDFImporter;

/**
 * Creates an importer for the given list of PDF file URLs.
 */
PDFImporter::PDFImporter(const KURL::List& urls_) : Importer(urls_), m_cancelled(false) {
}

/**
 * Only bibliographic (Bibtex) collections can be the import target.
 */
bool PDFImporter::canImport(int type_) const {
  return type_ == Data::Collection::Bibtex;
}

/**
 * Builds a collection with one entry per readable URL.
 *
 * For every file the entry is populated from, in order:
 *  - the XMP packet, transformed through xmp2tellico.xsl,
 *  - the Poppler document info (Title, Author, Keywords) and a DOI / arXiv id
 *    scraped from the text of the first page (only when HAVE_POPPLER is set),
 *  - the file URL, a fixed "article" entry type and a preview image.
 * Afterwards, entries are passed to the DOI and arXiv update fetchers if any
 * such identifier was seen, and entries still lacking a title get the file
 * name as title.
 *
 * @return the collection, or a null pointer when the stylesheet can not be
 *         located or is invalid, when the import was cancelled, or when no
 *         URL could be opened at all.
 */
Tellico::Data::CollPtr PDFImporter::collection() {
  TQString xsltfile = ::locate("appdata", TQString::fromLatin1("xmp2tellico.xsl"));
  if(xsltfile.isEmpty()) {
    kdWarning() << "PDFImporter::collection() - can not locate xmp2tellico.xsl" << endl;
    return 0;
  }

  ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true);
  item.setTotalSteps(urls().count());
  connect(&item, TQ_SIGNAL(signalCancelled(ProgressItem*)), TQ_SLOT(slotCancel()));
  // NOTE(review): presumably marks the progress item as finished when this
  // function returns - confirm against ProgressItem::Done
  ProgressItem::Done done(this);
  const bool showProgress = options() & ImportProgress;

  KURL u;
  u.setPath(xsltfile);

  XSLTHandler xsltHandler(u);
  if(!xsltHandler.isValid()) {
    kdWarning() << "PDFImporter::collection() - invalid xslt in xmp2tellico.xsl" << endl;
    return 0;
  }

  // remember whether any entry carries an identifier worth looking up later
  bool hasDOI = false;
  bool hasArxiv = false;

  uint j = 0;

  Data::CollPtr coll;
  XMPHandler xmpHandler;
  KURL::List list = urls();
  for(KURL::List::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) {
    FileHandler::FileRef* ref = FileHandler::fileRef(*it);
    if(!ref) {
      continue;
    }

    Data::CollPtr newColl;
    Data::EntryPtr entry;

    TQString xmp = xmpHandler.extractXMP(ref->fileName());
    //    myDebug() << xmp << endl;
    if(xmp.isEmpty()) {
      setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
    } else {
      setStatusMessage(TQString());

      Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp));
      newColl = importer.collection();
      if(!newColl || newColl->entryCount() == 0) {
        kdWarning() << "PDFImporter::collection() - no collection found" << endl;
        setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
      } else {
        entry = newColl->entries().front();
        hasDOI |= !entry->field(TQString::fromLatin1("doi")).isEmpty();
      }
    }

    // fall back to a fresh collection and entry when XMP yielded nothing
    if(!newColl) {
      newColl = new Data::BibtexCollection(true);
    }
    if(!entry) {
      entry = new Data::Entry(newColl);
      newColl->addEntries(entry);
    }

#ifdef HAVE_POPPLER
    // now load from poppler
    Poppler::Document* doc = Poppler::Document::load(ref->fileName());
    if(doc && !doc->isLocked()) {
      // now the question is, do we overwrite XMP data with Poppler data?
      // for now, let's say yes conditionally
      TQString s = doc->getInfo(TQString::fromLatin1("Title")).simplifyWhiteSpace();
      if(!s.isEmpty()) {
        entry->setField(TQString::fromLatin1("title"), s);
      }
      // author could be separated by commas, "and" or whatever
      // we're not going to overwrite it
      if(entry->field(TQString::fromLatin1("author")).isEmpty()) {
        TQRegExp rx(TQString::fromLatin1("\\s*(and|,|;)\\s*"));
        TQStringList authors = TQStringList::split(rx, doc->getInfo(TQString::fromLatin1("Author")).simplifyWhiteSpace());
        entry->setField(TQString::fromLatin1("author"), authors.join(TQString::fromLatin1("; ")));
      }
      s = doc->getInfo(TQString::fromLatin1("Keywords")).simplifyWhiteSpace();
      if(!s.isEmpty()) {
        // keywords are also separated by semi-colons in poppler
        entry->setField(TQString::fromLatin1("keyword"), s);
      }

      // now parse the first page text and try to guess
      Poppler::Page* page = doc->getPage(0);
      if(page) {
        // a null rectangle means get all text on page
        TQString text = page->getText(Poppler::Rectangle());
        // borrowed from Referencer
        TQRegExp rx(TQString::fromLatin1("(?:"
                                         "(?:[Dd][Oo][Ii]:? *)"
                                         "|"
                                         "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)"
                                         ")"
                                         "("
                                         "[^\\.\\s]+"
                                         "\\."
                                         "[^\\/\\s]+"
                                         "\\/"
                                         "[^\\s]+"
                                         ")"));
        if(rx.search(text) > -1) {
          TQString doi = rx.cap(1);
          myDebug() << "PDFImporter::collection() - in PDF file, found DOI: " << doi << endl;
          entry->setField(TQString::fromLatin1("doi"), doi);
          hasDOI = true;
        }
        rx = TQRegExp(TQString::fromLatin1("arXiv:"
                                           "("
                                           "[^\\/\\s]+"
                                           "[\\/\\.]"
                                           "[^\\s]+"
                                           ")"));
        if(rx.search(text) > -1) {
          TQString arxiv = rx.cap(1);
          myDebug() << "PDFImporter::collection() - in PDF file, found arxiv: " << arxiv << endl;
          // the arxiv field has to be added to the collection before it can be set
          if(entry->collection()->fieldByName(TQString::fromLatin1("arxiv")) == 0) {
            Data::FieldPtr field = new Data::Field(TQString::fromLatin1("arxiv"), i18n("arXiv ID"));
            field->setCategory(i18n("Publishing"));
            entry->collection()->addField(field);
          }
          entry->setField(TQString::fromLatin1("arxiv"), arxiv);
          hasArxiv = true;
        }

        delete page;
      }
    } else {
      myDebug() << "PDFImporter::collection() - unable to read PDF info (poppler)" << endl;
    }
    delete doc;
#endif

    entry->setField(TQString::fromLatin1("url"), (*it).url());
    // always an article?
    entry->setField(TQString::fromLatin1("entry-type"), TQString::fromLatin1("article"));

    TQPixmap pix = NetAccess::filePreview(ref->fileName(), PDF_FILE_PREVIEW_SIZE);
    delete ref; // removes temp file

    if(!pix.isNull()) {
      // is png best option?
      TQString id = ImageFactory::addImage(pix, TQString::fromLatin1("PNG"));
      if(!id.isEmpty()) {
        // prefer a "cover" field, then any existing image field, else create one
        Data::FieldPtr field = newColl->fieldByName(TQString::fromLatin1("cover"));
        if(!field && !newColl->imageFields().isEmpty()) {
          field = newColl->imageFields().front();
        } else if(!field) {
          field = new Data::Field(TQString::fromLatin1("cover"), i18n("Front Cover"), Data::Field::Image);
          newColl->addField(field);
        }
        entry->setField(field, id);
      }
    }

    // the first per-file collection becomes the result, later ones are merged in
    if(coll) {
      coll->addEntries(newColl->entries());
    } else {
      coll = newColl;
    }

    if(showProgress) {
      ProgressManager::self()->setProgress(this, j);
      kapp->processEvents();
    }
  }

  if(m_cancelled) {
    return 0;
  }

  // coll is only assigned inside the loop, so it is still null when the URL
  // list was empty or no URL yielded a file reference; everything below
  // dereferences it
  if(!coll) {
    return 0;
  }

  if(hasDOI) {
    myDebug() << "looking for DOI" << endl;
    Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI);
    if(vec.isEmpty()) {
      GUI::CursorSaver cs(TQt::arrowCursor);
      KMessageBox::information(Kernel::self()->widget(),
                               i18n("Tellico is able to download information about entries with a DOI from "
                                    "CrossRef.org. However, you must create an CrossRef account and add a new "
                                    "data source with your account information."),
                               TQString(),
                               TQString::fromLatin1("CrossRefSourceNeeded"));
    } else {
      Data::EntryVec entries = coll->entries();
      for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) {
        for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
          fetcher->updateEntrySynchronous(entry);
        }
      }
    }
  }

  if(m_cancelled) {
    return 0;
  }

  if(hasArxiv) {
    Data::EntryVec entries = coll->entries();
    Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID);
    for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) {
      for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
        fetcher->updateEntrySynchronous(entry);
      }
    }
  }

  // finally
  Data::EntryVec entries = coll->entries();
  for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
    if(entry->title().isEmpty()) {
      // use file name
      KURL fileUrl = entry->field(TQString::fromLatin1("url"));
      entry->setField(TQString::fromLatin1("title"), fileUrl.fileName());
    }
  }

  if(m_cancelled) {
    return 0;
  }

  return coll;
}

/**
 * Requests cancellation; collection() checks the flag between files and
 * between the lookup phases.
 */
void PDFImporter::slotCancel() {
  m_cancelled = true;
}

#include "pdfimporter.moc"