You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tellico/src/fetch/imdbfetcher.cpp

1211 lines
41 KiB

/***************************************************************************
copyright : (C) 2004-2006 by Robby Stephenson
email : robby@periapsis.org
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of version 2 of the GNU General Public License as *
* published by the Free Software Foundation; *
* *
***************************************************************************/
#include "imdbfetcher.h"
#include "../tellico_kernel.h"
#include "../collections/videocollection.h"
#include "../entry.h"
#include "../field.h"
#include "../filehandler.h"
#include "../latin1literal.h"
#include "../imagefactory.h"
#include "../tellico_utils.h"
#include "../gui/listboxtext.h"
#include "../tellico_debug.h"
#include <tdelocale.h>
#include <kdialogbase.h>
#include <tdeconfig.h>
#include <klineedit.h>
#include <knuminput.h>
#include <tqregexp.h>
#include <tqfile.h>
#include <tqmap.h>
#include <tqvbox.h>
#include <tqlabel.h>
#include <tqlistbox.h>
#include <tqwhatsthis.h>
#include <tqlayout.h>
#include <tqcheckbox.h>
#include <tqvgroupbox.h>
// Uncomment to make the fetcher read canned pages from local files
// instead of querying the IMDb server (see the IMDB_TEST sections below).
//#define IMDB_TEST

namespace {
  // host that is queried unless another one has been configured
  const char* IMDB_SERVER = "akas.imdb.com";
  // size of one batch of results; continueSearch() raises the limit by this amount
  const uint IMDB_MAX_RESULTS = 20;
  // separator placed between the values of a multi-valued field
  const TQString sep = TQString::fromLatin1("; ");
}

using Tellico::Fetch::IMDBFetcher;

// regular expressions shared by all fetcher instances, created by initRegExps()
TQRegExp* IMDBFetcher::s_tagRx = 0;
TQRegExp* IMDBFetcher::s_anchorRx = 0;
TQRegExp* IMDBFetcher::s_anchorTitleRx = 0;
TQRegExp* IMDBFetcher::s_anchorNameRx = 0;
TQRegExp* IMDBFetcher::s_titleRx = 0;
// static
void IMDBFetcher::initRegExps() {
s_tagRx = new TQRegExp(TQString::fromLatin1("<.*>"));
s_tagRx->setMinimal(true);
s_anchorRx = new TQRegExp(TQString::fromLatin1("<a\\s+[^>]*href\\s*=\\s*\"([^\"]*)\"[^<]*>([^<]*)</a>"), false);
s_anchorRx->setMinimal(true);
s_anchorTitleRx = new TQRegExp(TQString::fromLatin1("<a\\s+[^>]*href\\s*=\\s*\"([^\"]*/title/[^\"]*)\"[^<]*>([^<]*)</a>"), false);
s_anchorTitleRx->setMinimal(true);
s_anchorNameRx = new TQRegExp(TQString::fromLatin1("<a\\s+[^>]*href\\s*=\\s*\"([^\"]*/name/[^\"]*)\"[^<]*>([^<]*)</a>"), false);
s_anchorNameRx->setMinimal(true);
s_titleRx = new TQRegExp(TQString::fromLatin1("<title>(.*)</title>"), false);
s_titleRx->setMinimal(true);
}
// Sets up a fetcher with the default host and result limit; the shared
// regular expressions are created the first time a fetcher is constructed.
IMDBFetcher::IMDBFetcher(TQObject* parent_, const char* name_)
    : Fetcher(parent_, name_),
      m_job(0),
      m_started(false),
      m_fetchImages(true),
      m_host(TQString::fromLatin1(IMDB_SERVER)),
      m_limit(IMDB_MAX_RESULTS),
      m_countOffset(0) {
  if(s_tagRx == 0) {
    initRegExps();
  }
}
// Nothing to release here.
IMDBFetcher::~IMDBFetcher() {}
// Returns the translated name used for this source when none is configured.
TQString IMDBFetcher::defaultName() {
  const TQString translated = i18n("Internet Movie Database");
  return translated;
}
// Returns m_name, falling back to the default name when m_name is empty.
TQString IMDBFetcher::source() const {
  if(m_name.isEmpty()) {
    return defaultName();
  }
  return m_name;
}
// Only video collections are supported by this fetcher.
bool IMDBFetcher::canFetch(int type) const {
  const bool isVideo = (type == Data::Collection::Video);
  return isVideo;
}
void IMDBFetcher::readConfigHook(const TDEConfigGroup& config_) {
TQString h = config_.readEntry("Host");
if(!h.isEmpty()) {
m_host = h;
}
m_numCast = config_.readNumEntry("Max Cast", 10);
m_fetchImages = config_.readBoolEntry("Fetch Images", true);
m_fields = config_.readListEntry("Custom Fields");
}
// Starts a new search for value_ by title or by person.
// multiple values not supported
void IMDBFetcher::search(FetchKey key_, const TQString& value_) {
  // remember the request
  m_key = key_;
  m_value = value_;

  // reset everything left over from a previous search
  m_started = true;
  m_redirected = false;
  m_countOffset = 0;
  m_currentTitleBlock = Unknown;
  m_matches.clear();
  m_data.truncate(0);
  m_popularTitles.truncate(0);
  m_exactTitles.truncate(0);
  m_partialTitles.truncate(0);

  // only search if current collection is a video collection
  const bool isVideoCollection = (Kernel::self()->collectionType() == Data::Collection::Video);
  if(!isVideoCollection) {
    myDebug() << "IMDBFetcher::search() - collection type mismatch, stopping" << endl;
    stop();
    return;
  }

#ifdef IMDB_TEST
  // read a canned page; the name page is treated like a redirected (single) result
  const bool titleSearch = (m_key == Title);
  m_url = KURL::fromPathOrURL(TQString::fromLatin1(titleSearch ? "/home/robby/imdb-title.html"
                                                               : "/home/robby/imdb-name.html"));
  m_redirected = !titleSearch;
#else
  m_url = KURL();
  m_url.setProtocol(TQString::fromLatin1("http"));
  if(m_host.isEmpty()) {
    m_url.setHost(TQString::fromLatin1(IMDB_SERVER));
  } else {
    m_url.setHost(m_host);
  }
  m_url.setPath(TQString::fromLatin1("/find"));

  // the "s" query item selects the kind of search
  if(key_ == Title) {
    m_url.addQueryItem(TQString::fromLatin1("s"), TQString::fromLatin1("tt"));
  } else if(key_ == Person) {
    m_url.addQueryItem(TQString::fromLatin1("s"), TQString::fromLatin1("nm"));
  } else {
    kdWarning() << "IMDBFetcher::search() - FetchKey not supported" << endl;
    stop();
    return;
  }

  // as far as I can tell, the url encoding should always be iso-8859-1
  // not utf-8
  m_url.addQueryItem(TQString::fromLatin1("q"), value_, 4 /* iso-8859-1 */);
//  myDebug() << "IMDBFetcher::search() url = " << m_url << endl;
#endif

  // start the download; the slots collect the data and handle the result
  m_job = TDEIO::get(m_url, false, false);
  connect(m_job, TQ_SIGNAL(data(TDEIO::Job*, const TQByteArray&)),
          TQ_SLOT(slotData(TDEIO::Job*, const TQByteArray&)));
  connect(m_job, TQ_SIGNAL(result(TDEIO::Job*)),
          TQ_SLOT(slotComplete(TDEIO::Job*)));
  connect(m_job, TQ_SIGNAL(redirection(TDEIO::Job *, const KURL&)),
          TQ_SLOT(slotRedirection(TDEIO::Job*, const KURL&)));
}
// Delivers the next batch of results from the pages that were already
// downloaded: the limit is raised by IMDB_MAX_RESULTS and parsing resumes
// in the block where the previous batch stopped.
void IMDBFetcher::continueSearch() {
  m_started = true;
  m_limit += IMDB_MAX_RESULTS;

  // After each block, an offset of 0 means parsing has to move on to the
  // next block. The current block may change at each step, which is why
  // these tests are not chained with else.
  if(m_currentTitleBlock == Popular) {
    parseTitleBlock(m_popularTitles);
    if(m_countOffset == 0) {
      m_currentTitleBlock = Exact;
    } else {
      m_currentTitleBlock = Popular;
    }
  }

  if(m_currentTitleBlock == Exact) {
    parseTitleBlock(m_exactTitles);
    if(m_countOffset == 0) {
      m_currentTitleBlock = Partial;
    } else {
      m_currentTitleBlock = Exact;
    }
  }

  if(m_currentTitleBlock == Partial) {
    parseTitleBlock(m_partialTitles);
    if(m_countOffset == 0) {
      m_currentTitleBlock = Unknown;
    } else {
      m_currentTitleBlock = Partial;
    }
  }

  if(m_currentTitleBlock == SinglePerson) {
    parseSingleNameResult();
  }

  stop();
}
// Ends the running search: kills a pending job, clears the state flags and
// emits signalDone(). Does nothing when no search is in progress.
void IMDBFetcher::stop() {
  if(m_started) {
//    myLog() << "IMDBFetcher::stop()" << endl;
    if(m_job != 0) {
      m_job->kill();
      m_job = 0;
    }

    m_started = false;
    m_redirected = false;

    emit signalDone(this);
  }
}
// Appends a chunk of downloaded bytes to m_data.
void IMDBFetcher::slotData(TDEIO::Job*, const TQByteArray& data_) {
  TQDataStream appender(m_data, IO_WriteOnly | IO_Append);
  appender.writeRawBytes(data_.data(), data_.size());
}
// Records that the job was redirected and remembers the new URL.
void IMDBFetcher::slotRedirection(TDEIO::Job*, const KURL& toURL_) {
  m_redirected = true;
  m_url = toURL_;
}
// Called when the download job has finished; picks the parser that matches
// the search key and whether the request was redirected.
void IMDBFetcher::slotComplete(TDEIO::Job* job_) {
  // since the fetch is done, don't worry about holding the job pointer
  m_job = 0;

  if(job_->error()) {
    job_->showErrorDialog(Kernel::self()->widget());
    stop();
    return;
  }

  if(m_data.isEmpty()) {
    stop();
    return;
  }

  // a single result was found if we got redirected
  const bool titleSearch = (m_key == Title);
  if(titleSearch && m_redirected) {
    parseSingleTitleResult();
  } else if(titleSearch) {
    parseMultipleTitleResults();
  } else if(m_redirected) {
    parseSingleNameResult();
  } else {
    parseMultipleNameResults();
  }
}
void IMDBFetcher::parseSingleTitleResult() {
// myDebug() << "IMDBFetcher::parseSingleTitleResult()" << endl;
s_titleRx->search(Tellico::decodeHTML(TQString(m_data)));
// split title at parenthesis
const TQString cap1 = s_titleRx->cap(1);
int pPos = cap1.find('(');
// FIXME: maybe remove parentheses here?
SearchResult* r = new SearchResult(this,
pPos == -1 ? cap1 : cap1.left(pPos),
pPos == -1 ? TQString() : cap1.mid(pPos),
TQString());
m_matches.insert(r->uid, m_url);
emit signalResultFound(r);
m_hasMoreResults = false;
stop();
}
void IMDBFetcher::parseMultipleTitleResults() {
// myDebug() << "IMDBFetcher::parseMultipleTitleResults()" << endl;
TQString output = Tellico::decodeHTML(TQString(m_data));
// IMDb can return three title lists, popular, exact, and partial
// the popular titles are in the first table, after the "Popular Results" text
int pos_popular = output.find(TQString::fromLatin1("Popular Titles"), 0, false);
int pos_exact = output.find(TQString::fromLatin1("Exact Matches"), TQMAX(pos_popular, 0), false);
int pos_partial = output.find(TQString::fromLatin1("Partial Matches"), TQMAX(pos_exact, 0), false);
int end_popular = pos_exact; // keep track of where to end
if(end_popular == -1) {
end_popular = pos_partial == -1 ? output.length() : pos_partial;
}
int end_exact = pos_partial; // keep track of where to end
if(end_exact == -1) {
end_exact = output.length();
}
// if found popular matches
if(pos_popular > -1) {
m_popularTitles = output.mid(pos_popular, end_popular-pos_popular);
}
// if found exact matches
if(pos_exact > -1) {
m_exactTitles = output.mid(pos_exact, end_exact-pos_exact);
}
if(pos_partial > -1) {
m_partialTitles = output.mid(pos_partial);
}
parseTitleBlock(m_popularTitles);
// if the offset is 0, then we need to be looking at the next block
m_currentTitleBlock = m_countOffset == 0 ? Exact : Popular;
if(m_matches.size() < m_limit) {
parseTitleBlock(m_exactTitles);
m_currentTitleBlock = m_countOffset == 0 ? Partial : Exact;
}
if(m_matches.size() < m_limit) {
parseTitleBlock(m_partialTitles);
m_currentTitleBlock = m_countOffset == 0 ? Unknown : Partial;
}
#ifndef NDEBUG
if(m_matches.size() == 0) {
myDebug() << "IMDBFetcher::parseMultipleTitleResults() - no matches found." << endl;
}
#endif
stop();
}
// Scans one section of the title search page (str_) for anchors that link
// to /title/ pages and emits a SearchResult for each of them.
// The first m_countOffset anchors are skipped, and scanning stops once
// m_matches holds m_limit entries. m_hasMoreResults and m_countOffset are
// updated before returning.
void IMDBFetcher::parseTitleBlock(const TQString& str_) {
if(str_.isEmpty()) {
// nothing to parse in this block, so there is no position to resume from
m_countOffset = 0;
return;
}
// myDebug() << "IMDBFetcher::parseTitleBlock() - " << m_currentTitleBlock << endl;
// text following "aka", up to the end of the list item or line
TQRegExp akaRx(TQString::fromLatin1("aka (.*)(</li>|<br)"), false);
akaRx.setMinimal(true);
m_hasMoreResults = false;
// number of title anchors handled so far in this block
int count = 0;
int start = s_anchorTitleRx->search(str_);
while(m_started && start > -1) {
// split title at parenthesis
// the captures are copied now because the look-ahead searches below overwrite them
const TQString cap1 = s_anchorTitleRx->cap(1); // the anchor url
const TQString cap2 = s_anchorTitleRx->cap(2).stripWhiteSpace(); // the anchor text
// move past the anchor that was just matched
start += s_anchorTitleRx->matchedLength();
int pPos = cap2.find('('); // if it has parentheses, use that for description
TQString desc;
if(pPos > -1) {
int pPos2 = cap2.find(')', pPos+1);
if(pPos2 > -1) {
desc = cap2.mid(pPos+1, pPos2-pPos-1);
}
} else {
// parenthesis might be outside anchor tag
// look at the text between this anchor and the next title anchor
int end = s_anchorTitleRx->search(str_, start);
if(end == -1) {
end = str_.length();
}
TQString text = str_.mid(start, end-start);
pPos = text.find('(');
if(pPos > -1) {
// only accept a parenthesis that comes before the next line break
int pNewLine = text.find(TQString::fromLatin1("<br"));
if(pNewLine == -1 || pPos < pNewLine) {
int pPos2 = text.find(')', pPos);
desc = text.mid(pPos+1, pPos2-pPos-1);
}
// the parenthesis was not inside the anchor text, so the whole anchor text is the title
pPos = -1;
}
}
// multiple matches might have 'aka' info
int end = s_anchorTitleRx->search(str_, start+1);
if(end == -1) {
end = str_.length();
}
// only use an "aka" that appears before the next title anchor
int akaPos = akaRx.search(str_, start+1);
if(akaPos > -1 && akaPos < end) {
// limit to 50 chars
desc += TQChar(' ') + akaRx.cap(1).stripWhiteSpace().remove(*s_tagRx);
if(desc.length() > 50) {
desc = desc.left(50) + TQString::fromLatin1("...");
}
}
// find the next title anchor for the following iteration
start = s_anchorTitleRx->search(str_, start);
// skip the anchors that come before the resume offset
if(count < m_countOffset) {
++count;
continue;
}
// if we got this far, then there is a valid result
if(m_matches.size() >= m_limit) {
m_hasMoreResults = true;
break;
}
SearchResult* r = new SearchResult(this, pPos == -1 ? cap2 : cap2.left(pPos), desc, TQString());
// the anchor url is resolved against the current url, and its query is dropped
KURL u(m_url, cap1);
u.setQuery(TQString());
m_matches.insert(r->uid, u);
emit signalResultFound(r);
++count;
}
// every block other than the partial one reports that more results exist
// NOTE(review): presumably because a later block may still hold results - confirm
if(!m_hasMoreResults && m_currentTitleBlock != Partial) {
m_hasMoreResults = true;
}
// 0 when the limit has not been reached, otherwise the number of anchors already handled
m_countOffset = m_matches.size() < m_limit ? 0 : count;
}
// Parses the page of a single person: every anchor that links to a /title/
// page becomes a search result, except anchors whose trailing text marks
// them as TV episodes. The first m_countOffset anchors are skipped, and
// scanning stops once m_matches holds m_limit entries.
void IMDBFetcher::parseSingleNameResult() {
// myDebug() << "IMDBFetcher::parseSingleNameResult()" << endl;
// remembered so that continueSearch() comes back to this function
m_currentTitleBlock = SinglePerson;
TQString output = Tellico::decodeHTML(TQString(m_data));
int pos = s_anchorTitleRx->search(output);
if(pos == -1) {
// no title anchors at all
stop();
return;
}
TQRegExp tvRegExp(TQString::fromLatin1("TV\\sEpisode"), false);
// length of the anchor matched last, used to advance the search
int len = 0;
// number of title anchors seen so far
int count = 0;
TQString desc;
for( ; m_started && pos > -1; pos = s_anchorTitleRx->search(output, pos+len)) {
desc.truncate(0);
bool isEpisode = false;
len = s_anchorTitleRx->cap(0).length();
// split title at parenthesis
const TQString cap2 = s_anchorTitleRx->cap(2).stripWhiteSpace();
int pPos = cap2.find('(');
if(pPos > -1) {
// the description keeps the parentheses in this case
desc = cap2.mid(pPos);
} else {
// look until the next <a
int aPos = output.find(TQString::fromLatin1("<a"), pos+len, false);
if(aPos == -1) {
aPos = output.length();
}
// text between this anchor and the next one
TQString tmp = output.mid(pos+len, aPos-pos-len);
if(tmp.find(tvRegExp) > -1) {
isEpisode = true;
}
pPos = tmp.find('(');
if(pPos > -1) {
// only accept a parenthesis that comes before the next line break
int pNewLine = tmp.find(TQString::fromLatin1("<br"));
if(pNewLine == -1 || pPos < pNewLine) {
int pEnd = tmp.find(')', pPos+1);
desc = tmp.mid(pPos+1, pEnd-pPos-1).remove(*s_tagRx);
}
// but need to indicate it wasn't found initially
pPos = -1;
}
}
;
// skip the anchors that come before the resume offset
if(count < m_countOffset) {
++count;
continue;
}
++count;
// episodes are counted but never reported
if(isEpisode) {
continue;
}
// if we got this far, then there is a valid result
if(m_matches.size() >= m_limit) {
m_hasMoreResults = true;
break;
}
// FIXME: maybe remove parentheses here?
SearchResult* r = new SearchResult(this, pPos == -1 ? cap2 : cap2.left(pPos), desc, TQString());
KURL u(m_url, s_anchorTitleRx->cap(1)); // relative URL constructor
u.setQuery(TQString());
m_matches.insert(r->uid, u);
// myDebug() << u.prettyURL() << endl;
// myDebug() << cap2 << endl;
emit signalResultFound(r);
}
// the whole page was scanned, so nothing is left to report
if(pos == -1) {
m_hasMoreResults = false;
}
// when the loop stopped at the limit, count already includes the anchor
// that was not reported, so step back by one for the next batch
m_countOffset = count - 1;
stop();
}
// Handles a person search that returned several people: collects the name
// anchors of the page, numbers duplicate names, lets the user pick one in a
// modal dialog and then downloads the chosen person's page.
void IMDBFetcher::parseMultipleNameResults() {
// myDebug() << "IMDBFetcher::parseMultipleNameResults()" << endl;
// the first section starts at the "Popular Results" heading,
// or at "Exact Matches" when that heading is missing
TQString output = Tellico::decodeHTML(TQString(m_data));
int pos = output.find(TQString::fromLatin1("Popular Results"), 0, false);
if(pos == -1) {
pos = output.find(TQString::fromLatin1("Exact Matches"), 0, false);
}
// find beginning of partial matches
// the first of these headings that is found ends the first section;
// without any of them the section runs to the end of the page
int end = output.find(TQString::fromLatin1("Other Results"), TQMAX(pos, 0), false);
if(end == -1) {
end = output.find(TQString::fromLatin1("Partial Matches"), TQMAX(pos, 0), false);
if(end == -1) {
end = output.find(TQString::fromLatin1("Approx Matches"), TQMAX(pos, 0), false);
if(end == -1) {
end = output.length();
}
}
}
// display name -> url of the person's page
TQMap<TQString, KURL> map;
// name -> number of times the name has been seen so far
TQMap<TQString, int> nameMap;
TQString s;
// if found exact matches
if(pos > -1) {
pos = s_anchorNameRx->search(output, pos+13);
while(pos > -1 && pos < end && m_matches.size() < m_limit) {
KURL u(m_url, s_anchorNameRx->cap(1));
// names from the first section get a trailing space, which the dialog
// code below uses to tell them apart
s = s_anchorNameRx->cap(2).stripWhiteSpace() + ' ';
// if more than one exact, add parentheses
if(nameMap.contains(s) && nameMap[s] > 0) {
// fix the first one that didn't have a number
if(nameMap[s] == 1) {
KURL u2 = map[s];
map.remove(s);
map.insert(s + "(1) ", u2);
}
nameMap.insert(s, nameMap[s] + 1);
// check for duplicate names
s += TQString::fromLatin1("(%1) ").arg(nameMap[s]);
} else {
nameMap.insert(s, 1);
}
map.insert(s, u);
pos = s_anchorNameRx->search(output, pos+s_anchorNameRx->cap(0).length());
}
}
// go ahead and search for partial matches
// same bookkeeping as above, but these names carry no trailing space
pos = s_anchorNameRx->search(output, end);
while(pos > -1 && m_matches.size() < m_limit) {
KURL u(m_url, s_anchorNameRx->cap(1)); // relative URL
s = s_anchorNameRx->cap(2).stripWhiteSpace();
if(nameMap.contains(s) && nameMap[s] > 0) {
// fix the first one that didn't have a number
if(nameMap[s] == 1) {
KURL u2 = map[s];
map.remove(s);
map.insert(s + " (1)", u2);
}
nameMap.insert(s, nameMap[s] + 1);
// check for duplicate names
s += TQString::fromLatin1(" (%1)").arg(nameMap[s]);
} else {
nameMap.insert(s, 1);
}
map.insert(s, u);
pos = s_anchorNameRx->search(output, pos+s_anchorNameRx->cap(0).length());
}
// no names found, nothing to choose from
if(map.count() == 0) {
stop();
return;
}
// modal dialog listing all the names that were found
KDialogBase* dlg = new KDialogBase(Kernel::self()->widget(), "imdb dialog",
true, i18n("Select IMDB Result"), KDialogBase::Ok|KDialogBase::Cancel);
TQVBox* box = new TQVBox(dlg);
box->setSpacing(10);
(void) new TQLabel(i18n("<qt>Your search returned multiple matches. Please select one below.</qt>"), box);
TQListBox* listBox = new TQListBox(box);
listBox->setMinimumWidth(400);
listBox->setColumnMode(TQListBox::FitToWidth);
const TQStringList values = map.keys();
for(TQStringList::ConstIterator it = values.begin(); it != values.end(); ++it) {
// names ending with a space came from the first section and are shown colored
if((*it).endsWith(TQChar(' '))) {
GUI::ListBoxText* box = new GUI::ListBoxText(listBox, *it, 0);
box->setColored(true);
} else {
(void) new GUI::ListBoxText(listBox, *it);
}
}
listBox->setSelected(0, true);
TQWhatsThis::add(listBox, i18n("<qt>Select a search result.</qt>"));
dlg->setMainWidget(box);
// the search ends here if the dialog is cancelled or nothing is selected
if(dlg->exec() != TQDialog::Accepted || listBox->currentText().isEmpty()) {
dlg->delayedDestruct();
stop();
return;
}
m_url = map[listBox->currentText()];
dlg->delayedDestruct();
// redirected is true since that's how I tell if an exact match has been found
m_redirected = true;
// discard the search page and download the selected person's page instead
m_data.truncate(0);
m_job = TDEIO::get(m_url, false, false);
connect(m_job, TQ_SIGNAL(data(TDEIO::Job*, const TQByteArray&)),
TQ_SLOT(slotData(TDEIO::Job*, const TQByteArray&)));
connect(m_job, TQ_SIGNAL(result(TDEIO::Job*)),
TQ_SLOT(slotComplete(TDEIO::Job*)));
connect(m_job, TQ_SIGNAL(redirection(TDEIO::Job *, const KURL&)),
TQ_SLOT(slotRedirection(TDEIO::Job*, const KURL&)));
// do not stop() here
}
// Returns the entry for search result uid_. Entries are built once and then
// kept in m_entries; a null pointer is returned when the result has no URL,
// its page is empty, or parsing fails.
Tellico::Data::EntryPtr IMDBFetcher::fetchEntry(uint uid_) {
  // entries that were already built are served from the cache
  Data::EntryPtr entry = m_entries[uid_];
  if(entry) {
    return entry;
  }

  KURL entryURL = m_matches[uid_];
  if(entryURL.isEmpty()) {
    myDebug() << "IMDBFetcher::fetchEntry() - no url found" << endl;
    return 0;
  }

  // parsing works off m_url, so keep the current value to switch back afterwards
  const KURL savedURL = m_url;

  TQString pageText;
  if(entryURL == m_url) {
    // the page of this result is the one already downloaded, reuse it
//    myDebug() << "IMDBFetcher::fetchEntry() - matches previous URL, no downloading needed." << endl;
    pageText = Tellico::decodeHTML(TQString(m_data));
  } else {
    // this download is synchronous
#ifdef IMDB_TEST
    KURL u = KURL::fromPathOrURL(TQString::fromLatin1("/home/robby/imdb-title-result.html"));
    pageText = Tellico::decodeHTML(FileHandler::readTextFile(u));
#else
    // be quiet about failure
    pageText = Tellico::decodeHTML(FileHandler::readTextFile(entryURL, true));
    m_url = entryURL; // needed for processing
#endif
  }

  if(pageText.isEmpty()) {
    myDebug() << "IMDBFetcher::fetchEntry() - no text results" << endl;
    m_url = savedURL;
    return 0;
  }

  entry = parseEntry(pageText);
  m_url = savedURL;
  if(!entry) {
    myDebug() << "IMDBFetcher::fetchEntry() - error in processing entry" << endl;
    return 0;
  }

  m_entries.insert(uid_, entry); // keep for later
  return entry;
}
// Builds a new entry in a fresh video collection from the text of a title
// page, and stores the page URL in the "imdb" field when that field is used.
Tellico::Data::EntryPtr IMDBFetcher::parseEntry(const TQString& str_) {
  Data::CollPtr coll = new Data::VideoCollection(true);
  Data::EntryPtr entry = new Data::Entry(coll);

  // each helper extracts one part of the page
  doTitle(str_, entry);
  doRunningTime(str_, entry);
  doAspectRatio(str_, entry);
  doAlsoKnownAs(str_, entry);
  doPlot(str_, entry, m_url);
  doLists(str_, entry);
  doPerson(str_, entry, TQString::fromLatin1("Director"), TQString::fromLatin1("director"));
  doPerson(str_, entry, TQString::fromLatin1("Writer"), TQString::fromLatin1("writer"));
  doRating(str_, entry);
  doCast(str_, entry, m_url);
  if(m_fetchImages) {
    // needs base URL
    doCover(str_, entry, m_url);
  }

  // add the link field if it was asked for and the collection lacks it
  const TQString imdb = TQString::fromLatin1("imdb");
  if(!coll->hasField(imdb) && m_fields.findIndex(imdb) > -1) {
    Data::FieldPtr linkField = new Data::Field(imdb, i18n("IMDB Link"), Data::Field::URL);
    linkField->setCategory(i18n("General"));
    coll->addField(linkField);
  }

  // the link is the page URL without its query
  if(coll->hasField(imdb)) {
    if(coll->fieldByName(imdb)->type() == Data::Field::URL) {
      m_url.setQuery(TQString());
      entry->setField(imdb, m_url.url());
    }
  }

  return entry;
}
// Extracts the title and the year from the page's <title> element, which is
// expected to look like: Title (1999).
// The title is the text before the first parenthesis, with one pair of
// surrounding double quotes removed; the year is the run of digits that
// directly follows the opening parenthesis.
void IMDBFetcher::doTitle(const TQString& str_, Data::EntryPtr entry_) {
  if(s_titleRx->search(str_) == -1) {
    return;
  }

  const TQString cap1 = s_titleRx->cap(1);
  // titles normally have parentheses, but do not rely on it
  const int pPos = cap1.find('(');

  // without a parenthesis, the whole text is the title
  TQString title = (pPos == -1 ? cap1 : cap1.left(pPos)).stripWhiteSpace();
  // remove first and last quotes if there
  if(title.startsWith(TQChar('"')) && title.endsWith(TQChar('"'))) {
    title = title.mid(1, title.length()-2);
  }
  entry_->setField(TQString::fromLatin1("title"), title);

  // Only look for a year when a parenthesis was found. Otherwise the scan
  // would start at the beginning of the text and store the leading digits
  // of the title itself as the year.
  if(pPos > -1) {
    uint pPos2 = pPos+1;
    while(pPos2 < cap1.length() && cap1[pPos2].isDigit()) {
      ++pPos2;
    }
    const TQString year = cap1.mid(pPos+1, pPos2-pPos-1);
    if(!year.isEmpty()) {
      entry_->setField(TQString::fromLatin1("year"), year);
    }
  }
}
void IMDBFetcher::doRunningTime(const TQString& str_, Data::EntryPtr entry_) {
// running time
TQRegExp runtimeRx(TQString::fromLatin1("runtime:.*(\\d+)\\s+min"), false);
runtimeRx.setMinimal(true);
if(runtimeRx.search(str_) > -1) {
// myDebug() << "running-time = " << runtimeRx.cap(1) << endl;
entry_->setField(TQString::fromLatin1("running-time"), runtimeRx.cap(1));
}
}
void IMDBFetcher::doAspectRatio(const TQString& str_, Data::EntryPtr entry_) {
TQRegExp rx(TQString::fromLatin1("aspect ratio:.*([\\d\\.]+\\s*:\\s*[\\d\\.]+)"), false);
rx.setMinimal(true);
if(rx.search(str_) > -1) {
// myDebug() << "aspect ratio = " << rx.cap(1) << endl;
entry_->setField(TQString::fromLatin1("aspect-ratio"), rx.cap(1).stripWhiteSpace());
}
}
void IMDBFetcher::doAlsoKnownAs(const TQString& str_, Data::EntryPtr entry_) {
if(m_fields.findIndex(TQString::fromLatin1("alttitle")) == -1) {
return;
}
// match until next b tag
// TQRegExp akaRx(TQString::fromLatin1("also known as(.*)<b(?:\\s.*)?>"));
TQRegExp akaRx(TQString::fromLatin1("also known as(.*)<(b[>\\s/]|div)"), false);
akaRx.setMinimal(true);
if(akaRx.search(str_) > -1 && !akaRx.cap(1).isEmpty()) {
Data::FieldPtr f = entry_->collection()->fieldByName(TQString::fromLatin1("alttitle"));
if(!f) {
f = new Data::Field(TQString::fromLatin1("alttitle"), i18n("Alternative Titles"), Data::Field::Table);
f->setFormatFlag(Data::Field::FormatTitle);
entry_->collection()->addField(f);
}
// split by <br>, remembering it could become valid xhtml!
TQRegExp brRx(TQString::fromLatin1("<br[\\s/]*>"), false);
brRx.setMinimal(true);
TQStringList list = TQStringList::split(brRx, akaRx.cap(1));
// lang could be included with [fr]
// const TQRegExp parRx(TQString::fromLatin1("\\(.+\\)"));
const TQRegExp brackRx(TQString::fromLatin1("\\[\\w+\\]"));
TQStringList values;
for(TQStringList::Iterator it = list.begin(); it != list.end(); ++it) {
TQString s = *it;
// sometimes, the word "more" gets linked to the releaseinfo page, check that
if(s.find(TQString::fromLatin1("releaseinfo")) > -1) {
continue;
}
s.remove(*s_tagRx);
s.remove(brackRx);
s = s.stripWhiteSpace();
// the first value ends up being or starting with the colon after "Also know as"
// I'm too lazy to figure out a better regexp
if(s.startsWith(TQChar(':'))) {
s = s.mid(1);
}
if(!s.isEmpty()) {
values += s;
}
}
if(!values.isEmpty()) {
entry_->setField(TQString::fromLatin1("alttitle"), values.join(sep));
}
}
}
void IMDBFetcher::doPlot(const TQString& str_, Data::EntryPtr entry_, const KURL& baseURL_) {
// plot summaries provided by users are on a separate page
// should those be preferred?
bool useUserSummary = false;
TQString thisPlot;
// match until next opening tag
TQRegExp plotRx(TQString::fromLatin1("plot\\s*(?:outline|summary)?:(.*)<[^/].*</"), false);
plotRx.setMinimal(true);
TQRegExp plotURLRx(TQString::fromLatin1("<a\\s+.*href\\s*=\\s*\".*/title/.*/plotsummary\""), false);
plotURLRx.setMinimal(true);
if(plotRx.search(str_) > -1) {
thisPlot = plotRx.cap(1);
thisPlot.remove(*s_tagRx); // remove HTML tags
entry_->setField(TQString::fromLatin1("plot"), thisPlot);
// if thisPlot ends with (more) or contains
// a url that ends with plotsummary, then we'll grab it, otherwise not
if(plotRx.cap(0).endsWith(TQString::fromLatin1("(more)</")) || plotURLRx.search(plotRx.cap(0)) > -1) {
useUserSummary = true;
}
}
if(useUserSummary) {
TQRegExp idRx(TQString::fromLatin1("title/(tt\\d+)"));
idRx.search(baseURL_.path());
KURL plotURL = baseURL_;
plotURL.setPath(TQString::fromLatin1("/title/") + idRx.cap(1) + TQString::fromLatin1("/plotsummary"));
// be quiet about failure
TQString plotPage = FileHandler::readTextFile(plotURL, true);
if(!plotPage.isEmpty()) {
TQRegExp plotRx(TQString::fromLatin1("<p\\s+class\\s*=\\s*\"plotpar\">(.*)</p"));
plotRx.setMinimal(true);
if(plotRx.search(plotPage) > -1) {
TQString userPlot = plotRx.cap(1);
userPlot.remove(*s_tagRx); // remove HTML tags
// remove last little "written by", if there
userPlot.remove(TQRegExp(TQString::fromLatin1("\\s*written by.*$"), false));
entry_->setField(TQString::fromLatin1("plot"), Tellico::decodeHTML(userPlot));
}
}
}
}
// Collects the people listed after each occurrence of imdbHeader_ (for
// example "Director") and stores them, joined, in the field fieldName_.
// A list runs from the header to the next pair of <br> tags or the next
// <div>/</div> tag, whichever comes first; only anchors that link to a
// /name/ page count as people.
void IMDBFetcher::doPerson(const TQString& str_, Data::EntryPtr entry_,
                           const TQString& imdbHeader_, const TQString& fieldName_) {
  TQRegExp br2Rx(TQString::fromLatin1("<br[\\s/]*>\\s*<br[\\s/]*>"), false);
  br2Rx.setMinimal(true);
  TQRegExp divRx(TQString::fromLatin1("<[/]*div"), false);
  divRx.setMinimal(true);
  const TQString name = TQString::fromLatin1("/name/");

  StringSet people;
  for(int pos = str_.find(imdbHeader_); pos > 0; pos = str_.find(imdbHeader_, pos)) {
    // loop until repeated <br> tags or </div> tag
    const int endPos1 = str_.find(br2Rx, pos);
    const int endPos2 = str_.find(divRx, pos);
    // Use the nearest terminator that exists. A plain minimum would be -1
    // as soon as one of the two is missing, which would discard the other
    // one and collect nobody. If neither exists, endPos stays -1 and the
    // list is skipped.
    int endPos;
    if(endPos1 == -1) {
      endPos = endPos2;
    } else if(endPos2 == -1) {
      endPos = endPos1;
    } else {
      endPos = TQMIN(endPos1, endPos2);
    }

    pos = s_anchorRx->search(str_, pos+1);
    while(pos > -1 && pos < endPos) {
      if(s_anchorRx->cap(1).find(name) > -1) {
        people.add(s_anchorRx->cap(2).stripWhiteSpace());
      }
      pos = s_anchorRx->search(str_, pos+1);
    }

    // no anchors are left, so there is nothing more to collect; a negative
    // start index would also make find() count from the end of the string
    if(pos == -1) {
      break;
    }
  }

  if(!people.isEmpty()) {
    entry_->setField(fieldName_, people.toList().join(sep));
  }
}
// Sets the "cast" field with up to m_numCast entries. The full credits page
// of the title is read first; when it yields no text, the cast list of the
// title page itself (str_) is used instead. Each entry is the actor name,
// followed by "::" and the text of a table cell after the actor's anchor
// when such a cell is found.
// baseURL_ is the URL of the title page, used to build the fullcredits URL.
void IMDBFetcher::doCast(const TQString& str_, Data::EntryPtr entry_, const KURL& baseURL_) {
// the extended cast list is on a separate page
// that's usually a lot of people
// but since it can be in billing order, the main actors might not
// be in the short list
// the title id from the base url is needed to build the fullcredits url
TQRegExp idRx(TQString::fromLatin1("title/(tt\\d+)"));
idRx.search(baseURL_.path());
#ifdef IMDB_TEST
KURL castURL = KURL::fromPathOrURL(TQString::fromLatin1("/home/robby/imdb-title-fullcredits.html"));
#else
KURL castURL = baseURL_;
castURL.setPath(TQString::fromLatin1("/title/") + idRx.cap(1) + TQString::fromLatin1("/fullcredits"));
#endif
// be quiet about failure and be sure to translate entities
TQString castPage = Tellico::decodeHTML(FileHandler::readTextFile(castURL, true));
// position where the cast list starts, -1 if it was not found
int pos = -1;
// the text to search, depends on which page is being read
TQString castText = castPage;
if(castText.isEmpty()) {
// fall back to short list
castText = str_;
pos = castText.find(TQString::fromLatin1("cast overview"), 0, false);
if(pos == -1) {
pos = castText.find(TQString::fromLatin1("credited cast"), 0, false);
}
} else {
// first look for anchor
TQRegExp castAnchorRx(TQString::fromLatin1("<a\\s+name\\s*=\\s*\"cast\""), false);
pos = castText.find(castAnchorRx);
if(pos < 0) {
// then look for a table with the "cast" class
TQRegExp tableClassRx(TQString::fromLatin1("<table\\s+class\\s*=\\s*\"cast\""), false);
pos = castText.find(tableClassRx);
if(pos < 0) {
// fragile, the word "cast" appears in the title, but need to find
// the one right above the actual cast table
// for TV shows, there's a link on the sidebar for "episodes cast"
// so need to not match that one
pos = castText.find(TQString::fromLatin1("cast</"), 0, false);
if(pos > 9) {
// back up 9 places
if(castText.mid(pos-9, 9).startsWith(TQString::fromLatin1("episodes"))) {
// find next cast list
pos = castText.find(TQString::fromLatin1("cast</"), pos+6, false);
}
}
}
}
}
if(pos == -1) { // no cast list found
myDebug() << "IMDBFetcher::doCast() - no cast list found" << endl;
return;
}
const TQString name = TQString::fromLatin1("/name/");
// contents of one table cell
TQRegExp tdRx(TQString::fromLatin1("<td[^>]*>(.*)</td>"), false);
tdRx.setMinimal(true);
TQStringList cast;
// loop until closing table tag
const int endPos = castText.find(TQString::fromLatin1("</table"), pos, false);
pos = s_anchorRx->search(castText, pos+1);
while(pos > -1 && pos < endPos && static_cast<int>(cast.count()) < m_numCast) {
// only anchors that link to a /name/ page are actors
if(s_anchorRx->cap(1).find(name) > -1) {
// now search for <td> item with character name
// there's a column with ellipses then the character
// the first cell found from here on is passed over, the one after it is used
const int pos2 = tdRx.search(castText, pos);
if(pos2 > -1 && tdRx.search(castText, pos2+1) > -1) {
cast += s_anchorRx->cap(2).stripWhiteSpace()
+ TQString::fromLatin1("::") + tdRx.cap(1).simplifyWhiteSpace().remove(*s_tagRx);
} else {
// no second cell, store the actor name alone
cast += s_anchorRx->cap(2).stripWhiteSpace();
}
}
pos = s_anchorRx->search(castText, pos+1);
}
if(!cast.isEmpty()) {
entry_->setField(TQString::fromLatin1("cast"), cast.join(sep));
}
}
void IMDBFetcher::doRating(const TQString& str_, Data::EntryPtr entry_) {
if(m_fields.findIndex(TQString::fromLatin1("imdb-rating")) == -1) {
return;
}
// don't add a colon, since there's a <br> at the end
// some of the imdb images use /10.gif in their path, so check for space or bracket
TQRegExp rx(TQString::fromLatin1("[>\\s](\\d+.?\\d*)/10[<//s]"), false);
rx.setMinimal(true);
if(rx.search(str_) > -1 && !rx.cap(1).isEmpty()) {
Data::FieldPtr f = entry_->collection()->fieldByName(TQString::fromLatin1("imdb-rating"));
if(!f) {
f = new Data::Field(TQString::fromLatin1("imdb-rating"), i18n("IMDB Rating"), Data::Field::Rating);
f->setCategory(i18n("General"));
f->setProperty(TQString::fromLatin1("maximum"), TQString::fromLatin1("10"));
entry_->collection()->addField(f);
}
bool ok;
float value = rx.cap(1).toFloat(&ok);
if(ok) {
entry_->setField(TQString::fromLatin1("imdb-rating"), TQString::number(value));
}
}
}
// Sets the "cover" field from the poster image of the title page.
// The image inside the anchor named "poster" is tried first, then any image
// whose tag mentions "cover". baseURL_ is used to resolve the image URL.
// Only the first candidate image is tried, whether or not it can be added.
void IMDBFetcher::doCover(const TQString& str_, Data::EntryPtr entry_, const KURL& baseURL_) {
  TQRegExp imgRx(TQString::fromLatin1("<img\\s+[^>]*src\\s*=\\s*\"([^\"]*)\"[^>]*>"), false);
  imgRx.setMinimal(true);

  TQRegExp posterRx(TQString::fromLatin1("<a\\s+[^>]*name\\s*=\\s*\"poster\"[^>]*>(.*)</a>"), false);
  posterRx.setMinimal(true);

  const TQString cover = TQString::fromLatin1("cover");

  // first choice: an image inside the "poster" anchor
  for(int pos = posterRx.search(str_); pos > -1; pos = posterRx.search(str_, pos+1)) {
    if(imgRx.search(posterRx.cap(1)) < 0) {
      continue;
    }
    KURL u(baseURL_, imgRx.cap(1));
    const TQString id = ImageFactory::addImage(u, true);
    if(!id.isEmpty()) {
      entry_->setField(cover, id);
    }
    return;
  }

  // didn't find the cover, IMDb also used to put "cover" inside the url
  for(int pos = imgRx.search(str_); pos > -1; pos = imgRx.search(str_, pos+1)) {
    if(imgRx.cap(0).find(cover, 0, false) < 0) {
      continue;
    }
    KURL u(baseURL_, imgRx.cap(1));
    const TQString id = ImageFactory::addImage(u, true);
    if(!id.isEmpty()) {
      entry_->setField(cover, id);
    }
    return;
  }
}
// Fills the list-like fields (genre, nationality, language, color,
// audio-track, studio, certification and, as a fallback, year) by looking
// at every anchor tag in the page and classifying it by its url.
// This ends up reparsing the whole string, but it's not really that slow.
void IMDBFetcher::doLists(const TQString& str_, Data::EntryPtr entry_) {
  // url fragments that identify what an anchor is about
  const TQString genre = TQString::fromLatin1("/Genres/");
  const TQString country = TQString::fromLatin1("/Countries/");
  const TQString lang = TQString::fromLatin1("/Languages/");
  const TQString colorInfo = TQString::fromLatin1("color-info");
  const TQString cert = TQString::fromLatin1("certificates=");
  const TQString soundMix = TQString::fromLatin1("sound-mix=");
  const TQString year = TQString::fromLatin1("/Years/");
  const TQString company = TQString::fromLatin1("/company/");

  // IMDb also has links with the word "section" in them, skip those
  // for genres and nationalities
  TQStringList genres, countries, langs, certs, tracks, studios;
  for(int pos = s_anchorRx->search(str_); pos > -1; pos = s_anchorRx->search(str_, pos+1)) {
    const TQString cap1 = s_anchorRx->cap(1);
    if(cap1.find(genre) > -1) {
      if(s_anchorRx->cap(2).find(TQString::fromLatin1(" section"), 0, false) == -1) {
        genres += s_anchorRx->cap(2).stripWhiteSpace();
      }
    } else if(cap1.find(country) > -1) {
      if(s_anchorRx->cap(2).find(TQString::fromLatin1(" section"), 0, false) == -1) {
        countries += s_anchorRx->cap(2).stripWhiteSpace();
      }
    } else if(cap1.find(lang) > -1) {
      langs += s_anchorRx->cap(2).stripWhiteSpace();
    } else if(cap1.find(colorInfo) > -1) {
      // change "black and white" to "black & white"
      entry_->setField(TQString::fromLatin1("color"),
                       s_anchorRx->cap(2).replace(TQString::fromLatin1("and"), TQChar('&')).stripWhiteSpace());
    } else if(cap1.find(cert) > -1) {
      certs += s_anchorRx->cap(2).stripWhiteSpace();
    } else if(cap1.find(soundMix) > -1) {
      tracks += s_anchorRx->cap(2).stripWhiteSpace();
    } else if(cap1.find(company) > -1) {
      studios += s_anchorRx->cap(2).stripWhiteSpace();
    // if year field wasn't set before, do it now
    } else if(entry_->field(TQString::fromLatin1("year")).isEmpty() && cap1.find(year) > -1) {
      entry_->setField(TQString::fromLatin1("year"), s_anchorRx->cap(2).stripWhiteSpace());
    }
  }

  entry_->setField(TQString::fromLatin1("genre"), genres.join(sep));
  entry_->setField(TQString::fromLatin1("nationality"), countries.join(sep));
  entry_->setField(TQString::fromLatin1("language"), langs.join(sep));
  entry_->setField(TQString::fromLatin1("audio-track"), tracks.join(sep));
  entry_->setField(TQString::fromLatin1("studio"), studios.join(sep));

  if(!certs.isEmpty()) {
    // first try to set default certification
    // the field is looked up once and checked, since a collection without a
    // "certification" field would otherwise lead to a null dereference
    const TQString certName = TQString::fromLatin1("certification");
    Data::FieldPtr certField = entry_->collection()->fieldByName(certName);
    if(certField) {
      const TQStringList& certsAllowed = certField->allowed();
      for(TQStringList::ConstIterator it = certs.begin(); it != certs.end(); ++it) {
        // each value is split at ':' into a country and a certification
        TQString country = (*it).section(':', 0, 0);
        TQString cert = (*it).section(':', 1, 1);
        if(cert == Latin1Literal("Unrated")) {
          cert = TQChar('U');
        }
        // the allowed values have the form "certification (country)"
        cert += TQString::fromLatin1(" (") + country + ')';
        if(certsAllowed.findIndex(cert) > -1) {
          entry_->setField(certName, cert);
          break;
        }
      }
    }

    // now add new field for all certifications
    const TQString allc = TQString::fromLatin1("allcertification");
    if(m_fields.findIndex(allc) > -1) {
      Data::FieldPtr f = entry_->collection()->fieldByName(allc);
      if(!f) {
        f = new Data::Field(allc, i18n("Certifications"), Data::Field::Table);
        f->setFlags(Data::Field::AllowGrouped);
        entry_->collection()->addField(f);
      }
      entry_->setField(allc, certs.join(sep));
    }
  }
}
// Starts an update for entry_. If the entry carries a usable "imdb" link, that
// page is fetched directly (with the host replaced by m_host when it differs);
// otherwise a title search is started. When neither a link nor a title is
// available, signalDone() is emitted right away.
void IMDBFetcher::updateEntry(Data::EntryPtr entry_) {
//  myLog() << "IMDBFetcher::updateEntry() - " << entry_->title() << endl;
  // only take first 5
  m_limit = 5;

  const TQString title = entry_->field(TQString::fromLatin1("title"));
  KURL imdbLink = entry_->field(TQString::fromLatin1("imdb"));

  const bool linkUsable = !imdbLink.isEmpty() && imdbLink.isValid();
  if(!linkUsable) {
    if(title.isEmpty()) {
      // always need to emit this if not continuing with the search
      emit signalDone(this);
    } else {
      // optimistically try searching for title and rely on Collection::sameEntry() to figure things out
      search(Fetch::Title, title);
    }
    return;
  }

  // check if we want a different host
  if(imdbLink.host() != m_host) {
//    myLog() << "IMDBFetcher::updateEntry() - switching hosts to " << m_host << endl;
    imdbLink.setHost(m_host);
  }

  m_key = Fetch::Title;
  m_value = title;
  m_started = true;
  m_data.truncate(0);
  m_matches.clear();
  m_url = imdbLink;
  // m_redirected is used as a flag later to tell if we get a single result
  m_redirected = true;

  m_job = TDEIO::get(m_url, false, false);
  connect(m_job, TQ_SIGNAL(data(TDEIO::Job*, const TQByteArray&)),
          TQ_SLOT(slotData(TDEIO::Job*, const TQByteArray&)));
  connect(m_job, TQ_SIGNAL(result(TDEIO::Job*)),
          TQ_SLOT(slotComplete(TDEIO::Job*)));
  connect(m_job, TQ_SIGNAL(redirection(TDEIO::Job *, const KURL&)),
          TQ_SLOT(slotRedirection(TDEIO::Job*, const KURL&)));
}
// Creates a new IMDB configuration widget as a child of parent_, passing this
// fetcher to its constructor.
Tellico::Fetch::ConfigWidget* IMDBFetcher::configWidget(TQWidget* parent_) const {
  IMDBFetcher::ConfigWidget* widget = new IMDBFetcher::ConfigWidget(parent_, this);
  return widget;
}
// Builds the IMDB options page: a host line edit, a spin box for the maximum
// number of cast members, a check box for downloading the cover image, and
// the widget for the additional custom fields. The controls are initialized
// from fetcher_ when one is given, otherwise from the built-in defaults.
IMDBFetcher::ConfigWidget::ConfigWidget(TQWidget* parent_, const IMDBFetcher* fetcher_/*=0*/)
    : Fetch::ConfigWidget(parent_) {
  // grid rows, from top to bottom
  const int hostRow    = 0;
  const int castRow    = 1;
  const int imageRow   = 2;
  const int stretchRow = 3;

  TQGridLayout* grid = new TQGridLayout(optionsWidget(), 4, 2);
  grid->setSpacing(4);
  grid->setColStretch(1, 10);

  // host name
  TQLabel* hostLabel = new TQLabel(i18n("Hos&t: "), optionsWidget());
  grid->addWidget(hostLabel, hostRow, 0);
  m_hostEdit = new KLineEdit(optionsWidget());
  connect(m_hostEdit, TQ_SIGNAL(textChanged(const TQString&)), TQ_SLOT(slotSetModified()));
  grid->addWidget(m_hostEdit, hostRow, 1);
  const TQString hostHelp = i18n("The Internet Movie Database uses several different servers. Choose the one "
                                 "you wish to use.");
  TQWhatsThis::add(hostLabel, hostHelp);
  TQWhatsThis::add(m_hostEdit, hostHelp);
  hostLabel->setBuddy(m_hostEdit);

  // maximum number of cast members
  TQLabel* castLabel = new TQLabel(i18n("&Maximum cast: "), optionsWidget());
  grid->addWidget(castLabel, castRow, 0);
  m_numCast = new KIntSpinBox(0, 99, 1, 10, 10, optionsWidget());
  connect(m_numCast, TQ_SIGNAL(valueChanged(const TQString&)), TQ_SLOT(slotSetModified()));
  grid->addWidget(m_numCast, castRow, 1);
  const TQString castHelp = i18n("The list of cast members may include many people. Set the maximum number returned from the search.");
  TQWhatsThis::add(castLabel, castHelp);
  TQWhatsThis::add(m_numCast, castHelp);
  castLabel->setBuddy(m_numCast);

  // cover image download, spanning both columns
  m_fetchImageCheck = new TQCheckBox(i18n("Download cover &image"), optionsWidget());
  connect(m_fetchImageCheck, TQ_SIGNAL(clicked()), TQ_SLOT(slotSetModified()));
  grid->addMultiCellWidget(m_fetchImageCheck, imageRow, imageRow, 0, 1);
  const TQString imageHelp = i18n("The cover image may be downloaded as well. However, too many large images in the "
                                  "collection may degrade performance.");
  TQWhatsThis::add(m_fetchImageCheck, imageHelp);

  // the row below the last control takes up the remaining vertical space
  grid->setRowStretch(stretchRow, 10);

  // now add additional fields widget
  addFieldsWidget(IMDBFetcher::customFields(), fetcher_ ? fetcher_->m_fields : TQStringList());

  if(!fetcher_) {
    // defaults
    m_hostEdit->setText(TQString::fromLatin1(IMDB_SERVER));
    m_numCast->setValue(10);
    m_fetchImageCheck->setChecked(true);
  } else {
    m_hostEdit->setText(fetcher_->m_host);
    m_numCast->setValue(fetcher_->m_numCast);
    m_fetchImageCheck->setChecked(fetcher_->m_fetchImages);
  }
}
// Writes the widget's current settings into config_: the host (only when the
// trimmed text is not empty), the maximum cast count, the image download flag
// and the custom fields selection. Afterwards the modified flag is cleared.
void IMDBFetcher::ConfigWidget::saveConfig(TDEConfigGroup& config_) {
  const TQString hostName = m_hostEdit->text().stripWhiteSpace();
  const bool hostGiven = !hostName.isEmpty();
  // an empty host is not written to the config
  if(hostGiven) {
    config_.writeEntry("Host", hostName);
  }

  config_.writeEntry("Max Cast", m_numCast->value());
  config_.writeEntry("Fetch Images", m_fetchImageCheck->isChecked());

  saveFieldsConfig(config_);
  slotSetModified(false);
}
// Returns IMDBFetcher::defaultName() as the preferred name.
TQString IMDBFetcher::ConfigWidget::preferredName() const {
  const TQString name = IMDBFetcher::defaultName();
  return name;
}
//static
// Returns the optional fields this fetcher offers, mapping each field name
// to its translated title.
Tellico::StringMap IMDBFetcher::customFields() {
  StringMap fields;
  // field name                                      translated title
  fields[TQString::fromLatin1("allcertification")] = i18n("Certifications");
  fields[TQString::fromLatin1("alttitle")]         = i18n("Alternative Titles");
  fields[TQString::fromLatin1("imdb")]             = i18n("IMDB Link");
  fields[TQString::fromLatin1("imdb-rating")]      = i18n("IMDB Rating");
  return fields;
}
#include "imdbfetcher.moc"