#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

# ***************************************************************************
#    copyright            : (C) 2006 by Mathias Monnerville
#    email                : tellico@monnerville.com
# ***************************************************************************
#
# ***************************************************************************
# *                                                                         *
# *   This program is free software; you can redistribute it and/or modify  *
# *   it under the terms of version 2 of the GNU General Public License as  *
# *   published by the Free Software Foundation;                            *
# *                                                                         *
# ***************************************************************************

# Version 0.4: 2007-08-27
# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres
#   could not be retrieved. Fixed bad http request error due to some changes in HTML code.
#
# Version 0.3:
# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
#
# Version 0.2:
# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
#
# Version 0.1:
# * Initial release.
import sys, os, re, md5, random import urllib, urllib2, time, base64 import xml.dom.minidom XML_HEADER = """""" DOCTYPE = """""" VERSION = "0.4" def genMD5(): obj = md5.new() float = random.random() obj.update(str(float)) return obj.hexdigest() class BasicTellicoDOM: def __init__(self): self.__doc = xml.dom.minidom.Document() self.__root = self.__doc.createElement('tellico') self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') self.__root.setAttribute('syntaxVersion', '9') self.__collection = self.__doc.createElement('collection') self.__collection.setAttribute('title', 'My Movies') self.__collection.setAttribute('type', '3') self.__fields = self.__doc.createElement('fields') # Add all default (standard) fields self.__dfltField = self.__doc.createElement('field') self.__dfltField.setAttribute('name', '_default') # Add a custom 'Collection' field self.__customField = self.__doc.createElement('field') self.__customField.setAttribute('name', 'titre-original') self.__customField.setAttribute('title', 'Original Title') self.__customField.setAttribute('flags', '8') self.__customField.setAttribute('category', 'General') self.__customField.setAttribute('format', '1') self.__customField.setAttribute('type', '1') self.__customField.setAttribute('i18n', 'yes') self.__fields.appendChild(self.__dfltField) self.__fields.appendChild(self.__customField) self.__collection.appendChild(self.__fields) self.__images = self.__doc.createElement('images') self.__root.appendChild(self.__collection) self.__doc.appendChild(self.__root) # Current movie id self.__currentId = 0 def addEntry(self, movieData): """ Add a movie entry """ d = movieData entryNode = self.__doc.createElement('entry') entryNode.setAttribute('id', str(self.__currentId)) titleNode = self.__doc.createElement('title') titleNode.appendChild(self.__doc.createTextNode(unicode(d['title'], 'latin-1').encode('utf-8'))) otitleNode = self.__doc.createElement('titre-original') 
otitleNode.appendChild(self.__doc.createTextNode(unicode(d['otitle'], 'latin-1').encode('utf-8'))) yearNode = self.__doc.createElement('year') yearNode.appendChild(self.__doc.createTextNode(unicode(d['year'], 'latin-1').encode('utf-8'))) genresNode = self.__doc.createElement('genres') for g in d['genres']: genreNode = self.__doc.createElement('genre') genreNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) genresNode.appendChild(genreNode) natsNode = self.__doc.createElement('nationalitys') natNode = self.__doc.createElement('nat') natNode.appendChild(self.__doc.createTextNode(unicode(d['nat'], 'latin-1').encode('utf-8'))) natsNode.appendChild(natNode) castsNode = self.__doc.createElement('casts') for g in d['actors']: castNode = self.__doc.createElement('cast') col1Node = self.__doc.createElement('column') col2Node = self.__doc.createElement('column') col1Node.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) castNode.appendChild(col1Node) castNode.appendChild(col2Node) castsNode.appendChild(castNode) dirsNode = self.__doc.createElement('directors') for g in d['dirs']: dirNode = self.__doc.createElement('director') dirNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) dirsNode.appendChild(dirNode) timeNode = self.__doc.createElement('running-time') timeNode.appendChild(self.__doc.createTextNode(unicode(d['time'], 'latin-1').encode('utf-8'))) allocineNode = self.__doc.createElement(unicode('allociné-link', 'latin-1').encode('utf-8')) allocineNode.appendChild(self.__doc.createTextNode(unicode(d['allocine'], 'latin-1').encode('utf-8'))) plotNode = self.__doc.createElement('plot') plotNode.appendChild(self.__doc.createTextNode(unicode(d['plot'], 'latin-1').encode('utf-8'))) if d['image']: imageNode = self.__doc.createElement('image') imageNode.setAttribute('format', 'JPEG') imageNode.setAttribute('id', d['image'][0]) imageNode.setAttribute('width', '120') 
imageNode.setAttribute('height', '160') imageNode.appendChild(self.__doc.createTextNode(unicode(d['image'][1], 'latin-1').encode('utf-8'))) coverNode = self.__doc.createElement('cover') coverNode.appendChild(self.__doc.createTextNode(d['image'][0])) for name in ( 'titleNode', 'otitleNode', 'yearNode', 'genresNode', 'natsNode', 'castsNode', 'dirsNode', 'timeNode', 'allocineNode', 'plotNode' ): entryNode.appendChild(eval(name)) if d['image']: entryNode.appendChild(coverNode) self.__images.appendChild(imageNode) self.__collection.appendChild(entryNode) self.__currentId += 1 def printXML(self): """ Outputs XML content to stdout """ self.__collection.appendChild(self.__images) print XML_HEADER; print DOCTYPE print self.__root.toxml() class AlloCineParser: def __init__(self): self.__baseURL = 'http://www.allocine.fr' self.__basePath = '/film/fichefilm_gen_cfilm' self.__searchURL= 'http://www.allocine.fr/recherche/?motcle=%s&f=3&rub=1' self.__movieURL = self.__baseURL + self.__basePath # Define some regexps self.__regExps = { 'title' : '(?P<title>.+?)', 'dirs' : 'Réalisé par (?P.+?).*?', 'actors' : '

Avec *(?P.+)  ', 'nat' : '

Film *(?P.+?)[,\.]', 'genres' : '

Genre *: *(?P.+?)

', 'time' : '

Durée *: *(?P[0-9])?h *(?P[0-9]{1,2})min', 'year' : 'Année de production *: *(?P[0-9]{4})', # Original movie title 'otitle' : 'Titre original *: *(?P.+?)', 'plot' : """(?s)

*(?P.+?) *

""", 'image' : """(?P.*?)</a>""" % self.__basePath, self.__data) if not matchList: return None return matchList def __fetchMovieInfo(self, url): """ Looks for movie information """ self.__getHTMLContent(url) matches = data = {} for name, regexp in self.__regExps.iteritems(): if name == 'image': matches[name] = re.findall(self.__regExps[name], self.__data, re.S | re.I) else: matches[name] = re.search(regexp, self.__data) if matches[name]: if name == 'title': data[name] = matches[name].group('title').strip() elif name == 'dirs': dirsList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') data[name] = [] for d in dirsList: data[name].append(d.strip()) elif name == 'actors': actorsList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') data[name] = [] for d in actorsList: data[name].append(d.strip()) elif name == 'nat': data[name] = matches[name].group('nat').strip() elif name == 'genres': genresList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') data[name] = [] for d in genresList: data[name].append(d.strip()) elif name == 'time': h, m = matches[name].group('hours'), matches[name].group('mins') totmin = int(h)*60+int(m) data[name] = str(totmin) elif name == 'year': data[name] = matches[name].group('year').strip() elif name == 'otitle': data[name] = matches[name].group('otitle').strip() elif name == 'plot': data[name] = matches[name].group('plot').strip() # Image path elif name == 'image': # Save image to a temporary folder md5 = genMD5() imObj = urllib2.urlopen(matches[name][0].strip()) img = imObj.read() imObj.close() imgPath = "/tmp/%s.jpeg" % md5 try: f = open(imgPath, 'w') f.write(img) f.close() except: # Could be great if we can pass exit code and some message # to tellico in case of failure... pass data[name] = (md5 + '.jpeg', base64.encodestring(img)) # Delete temporary image try: os.remove(imgPath) except: # Could be great if we can pass exit code and some msg # to tellico in case of failure... 
pass else: matches[name] = '' return data def __getMovie(self, title): if not len(title): return self.__title = title self.__getHTMLContent(self.__searchURL % urllib.quote(self.__title)) # Get all links links = self.__fetchMovieLinks() # Now retrieve infos if links: for entry in links: data = self.__fetchMovieInfo( url = "%s=%s" % (self.__movieURL, entry[0]) ) # Add allocine link (custom field) data['allocine'] = "%s=%s" % (self.__movieURL, entry[0]) self.__domTree.addEntry(data) else: return None def showUsage(): print "Usage: %s movietitle" % sys.argv[0] sys.exit(1) def main(): if len(sys.argv) < 2: showUsage() parser = AlloCineParser() parser.run(sys.argv[1]) if __name__ == '__main__': main()