IGN:Add a plugin to download social metadata (tags/rating/review) etc. from Amazon

Kovid Goyal 2009-11-11 17:10:03 -07:00
parent 289455c1d7
commit 7e05464776
8 changed files with 196 additions and 46 deletions

View File

@@ -193,7 +193,7 @@ def extract(path, dir):
         raise Exception('Unknown archive type')
     extractor(path, dir)

-def get_proxies():
+def get_proxies(debug=True):
     proxies = {}
     for q in ('http', 'ftp'):

@@ -226,10 +226,40 @@ def get_proxies():
         if len(proxies[x]) < 5:
            prints('Removing invalid', x, 'proxy:', proxies[x])
            del proxies[x]

-    if proxies:
+    if proxies and debug:
        prints('Using proxies:', proxies)
    return proxies

+def get_parsed_proxy(typ='http', debug=True):
+    proxies = get_proxies(debug)
+    if typ not in proxies:
+        return
+    pattern = re.compile((
+        '(?:ptype://)?' \
+        '(?:(?P<user>\w+):(?P<pass>.*)@)?' \
+        '(?P<host>[\w\-\.]+)' \
+        '(?::(?P<port>\d+))?').replace('ptype', typ)
+    )
+    match = pattern.match(proxies[typ])
+    if match:
+        try:
+            ans = {
+                'host' : match.group('host'),
+                'port' : match.group('port'),
+                'user' : match.group('user'),
+                'pass' : match.group('pass')
+            }
+            if ans['port']:
+                ans['port'] = int(ans['port'])
+        except:
+            if debug:
+                traceback.print_exc()
+            return
+        if debug:
+            prints('Using http proxy', ans)
+        return ans
+
 def browser(honor_time=True, max_time=2, mobile_browser=False):
     '''
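As a rough illustration (not part of the commit), the sketch below shows what the regular expression in the new get_parsed_proxy() is expected to pull out of a proxy specification of the form [user:pass@]host[:port]; the proxy URL used here is made up:

    import re

    typ = 'http'
    pattern = re.compile((
        '(?:ptype://)?'
        '(?:(?P<user>\w+):(?P<pass>.*)@)?'
        '(?P<host>[\w\-\.]+)'
        '(?::(?P<port>\d+))?').replace('ptype', typ))

    # A hypothetical value, as might be found in the http_proxy environment variable
    match = pattern.match('http://jane:secret@proxy.example.com:3128')
    if match is not None:
        # Prints: jane proxy.example.com 3128
        print match.group('user'), match.group('host'), int(match.group('port'))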

View File

@@ -374,8 +374,8 @@ from calibre.devices.eslick.driver import ESLICK
 from calibre.devices.nuut2.driver import NUUT2
 from calibre.devices.iriver.driver import IRIVER_STORY
-from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB
+from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon
-plugins = [HTML2ZIP, GoogleBooks, ISBNDB]
+plugins = [HTML2ZIP, GoogleBooks, ISBNDB, Amazon]
 plugins += [
     ComicInput,
     EPUBInput,

View File

@@ -90,9 +90,10 @@ def output_profiles():
         if isinstance(plugin, OutputProfile):
             yield plugin

-def metadata_sources(customize=True, isbndb_key=None):
+def metadata_sources(metadata_type='basic', customize=True, isbndb_key=None):
     for plugin in _initialized_plugins:
-        if isinstance(plugin, MetadataSource):
+        if isinstance(plugin, MetadataSource) and \
+                plugin.metadata_type == metadata_type:
             if is_disabled(plugin):
                 continue
             if customize:
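A minimal sketch of the effect of the new metadata_type filter (assuming the plugins registered in builtins.py above and that none of them have been disabled):

    from calibre.customize.ui import metadata_sources

    # 'basic' sources (Google Books, ISBNDB) are returned by default; only
    # plugins declaring metadata_type == 'social' (here, Amazon) are returned
    # when social metadata is requested.
    basic  = [p.name for p in metadata_sources(customize=False)]
    social = [p.name for p in metadata_sources(metadata_type='social', customize=False)]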

View File

@@ -6,45 +6,83 @@ __docformat__ = 'restructuredtext en'
 '''
 Fetch metadata using Amazon AWS
 '''
-import re
+import sys, re
+from datetime import datetime
+
+from lxml import etree
+from dateutil import parser
+
 from calibre import browser
+from calibre.ebooks.metadata import MetaInformation, string_to_authors
+
+AWS_NS = 'http://webservices.amazon.com/AWSECommerceService/2005-10-05'
+
+def AWS(tag):
+    return '{%s}%s'%(AWS_NS, tag)
+
+def check_for_errors(root):
+    err = root.find('.//'+AWS('Error'))
+    if err is not None:
+        raise Exception('Failed to get metadata with error: '\
+                + etree.tostring(err, method='text', pretty_print=True,
+                    encoding=unicode))
+
+def get_social_metadata(title, authors, publisher, isbn):
+    mi = MetaInformation(title, authors)
+    if isbn:
+        br = browser()
+        response_xml = br.open('http://status.calibre-ebook.com/aws/metadata/'+isbn).read()
+        root = etree.fromstring(response_xml)
+        check_for_errors(root)
+        mi.title = root.findtext('.//'+AWS('Title'))
+        authors = [x.text for x in root.findall('.//'+AWS('Author'))]
+        if authors:
+            mi.authors = []
+            for x in authors:
+                mi.authors.extend(string_to_authors(x))
+        mi.publisher = root.findtext('.//'+AWS('Publisher'))
+        try:
+            d = root.findtext('.//'+AWS('PublicationDate'))
+            if d:
+                default = datetime.utcnow()
+                default = datetime(default.year, default.month, 15)
+                d = parser.parse(d, default=default)
+                mi.pubdate = d
+        except:
+            pass
+        try:
+            rating = float(root.findtext('.//'+AWS('AverageRating')))
+            num_of_reviews = int(root.findtext('.//'+AWS('TotalReviews')))
+            if num_of_reviews > 4 and rating > 0 and rating < 5:
+                mi.rating = rating
+        except:
+            pass
+        tags = [x.text for x in root.findall('.//%s/%s'%(AWS('Subjects'),
+            AWS('Subject')))]
+        if tags:
+            mi.tags = []
+            for x in tags:
+                mi.tags.extend([y.strip() for y in x.split('/')])
+        comments = root.find('.//%s/%s'%(AWS('EditorialReview'),
+            AWS('Content')))
+        if comments is not None:
+            mi.comments = etree.tostring(comments,
+                    method='text', encoding=unicode)
+            mi.comments = re.sub('<([pP]|DIV)>', '\n\n', mi.comments)
+            mi.comments = re.sub('</?[iI]>', '*', mi.comments)
+            mi.comments = re.sub('</?[bB]>', '**', mi.comments)
+            mi.comments = re.sub('<BR>', '\n\n', mi.comments)
+            mi.comments = re.sub('<[^>]+>', '', mi.comments)
+            mi.comments = mi.comments.strip()
+            mi.comments = _('EDITORIAL REVIEW')+':\n\n'+mi.comments
+    return mi
+
-BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=%(key)s&Operation=ItemLookup&ItemId=1416551727&ResponseGroup=%(group)s'
-import sys
-
-def get_rating(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='Reviews')
-    raw = br.open(url).read()
-    match = re.search(r'<AverageRating>([\d.]+)</AverageRating>', raw)
-    if match:
-        return float(match.group(1))
-
-def get_cover_url(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='Images')
-    raw = br.open(url).read()
-    match = re.search(r'<LargeImage><URL>(.+?)</URL>', raw)
-    if match:
-        return match.group(1)
-
-def get_editorial_review(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='EditorialReview')
-    raw = br.open(url).read()
-    match = re.compile(r'<EditorialReview>.*?<Content>(.+?)</Content>', re.DOTALL).search(raw)
-    if match:
-        return match.group(1)
-
 def main(args=sys.argv):
-    print 'Rating:', get_rating(args[1], args[2])
-    print 'Cover:', get_rating(args[1], args[2])
-    print 'EditorialReview:', get_editorial_review(args[1], args[2])
+    print get_social_metadata(None, None, None, '9781416551720')
     return 0

 if __name__ == '__main__':
     sys.exit(main())
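A rough usage sketch of the new module (it needs network access to the status.calibre-ebook.com gateway used above; the ISBN is the one hard-coded in main()):

    from calibre.ebooks.metadata.amazon import get_social_metadata

    # Returns a MetaInformation object whose social fields are filled from AWS
    mi = get_social_metadata(None, None, None, '9781416551720')
    print mi.rating     # average rating, only set when there are enough reviews
    print mi.tags       # Amazon subjects, split on '/'
    print mi.comments   # editorial review, prefixed with 'EDITORIAL REVIEW:'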

View File

@@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
 import traceback, sys, textwrap, re
 from threading import Thread

-from calibre import preferred_encoding
+from calibre import prints
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log

@@ -15,7 +15,14 @@ from calibre.customize import Plugin
 class MetadataSource(Plugin):

     author = 'Kovid Goyal'
     supported_platforms = ['windows', 'osx', 'linux']
+
+    #: The type of metadata fetched. 'basic' means basic metadata like
+    #: title/author/isbn/etc. 'social' means social metadata like
+    #: tags/rating/reviews/etc.
+    metadata_type = 'basic'
+
     type = _('Metadata download')

     def __call__(self, title, author, publisher, isbn, verbose, log=None,

@@ -49,6 +56,7 @@ class MetadataSource(Plugin):
     def join(self):
         return self.worker.join()
+
 class GoogleBooks(MetadataSource):

     name = 'Google Books'
@@ -104,6 +112,22 @@ class ISBNDB(MetadataSource):
             ans = ans.replace('%s', '')
         return ans

+class Amazon(MetadataSource):
+
+    name = 'Amazon'
+    metadata_type = 'social'
+
+    def fetch(self):
+        if not self.isbn:
+            return
+        from calibre.ebooks.metadata.amazon import get_social_metadata
+        try:
+            self.results = get_social_metadata(self.title, self.author,
+                    self.publisher, self.isbn)
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
 def result_index(source, result):
     if not result.isbn:
         return -1
@@ -134,16 +158,56 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
         fetcher(title, author, publisher, isbn, verbose)
     for fetcher in fetchers:
         fetcher.join()
+
+    results = list(fetchers[0].results)
     for fetcher in fetchers[1:]:
-        merge_results(fetchers[0].results, fetcher.results)
+        merge_results(results, fetcher.results)

-    results = sorted(fetchers[0].results, cmp=lambda x, y : cmp(
+    results = sorted(results, cmp=lambda x, y : cmp(
            (x.comments.strip() if x.comments else ''),
            (y.comments.strip() if y.comments else '')
            ), reverse=True)

     return results, [(x.name, x.exception, x.tb) for x in fetchers]

+def get_social_metadata(mi, verbose=0):
+    from calibre.customize.ui import metadata_sources
+    fetchers = list(metadata_sources(metadata_type='social'))
+    for fetcher in fetchers:
+        fetcher(mi.title, mi.authors, mi.publisher, mi.isbn, verbose)
+    for fetcher in fetchers:
+        fetcher.join()
+    ratings, tags, comments = [], set([]), set([])
+    for fetcher in fetchers:
+        if fetcher.results:
+            dmi = fetcher.results
+            if dmi.rating is not None:
+                ratings.append(dmi.rating)
+            if dmi.tags:
+                for t in dmi.tags:
+                    tags.add(t)
+            if mi.pubdate is None and dmi.pubdate is not None:
+                mi.pubdate = dmi.pubdate
+            if dmi.comments:
+                comments.add(dmi.comments)
+    if ratings:
+        rating = sum(ratings)/float(len(ratings))
+        if mi.rating is None:
+            mi.rating = rating
+        else:
+            mi.rating = (mi.rating + rating)/2.0
+    if tags:
+        if not mi.tags:
+            mi.tags = []
+        mi.tags += list(tags)
+        mi.tags = list(sorted(list(set(mi.tags))))
+    if comments:
+        mi.comments = ''
+        for x in comments:
+            mi.comments += '\n\n'+x
+    return [(x.name, x.exception, x.tb) for x in fetchers]
+
 def option_parser():
     parser = OptionParser(textwrap.dedent(
@@ -174,11 +238,13 @@ def main(args=sys.argv):
     opts, args = parser.parse_args(args)
     results, exceptions = search(opts.title, opts.author, opts.publisher,
                                  opts.isbn, opts.isbndb_key, opts.verbose)
+    social_exceptions = []
     for result in results:
-        print unicode(result).encode(preferred_encoding)
+        social_exceptions.extend(get_social_metadata(result, opts.verbose))
+        prints(unicode(result))
         print

-    for name, exception, tb in exceptions:
+    for name, exception, tb in exceptions+social_exceptions:
         if exception is not None:
             print 'WARNING: Fetching from', name, 'failed with error:'
             print exception
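A minimal sketch of how the new aggregator is meant to be driven, mirroring the main() change above (the ISBN is reused from the Amazon module's test value):

    from calibre.ebooks.metadata.fetch import search, get_social_metadata

    results, exceptions = search(isbn='9781416551720')
    social_exceptions = []
    for mi in results:
        # Merges rating/tags/pubdate/comments from all 'social' sources into mi in place
        social_exceptions.extend(get_social_metadata(mi))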

View File

@@ -135,7 +135,11 @@ class ResultList(list):
     def get_tags(self, entry, verbose):
         try:
-            tags = [x.text for x in subject(entry)]
+            btags = [x.text for x in subject(entry)]
+            tags = []
+            for t in btags:
+                tags.extend([y.strip() for y in t.split('/')])
+            tags = list(sorted(list(set(tags))))
         except:
             report(verbose)
             tags = []
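For instance, the new splitting turns a single Google Books subject entry into several tags; a standalone illustration with made-up subject strings:

    btags = ['Fiction / Science Fiction', 'Fiction']
    tags = []
    for t in btags:
        tags.extend([y.strip() for y in t.split('/')])
    tags = list(sorted(list(set(tags))))
    # Prints: ['Fiction', 'Science Fiction']
    print tags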

View File

@@ -125,7 +125,16 @@ def create_books(opts, args, timeout=5.):
     if opts.verbose:
         print ('ISBNDB query: '+url)

-    return [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
+    tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
+    ans = []
+    for x in tans:
+        add = True
+        for y in ans:
+            if y.isbn == x.isbn:
+                add = False
+        if add:
+            ans.append(x)
+    return ans

 def main(args=sys.argv):
     parser = option_parser()
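The returned list is now de-duplicated by ISBN; a hypothetical standalone illustration of the loop above, with Record standing in for ISBNDBMetadata and made-up ISBNs:

    class Record(object):
        def __init__(self, isbn):
            self.isbn = isbn

    tans = [Record('9781416551720'), Record('9781416551720'), Record('9780000000000')]
    ans = []
    for x in tans:
        add = True
        for y in ans:
            if y.isbn == x.isbn:
                add = False
        if add:
            ans.append(x)
    # Prints: ['9781416551720', '9780000000000']
    print [r.isbn for r in ans]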

View File

@@ -122,6 +122,8 @@ Metadata download plugins
    :class:`MetaInformation` objects. If there is an error, it should be stored
    in `self.exception` and `self.tb` (for the traceback).

+.. automember:: calibre.ebooks.metadata.fetch.MetadataSource.metadata_type
+
 .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.fetch
 .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.is_ok