IGN:Add a plugin to download social metadata (tags/rating/review) etc. from Amazon

This commit is contained in:
Kovid Goyal 2009-11-11 17:10:03 -07:00
parent 289455c1d7
commit 7e05464776
8 changed files with 196 additions and 46 deletions

View File

@ -193,7 +193,7 @@ def extract(path, dir):
raise Exception('Unknown archive type')
extractor(path, dir)
def get_proxies():
def get_proxies(debug=True):
proxies = {}
for q in ('http', 'ftp'):
@ -226,10 +226,40 @@ def get_proxies():
if len(proxies[x]) < 5:
prints('Removing invalid', x, 'proxy:', proxies[x])
del proxies[x]
if proxies:
if proxies and debug:
prints('Using proxies:', proxies)
return proxies
def get_parsed_proxy(typ='http', debug=True):
    '''
    Return the configured proxy of type `typ` split into its components.

    :param typ: Key to look up in the mapping returned by
                :func:`get_proxies`, for example ``'http'`` or ``'ftp'``.
                It is also the optional ``typ://`` prefix accepted in front
                of the proxy string.
    :param debug: If True, print the parsed proxy and any traceback raised
                  while converting it. Also passed on to
                  :func:`get_proxies`.
    :return: A dict with the keys ``host``, ``port``, ``user`` and ``pass``.
             ``port`` is converted to an int when present; parts missing
             from the proxy string are None. Returns None when no proxy of
             this type is configured or the proxy string cannot be parsed.
    '''
    proxies = get_proxies(debug)
    if typ not in proxies:
        return None

    # [typ://][user:pass@]host[:port]
    pattern = re.compile((
        r'(?:ptype://)?'
        r'(?:(?P<user>\w+):(?P<pass>.*)@)?'
        r'(?P<host>[\w\-\.]+)'
        r'(?::(?P<port>\d+))?').replace('ptype', typ)
    )

    # Index by the value of typ, not by the literal string 'typ'.
    match = pattern.match(proxies[typ])
    if not match:
        return None

    try:
        ans = {
            'host': match.group('host'),
            'port': match.group('port'),
            'user': match.group('user'),
            'pass': match.group('pass'),
        }
        if ans['port']:
            ans['port'] = int(ans['port'])
    except Exception:
        # Best effort: a proxy that cannot be converted is reported (when
        # debugging) and otherwise treated as "no proxy".
        if debug:
            traceback.print_exc()
        return None

    if debug:
        # Report the proxy type that was actually requested.
        prints('Using %s proxy' % typ, ans)
    return ans
def browser(honor_time=True, max_time=2, mobile_browser=False):
'''

View File

@ -374,8 +374,8 @@ from calibre.devices.eslick.driver import ESLICK
from calibre.devices.nuut2.driver import NUUT2
from calibre.devices.iriver.driver import IRIVER_STORY
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB
plugins = [HTML2ZIP, GoogleBooks, ISBNDB]
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon
plugins = [HTML2ZIP, GoogleBooks, ISBNDB, Amazon]
plugins += [
ComicInput,
EPUBInput,

View File

@ -90,9 +90,10 @@ def output_profiles():
if isinstance(plugin, OutputProfile):
yield plugin
def metadata_sources(customize=True, isbndb_key=None):
def metadata_sources(metadata_type='basic', customize=True, isbndb_key=None):
for plugin in _initialized_plugins:
if isinstance(plugin, MetadataSource):
if isinstance(plugin, MetadataSource) and \
plugin.metadata_type == metadata_type:
if is_disabled(plugin):
continue
if customize:

View File

@ -6,45 +6,83 @@ __docformat__ = 'restructuredtext en'
'''
Fetch metadata using Amazon AWS
'''
import re
import sys, re
from datetime import datetime
from lxml import etree
from dateutil import parser
from calibre import browser
from calibre.ebooks.metadata import MetaInformation, string_to_authors
AWS_NS = 'http://webservices.amazon.com/AWSECommerceService/2005-10-05'


def AWS(tag):
    '''
    Return `tag` prefixed with the AWS XML namespace, written as
    ``{namespace}tag``.
    '''
    return '{%(ns)s}%(tag)s' % {'ns': AWS_NS, 'tag': tag}
def check_for_errors(root):
    '''
    Raise an Exception if the parsed response `root` contains an AWS
    ``Error`` element anywhere below it; otherwise do nothing.

    :param root: Parsed XML element to search.
    :raises Exception: With the text content of the first ``Error`` element
                       appended to the message.
    '''
    error_node = root.find('.//'+AWS('Error'))
    if error_node is None:
        return
    details = etree.tostring(error_node, method='text', pretty_print=True,
            encoding=unicode)
    raise Exception('Failed to get metadata with error: ' + details)
def get_social_metadata(title, authors, publisher, isbn):
    '''
    Download metadata for the book identified by `isbn`.

    The ISBN is appended to the calibre AWS metadata URL and the XML that
    comes back is searched for title, authors, publisher, publication date,
    rating, tags and the editorial review.

    :param title: Title used to create the returned object. It is replaced
                  by the ``Title`` found in the response when a lookup is
                  made.
    :param authors: Authors used to create the returned object. They are
                    replaced when the response lists any authors.
    :param publisher: Not used by this function.
    :param isbn: ISBN to look up. If empty, no request is made.
    :return: A :class:`MetaInformation` object.
    :raises Exception: From :func:`check_for_errors` when the response
                       contains an ``Error`` element.
    '''
    mi = MetaInformation(title, authors)
    if not isbn:
        return mi

    br = browser()
    response_xml = br.open('http://status.calibre-ebook.com/aws/metadata/'+isbn).read()
    root = etree.fromstring(response_xml)
    check_for_errors(root)

    mi.title = root.findtext('.//'+AWS('Title'))
    # Elements without text have .text == None; skip them.
    authors = [x.text for x in root.findall('.//'+AWS('Author')) if x.text]
    if authors:
        mi.authors = []
        for x in authors:
            mi.authors.extend(string_to_authors(x))
    mi.publisher = root.findtext('.//'+AWS('Publisher'))

    try:
        d = root.findtext('.//'+AWS('PublicationDate'))
        if d:
            # Date fields missing from the string are taken from `default`:
            # the current UTC year and month, and the 15th as the day.
            default = datetime.utcnow()
            default = datetime(default.year, default.month, 15)
            # findtext() already returns the text itself, so parse it
            # directly.
            mi.pubdate = parser.parse(d, default=default)
    except Exception:
        # Best effort: an unparseable date just leaves pubdate unset.
        pass

    try:
        rating = float(root.findtext('.//'+AWS('AverageRating')))
        num_of_reviews = int(root.findtext('.//'+AWS('TotalReviews')))
        # Only accept a rating that is backed by more than four reviews.
        if num_of_reviews > 4 and rating > 0 and rating < 5:
            mi.rating = rating
    except Exception:
        # Best effort: missing or malformed rating data is ignored.
        pass

    tags = [x.text for x in root.findall('.//%s/%s'%(AWS('Subjects'),
        AWS('Subject'))) if x.text]
    if tags:
        mi.tags = []
        for x in tags:
            # A subject such as "A / B" contributes the tags "A" and "B".
            mi.tags.extend([y.strip() for y in x.split('/')])

    comments = root.find('.//%s/%s'%(AWS('EditorialReview'),
        AWS('Content')))
    if comments is not None:
        mi.comments = etree.tostring(comments,
                method='text', encoding=unicode)
        # Turn the HTML markup left in the review text into plain text:
        # paragraph and line breaks become blank lines, italic and bold
        # become * and **, and every other tag is dropped.
        mi.comments = re.sub('<([pP]|DIV)>', '\n\n', mi.comments)
        mi.comments = re.sub('</?[iI]>', '*', mi.comments)
        mi.comments = re.sub('</?[bB]>', '**', mi.comments)
        mi.comments = re.sub('<BR>', '\n\n', mi.comments)
        mi.comments = re.sub('<[^>]+>', '', mi.comments)
        mi.comments = mi.comments.strip()
        mi.comments = _('EDITORIAL REVIEW')+':\n\n'+mi.comments

    return mi
BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=%(key)s&Operation=ItemLookup&ItemId=1416551727&ResponseGroup=%(group)s'
import sys
def get_rating(isbn, key):
br = browser()
url = BASE_URL%dict(key=key, group='Reviews')
raw = br.open(url).read()
match = re.search(r'<AverageRating>([\d.]+)</AverageRating>', raw)
if match:
return float(match.group(1))
def get_cover_url(isbn, key):
br = browser()
url = BASE_URL%dict(key=key, group='Images')
raw = br.open(url).read()
match = re.search(r'<LargeImage><URL>(.+?)</URL>', raw)
if match:
return match.group(1)
def get_editorial_review(isbn, key):
br = browser()
url = BASE_URL%dict(key=key, group='EditorialReview')
raw = br.open(url).read()
match = re.compile(r'<EditorialReview>.*?<Content>(.+?)</Content>', re.DOTALL).search(raw)
if match:
return match.group(1)
def main(args=sys.argv):
print 'Rating:', get_rating(args[1], args[2])
print 'Cover:', get_rating(args[1], args[2])
print 'EditorialReview:', get_editorial_review(args[1], args[2])
print get_social_metadata(None, None, None, '9781416551720')
return 0
if __name__ == '__main__':
sys.exit(main())
sys.exit(main())

View File

@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
import traceback, sys, textwrap, re
from threading import Thread
from calibre import preferred_encoding
from calibre import prints
from calibre.utils.config import OptionParser
from calibre.utils.logging import default_log
@ -15,7 +15,14 @@ from calibre.customize import Plugin
class MetadataSource(Plugin):
author = 'Kovid Goyal'
supported_platforms = ['windows', 'osx', 'linux']
#: The type of metadata fetched. 'basic' means basic metadata like
#: title/author/isbn/etc. 'social' means social metadata like
#: tags/rating/reviews/etc.
metadata_type = 'basic'
type = _('Metadata download')
def __call__(self, title, author, publisher, isbn, verbose, log=None,
@ -49,6 +56,7 @@ class MetadataSource(Plugin):
def join(self):
return self.worker.join()
class GoogleBooks(MetadataSource):
name = 'Google Books'
@ -104,6 +112,22 @@ class ISBNDB(MetadataSource):
ans = ans.replace('%s', '')
return ans
class Amazon(MetadataSource):
    '''Metadata source of type 'social' that fetches its results from Amazon.'''

    name = 'Amazon'
    metadata_type = 'social'

    def fetch(self):
        '''
        Look the book up by ISBN and store the outcome on the instance.

        Does nothing when no ISBN is set. On success the fetched metadata
        is stored in ``self.results``; on failure the exception is stored
        in ``self.exception`` and the formatted traceback in ``self.tb``.
        '''
        if self.isbn:
            # Imported here rather than at module level.
            from calibre.ebooks.metadata.amazon import get_social_metadata
            try:
                self.results = get_social_metadata(self.title, self.author,
                        self.publisher, self.isbn)
            except Exception as err:
                self.exception = err
                self.tb = traceback.format_exc()
def result_index(source, result):
if not result.isbn:
return -1
@ -134,16 +158,56 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
fetcher(title, author, publisher, isbn, verbose)
for fetcher in fetchers:
fetcher.join()
results = list(fetchers[0].results)
for fetcher in fetchers[1:]:
merge_results(fetchers[0].results, fetcher.results)
merge_results(results, fetcher.results)
results = sorted(fetchers[0].results, cmp=lambda x, y : cmp(
results = sorted(results, cmp=lambda x, y : cmp(
(x.comments.strip() if x.comments else ''),
(y.comments.strip() if y.comments else '')
), reverse=True)
return results, [(x.name, x.exception, x.tb) for x in fetchers]
def get_social_metadata(mi, verbose=0):
    '''
    Run every metadata source of type 'social' for the book described by
    `mi` and merge what the sources return into `mi` in place.

    :param mi: Object whose title, authors, publisher and isbn are passed
               to the sources. Its rating, tags, pubdate and comments may
               be updated.
    :param verbose: Passed through to each metadata source.
    :return: A list of ``(name, exception, tb)`` tuples, one per source.
    '''
    from calibre.customize.ui import metadata_sources

    sources = list(metadata_sources(metadata_type='social'))
    # Call every source first, then join them all.
    for source in sources:
        source(mi.title, mi.authors, mi.publisher, mi.isbn, verbose)
    for source in sources:
        source.join()

    all_ratings, all_tags, all_comments = [], set(), set()
    for source in sources:
        if not source.results:
            continue
        found = source.results
        if found.rating is not None:
            all_ratings.append(found.rating)
        if found.tags:
            all_tags.update(found.tags)
        # A downloaded pubdate only fills a gap, it never overrides.
        if mi.pubdate is None and found.pubdate is not None:
            mi.pubdate = found.pubdate
        if found.comments:
            all_comments.add(found.comments)

    if all_ratings:
        average = sum(all_ratings)/float(len(all_ratings))
        # An existing rating is averaged with the downloaded one.
        mi.rating = average if mi.rating is None else (mi.rating + average)/2.0

    if all_tags:
        if not mi.tags:
            mi.tags = []
        mi.tags += list(all_tags)
        # De-duplicate and sort the combined tags.
        mi.tags = sorted(set(mi.tags))

    if all_comments:
        # Existing comments are replaced; each downloaded comment is
        # preceded by a blank line.
        mi.comments = ''.join('\n\n'+text for text in all_comments)

    return [(source.name, source.exception, source.tb) for source in sources]
def option_parser():
parser = OptionParser(textwrap.dedent(
@ -174,11 +238,13 @@ def main(args=sys.argv):
opts, args = parser.parse_args(args)
results, exceptions = search(opts.title, opts.author, opts.publisher,
opts.isbn, opts.isbndb_key, opts.verbose)
social_exceptions = []
for result in results:
print unicode(result).encode(preferred_encoding)
social_exceptions.extend(get_social_metadata(result, opts.verbose))
prints(unicode(result))
print
for name, exception, tb in exceptions:
for name, exception, tb in exceptions+social_exceptions:
if exception is not None:
print 'WARNING: Fetching from', name, 'failed with error:'
print exception

View File

@ -135,7 +135,11 @@ class ResultList(list):
def get_tags(self, entry, verbose):
try:
tags = [x.text for x in subject(entry)]
btags = [x.text for x in subject(entry)]
tags = []
for t in btags:
tags.extend([y.strip() for y in t.split('/')])
tags = list(sorted(list(set(tags))))
except:
report(verbose)
tags = []

View File

@ -125,7 +125,16 @@ def create_books(opts, args, timeout=5.):
if opts.verbose:
print ('ISBNDB query: '+url)
return [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
ans = []
for x in tans:
add = True
for y in ans:
if y.isbn == x.isbn:
add = False
if add:
ans.append(x)
return ans
def main(args=sys.argv):
parser = option_parser()

View File

@ -122,6 +122,8 @@ Metadata download plugins
:class:`MetaInformation` objects. If there is an error, it should be stored
in `self.exception` and `self.tb` (for the traceback).
.. automember:: calibre.ebooks.metadata.fetch.MetadataSource.metadata_type
.. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.fetch
.. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.is_ok