IGN:Add a plugin to download social metadata (tags/rating/review) etc. from Amazon

Kovid Goyal 2009-11-11 17:10:03 -07:00
parent 289455c1d7
commit 7e05464776
8 changed files with 196 additions and 46 deletions

View File

@@ -193,7 +193,7 @@ def extract(path, dir):
         raise Exception('Unknown archive type')
     extractor(path, dir)

-def get_proxies():
+def get_proxies(debug=True):
     proxies = {}
     for q in ('http', 'ftp'):

@@ -226,10 +226,40 @@ def get_proxies():
         if len(proxies[x]) < 5:
            prints('Removing invalid', x, 'proxy:', proxies[x])
            del proxies[x]

-    if proxies:
+    if proxies and debug:
        prints('Using proxies:', proxies)
    return proxies

+def get_parsed_proxy(typ='http', debug=True):
+    proxies = get_proxies(debug)
+    if typ not in proxies:
+        return
+    pattern = re.compile((
+        '(?:ptype://)?' \
+        '(?:(?P<user>\w+):(?P<pass>.*)@)?' \
+        '(?P<host>[\w\-\.]+)' \
+        '(?::(?P<port>\d+))?').replace('ptype', typ)
+    )
+    match = pattern.match(proxies[typ])
+    if match:
+        try:
+            ans = {
+                'host' : match.group('host'),
+                'port' : match.group('port'),
+                'user' : match.group('user'),
+                'pass' : match.group('pass')
+            }
+            if ans['port']:
+                ans['port'] = int(ans['port'])
+        except:
+            if debug:
+                traceback.print_exc()
+            return
+        if debug:
+            prints('Using http proxy', ans)
+        return ans
+
 def browser(honor_time=True, max_time=2, mobile_browser=False):
     '''
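As a rough illustration (not part of the commit), the sketch below shows what the regular expression in the new get_parsed_proxy() is expected to pull out of a proxy specification of the form [user:pass@]host[:port]; the proxy URL used here is made up:

    import re

    typ = 'http'
    pattern = re.compile((
        '(?:ptype://)?'
        '(?:(?P<user>\w+):(?P<pass>.*)@)?'
        '(?P<host>[\w\-\.]+)'
        '(?::(?P<port>\d+))?').replace('ptype', typ))

    # A hypothetical value, as might be found in the http_proxy environment variable
    match = pattern.match('http://jane:secret@proxy.example.com:3128')
    if match is not None:
        # Prints: jane proxy.example.com 3128
        print match.group('user'), match.group('host'), int(match.group('port'))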

View File

@@ -374,8 +374,8 @@ from calibre.devices.eslick.driver import ESLICK
 from calibre.devices.nuut2.driver import NUUT2
 from calibre.devices.iriver.driver import IRIVER_STORY
-from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB
+from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon
-plugins = [HTML2ZIP, GoogleBooks, ISBNDB]
+plugins = [HTML2ZIP, GoogleBooks, ISBNDB, Amazon]
 plugins += [
     ComicInput,
     EPUBInput,

View File

@@ -90,9 +90,10 @@ def output_profiles():
         if isinstance(plugin, OutputProfile):
             yield plugin

-def metadata_sources(customize=True, isbndb_key=None):
+def metadata_sources(metadata_type='basic', customize=True, isbndb_key=None):
     for plugin in _initialized_plugins:
-        if isinstance(plugin, MetadataSource):
+        if isinstance(plugin, MetadataSource) and \
+                plugin.metadata_type == metadata_type:
             if is_disabled(plugin):
                 continue
             if customize:
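A minimal sketch of the effect of the new metadata_type filter (assuming the plugins registered in builtins.py above and that none of them have been disabled):

    from calibre.customize.ui import metadata_sources

    # 'basic' sources (Google Books, ISBNDB) are returned by default; only
    # plugins declaring metadata_type == 'social' (here, Amazon) are returned
    # when social metadata is requested.
    basic  = [p.name for p in metadata_sources(customize=False)]
    social = [p.name for p in metadata_sources(metadata_type='social', customize=False)]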

View File

@@ -6,45 +6,83 @@ __docformat__ = 'restructuredtext en'
 '''
 Fetch metadata using Amazon AWS
 '''
-import re
+import sys, re
+from datetime import datetime
+
+from lxml import etree
+from dateutil import parser
+
 from calibre import browser
+from calibre.ebooks.metadata import MetaInformation, string_to_authors
+
+AWS_NS = 'http://webservices.amazon.com/AWSECommerceService/2005-10-05'
+
+def AWS(tag):
+    return '{%s}%s'%(AWS_NS, tag)
+
+def check_for_errors(root):
+    err = root.find('.//'+AWS('Error'))
+    if err is not None:
+        raise Exception('Failed to get metadata with error: '\
+                + etree.tostring(err, method='text', pretty_print=True,
+                    encoding=unicode))
+
+def get_social_metadata(title, authors, publisher, isbn):
+    mi = MetaInformation(title, authors)
+    if isbn:
+        br = browser()
+        response_xml = br.open('http://status.calibre-ebook.com/aws/metadata/'+isbn).read()
+        root = etree.fromstring(response_xml)
+        check_for_errors(root)
+        mi.title = root.findtext('.//'+AWS('Title'))
+        authors = [x.text for x in root.findall('.//'+AWS('Author'))]
+        if authors:
+            mi.authors = []
+            for x in authors:
+                mi.authors.extend(string_to_authors(x))
+        mi.publisher = root.findtext('.//'+AWS('Publisher'))
+        try:
+            d = root.findtext('.//'+AWS('PublicationDate'))
+            if d:
+                default = datetime.utcnow()
+                default = datetime(default.year, default.month, 15)
+                d = parser.parse(d, default=default)
+                mi.pubdate = d
+        except:
+            pass
+        try:
+            rating = float(root.findtext('.//'+AWS('AverageRating')))
+            num_of_reviews = int(root.findtext('.//'+AWS('TotalReviews')))
+            if num_of_reviews > 4 and rating > 0 and rating < 5:
+                mi.rating = rating
+        except:
+            pass
+        tags = [x.text for x in root.findall('.//%s/%s'%(AWS('Subjects'),
+            AWS('Subject')))]
+        if tags:
+            mi.tags = []
+            for x in tags:
+                mi.tags.extend([y.strip() for y in x.split('/')])
+        comments = root.find('.//%s/%s'%(AWS('EditorialReview'),
+            AWS('Content')))
+        if comments is not None:
+            mi.comments = etree.tostring(comments,
+                    method='text', encoding=unicode)
+            mi.comments = re.sub('<([pP]|DIV)>', '\n\n', mi.comments)
+            mi.comments = re.sub('</?[iI]>', '*', mi.comments)
+            mi.comments = re.sub('</?[bB]>', '**', mi.comments)
+            mi.comments = re.sub('<BR>', '\n\n', mi.comments)
+            mi.comments = re.sub('<[^>]+>', '', mi.comments)
+            mi.comments = mi.comments.strip()
+            mi.comments = _('EDITORIAL REVIEW')+':\n\n'+mi.comments
+    return mi
+
-BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=%(key)s&Operation=ItemLookup&ItemId=1416551727&ResponseGroup=%(group)s'
-import sys
-
-def get_rating(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='Reviews')
-    raw = br.open(url).read()
-    match = re.search(r'<AverageRating>([\d.]+)</AverageRating>', raw)
-    if match:
-        return float(match.group(1))
-
-def get_cover_url(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='Images')
-    raw = br.open(url).read()
-    match = re.search(r'<LargeImage><URL>(.+?)</URL>', raw)
-    if match:
-        return match.group(1)
-
-def get_editorial_review(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='EditorialReview')
-    raw = br.open(url).read()
-    match = re.compile(r'<EditorialReview>.*?<Content>(.+?)</Content>', re.DOTALL).search(raw)
-    if match:
-        return match.group(1)
-
 def main(args=sys.argv):
-    print 'Rating:', get_rating(args[1], args[2])
-    print 'Cover:', get_rating(args[1], args[2])
-    print 'EditorialReview:', get_editorial_review(args[1], args[2])
+    print get_social_metadata(None, None, None, '9781416551720')
     return 0

 if __name__ == '__main__':
     sys.exit(main())
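A rough usage sketch of the new module (it needs network access to the status.calibre-ebook.com gateway used above; the ISBN is the one hard-coded in main()):

    from calibre.ebooks.metadata.amazon import get_social_metadata

    # Returns a MetaInformation object whose social fields are filled from AWS
    mi = get_social_metadata(None, None, None, '9781416551720')
    print mi.rating     # average rating, only set when there are enough reviews
    print mi.tags       # Amazon subjects, split on '/'
    print mi.comments   # editorial review, prefixed with 'EDITORIAL REVIEW:'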

View File

@@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
 import traceback, sys, textwrap, re
 from threading import Thread

-from calibre import preferred_encoding
+from calibre import prints
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log

@@ -15,7 +15,14 @@ from calibre.customize import Plugin
 class MetadataSource(Plugin):

     author = 'Kovid Goyal'
     supported_platforms = ['windows', 'osx', 'linux']
+
+    #: The type of metadata fetched. 'basic' means basic metadata like
+    #: title/author/isbn/etc. 'social' means social metadata like
+    #: tags/rating/reviews/etc.
+    metadata_type = 'basic'
+
     type = _('Metadata download')

     def __call__(self, title, author, publisher, isbn, verbose, log=None,

@@ -49,6 +56,7 @@ class MetadataSource(Plugin):
     def join(self):
         return self.worker.join()
+
 class GoogleBooks(MetadataSource):

     name = 'Google Books'
@@ -104,6 +112,22 @@ class ISBNDB(MetadataSource):
             ans = ans.replace('%s', '')
         return ans

+class Amazon(MetadataSource):
+
+    name = 'Amazon'
+    metadata_type = 'social'
+
+    def fetch(self):
+        if not self.isbn:
+            return
+        from calibre.ebooks.metadata.amazon import get_social_metadata
+        try:
+            self.results = get_social_metadata(self.title, self.author,
+                    self.publisher, self.isbn)
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
 def result_index(source, result):
     if not result.isbn:
         return -1
@@ -134,16 +158,56 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
         fetcher(title, author, publisher, isbn, verbose)
     for fetcher in fetchers:
         fetcher.join()
+
+    results = list(fetchers[0].results)
     for fetcher in fetchers[1:]:
-        merge_results(fetchers[0].results, fetcher.results)
+        merge_results(results, fetcher.results)

-    results = sorted(fetchers[0].results, cmp=lambda x, y : cmp(
+    results = sorted(results, cmp=lambda x, y : cmp(
            (x.comments.strip() if x.comments else ''),
            (y.comments.strip() if y.comments else '')
            ), reverse=True)

     return results, [(x.name, x.exception, x.tb) for x in fetchers]

+def get_social_metadata(mi, verbose=0):
+    from calibre.customize.ui import metadata_sources
+    fetchers = list(metadata_sources(metadata_type='social'))
+    for fetcher in fetchers:
+        fetcher(mi.title, mi.authors, mi.publisher, mi.isbn, verbose)
+    for fetcher in fetchers:
+        fetcher.join()
+    ratings, tags, comments = [], set([]), set([])
+    for fetcher in fetchers:
+        if fetcher.results:
+            dmi = fetcher.results
+            if dmi.rating is not None:
+                ratings.append(dmi.rating)
+            if dmi.tags:
+                for t in dmi.tags:
+                    tags.add(t)
+            if mi.pubdate is None and dmi.pubdate is not None:
+                mi.pubdate = dmi.pubdate
+            if dmi.comments:
+                comments.add(dmi.comments)
+    if ratings:
+        rating = sum(ratings)/float(len(ratings))
+        if mi.rating is None:
+            mi.rating = rating
+        else:
+            mi.rating = (mi.rating + rating)/2.0
+    if tags:
+        if not mi.tags:
+            mi.tags = []
+        mi.tags += list(tags)
+        mi.tags = list(sorted(list(set(mi.tags))))
+    if comments:
+        mi.comments = ''
+        for x in comments:
+            mi.comments += '\n\n'+x
+    return [(x.name, x.exception, x.tb) for x in fetchers]
+
 def option_parser():
     parser = OptionParser(textwrap.dedent(
@@ -174,11 +238,13 @@ def main(args=sys.argv):
     opts, args = parser.parse_args(args)
     results, exceptions = search(opts.title, opts.author, opts.publisher,
                                  opts.isbn, opts.isbndb_key, opts.verbose)
+    social_exceptions = []
     for result in results:
-        print unicode(result).encode(preferred_encoding)
+        social_exceptions.extend(get_social_metadata(result, opts.verbose))
+        prints(unicode(result))
         print

-    for name, exception, tb in exceptions:
+    for name, exception, tb in exceptions+social_exceptions:
         if exception is not None:
             print 'WARNING: Fetching from', name, 'failed with error:'
             print exception
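A minimal sketch of how the new aggregator is meant to be driven, mirroring the main() change above (the ISBN is reused from the Amazon module's test value):

    from calibre.ebooks.metadata.fetch import search, get_social_metadata

    results, exceptions = search(isbn='9781416551720')
    social_exceptions = []
    for mi in results:
        # Merges rating/tags/pubdate/comments from all 'social' sources into mi in place
        social_exceptions.extend(get_social_metadata(mi))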

View File

@@ -135,7 +135,11 @@ class ResultList(list):
     def get_tags(self, entry, verbose):
         try:
-            tags = [x.text for x in subject(entry)]
+            btags = [x.text for x in subject(entry)]
+            tags = []
+            for t in btags:
+                tags.extend([y.strip() for y in t.split('/')])
+            tags = list(sorted(list(set(tags))))
         except:
             report(verbose)
             tags = []
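For instance, the new splitting turns a single Google Books subject entry into several tags; a standalone illustration with made-up subject strings:

    btags = ['Fiction / Science Fiction', 'Fiction']
    tags = []
    for t in btags:
        tags.extend([y.strip() for y in t.split('/')])
    tags = list(sorted(list(set(tags))))
    # Prints: ['Fiction', 'Science Fiction']
    print tags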

View File

@@ -125,7 +125,16 @@ def create_books(opts, args, timeout=5.):
     if opts.verbose:
         print ('ISBNDB query: '+url)

-    return [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
+    tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
+    ans = []
+    for x in tans:
+        add = True
+        for y in ans:
+            if y.isbn == x.isbn:
+                add = False
+        if add:
+            ans.append(x)
+    return ans

 def main(args=sys.argv):
     parser = option_parser()
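The returned list is now de-duplicated by ISBN; a hypothetical standalone illustration of the loop above, with Record standing in for ISBNDBMetadata and made-up ISBNs:

    class Record(object):
        def __init__(self, isbn):
            self.isbn = isbn

    tans = [Record('9781416551720'), Record('9781416551720'), Record('9780000000000')]
    ans = []
    for x in tans:
        add = True
        for y in ans:
            if y.isbn == x.isbn:
                add = False
        if add:
            ans.append(x)
    # Prints: ['9781416551720', '9780000000000']
    print [r.isbn for r in ans]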

View File

@@ -122,6 +122,8 @@ Metadata download plugins
    :class:`MetaInformation` objects. If there is an error, it should be stored
    in `self.exception` and `self.tb` (for the traceback).

+.. automember:: calibre.ebooks.metadata.fetch.MetadataSource.metadata_type
+
 .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.fetch
 .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.is_ok