IGN:Add a plugin to download social metadata (tags, rating, reviews, etc.) from Amazon

2025-07-09 03:04:10 -04:00 · 2009-11-11 17:10:03 -07:00 · 2009-11-11 17:10:03 -07:00 · 7e05464776
commit 7e05464776
parent 289455c1d7
8 changed files with 196 additions and 46 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -193,7 +193,7 @@ def extract(path, dir):
        raise Exception('Unknown archive type')
    extractor(path, dir)

-def get_proxies():
+def get_proxies(debug=True):
    proxies = {}

    for q in ('http', 'ftp'):
@ -226,10 +226,40 @@ def get_proxies():
        if len(proxies[x]) < 5:
            prints('Removing invalid', x, 'proxy:', proxies[x])
            del proxies[x]
-    if proxies:
+    if proxies and debug:
        prints('Using proxies:', proxies)
    return proxies

+def get_parsed_proxy(typ='http', debug=True):
+    '''
+    Return the configured proxy of type `typ` split into its components.
+
+    :param typ: Key to look up in the mapping returned by :func:`get_proxies`
+                (for example ``'http'`` or ``'ftp'``).
+    :param debug: Passed on to :func:`get_proxies`. When true, the parsed
+                  proxy (or the traceback of a parsing failure) is printed.
+    :return: A dict with the keys ``host``, ``port``, ``user`` and ``pass``.
+             ``port`` is an int, or None when the proxy string has no port.
+             ``user`` and ``pass`` are None when the proxy string carries no
+             credentials. None is returned when no proxy of this type is
+             configured or the proxy string cannot be parsed.
+    '''
+    proxies = get_proxies(debug)
+    if typ not in proxies:
+        return
+    # 'ptype' is a placeholder that is substituted with the requested type
+    pattern = re.compile((
+        r'(?:ptype://)?'
+        r'(?:(?P<user>\w+):(?P<pass>.*)@)?'
+        r'(?P<host>[\w\-\.]+)'
+        r'(?::(?P<port>\d+))?').replace('ptype', typ)
+    )
+
+    # Index by the value of typ, not by the literal string 'typ'
+    match = pattern.match(proxies[typ])
+    if match:
+        try:
+            ans = {
+                    'host' : match.group('host'),
+                    'port' : match.group('port'),
+                    'user' : match.group('user'),
+                    'pass' : match.group('pass')
+                }
+            if ans['port']:
+                ans['port'] = int(ans['port'])
+        except Exception:
+            # Best effort: a proxy that cannot be parsed is treated as absent
+            if debug:
+                traceback.print_exc()
+            return
+        if debug:
+            # Report the proxy type that was actually requested
+            prints('Using', typ, 'proxy', ans)
+        return ans
+

 def browser(honor_time=True, max_time=2, mobile_browser=False):
    '''
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -374,8 +374,8 @@ from calibre.devices.eslick.driver import ESLICK
 from calibre.devices.nuut2.driver import NUUT2
 from calibre.devices.iriver.driver import IRIVER_STORY

-from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB
-plugins = [HTML2ZIP, GoogleBooks, ISBNDB]
+from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon
+plugins = [HTML2ZIP, GoogleBooks, ISBNDB, Amazon]
 plugins += [
    ComicInput,
    EPUBInput,
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@ -90,9 +90,10 @@ def output_profiles():
        if isinstance(plugin, OutputProfile):
            yield plugin

-def metadata_sources(customize=True, isbndb_key=None):
+def metadata_sources(metadata_type='basic', customize=True, isbndb_key=None):
    for plugin in _initialized_plugins:
-        if isinstance(plugin, MetadataSource):
+        if isinstance(plugin, MetadataSource) and \
+                plugin.metadata_type == metadata_type:
            if is_disabled(plugin):
                continue
            if customize:
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@ -6,45 +6,83 @@ __docformat__ = 'restructuredtext en'
 '''
 Fetch metadata using Amazon AWS
 '''
-import re
+import sys, re
+from datetime import datetime
+
+from lxml import etree
+from dateutil import parser

 from calibre import browser
+from calibre.ebooks.metadata import MetaInformation, string_to_authors
+
+AWS_NS = 'http://webservices.amazon.com/AWSECommerceService/2005-10-05'
+
+def AWS(tag):
+    '''
+    Return `tag` qualified with the AWS XML namespace, in the
+    ``{namespace}tag`` form.
+    '''
+    qualified = '{%s}%s' % (AWS_NS, tag)
+    return qualified
+
+def check_for_errors(root):
+    '''
+    Raise an Exception if the response tree `root` contains an AWS ``Error``
+    element. The exception message includes the text content of that
+    element. Nothing happens when no such element is found.
+    '''
+    error_node = root.find('.//' + AWS('Error'))
+    if error_node is None:
+        return
+    details = etree.tostring(error_node, method='text', pretty_print=True,
+            encoding=unicode)
+    raise Exception('Failed to get metadata with error: ' + details)
+
+def get_social_metadata(title, authors, publisher, isbn):
+    '''
+    Look up the book identified by `isbn` and return its metadata.
+
+    The response fetched for `isbn` is parsed as AWS XML and used to fill in
+    the title, authors, publisher, publication date, rating, tags and the
+    editorial review (stored as the comments).
+
+    :param title: Passed to the :class:`MetaInformation` constructor.
+    :param authors: Passed to the :class:`MetaInformation` constructor.
+    :param publisher: Not used by this function.
+    :param isbn: ISBN to look up. When it is empty nothing is fetched.
+    :return: A :class:`MetaInformation` object, or None if `isbn` is empty.
+    :raises Exception: If the response contains an AWS ``Error`` element.
+    '''
+    mi = MetaInformation(title, authors)
+    if isbn:
+        br = browser()
+        response_xml = br.open('http://status.calibre-ebook.com/aws/metadata/'+isbn).read()
+        root = etree.fromstring(response_xml)
+        check_for_errors(root)
+        mi.title = root.findtext('.//'+AWS('Title'))
+        authors = [x.text for x in root.findall('.//'+AWS('Author'))]
+        if authors:
+            mi.authors = []
+            for x in authors:
+                mi.authors.extend(string_to_authors(x))
+        mi.publisher = root.findtext('.//'+AWS('Publisher'))
+        try:
+            d = root.findtext('.//'+AWS('PublicationDate'))
+            if d:
+                # Fields missing from the date string are taken from
+                # `default`: the current UTC year and month, and day 15
+                default = datetime.utcnow()
+                default = datetime(default.year, default.month, 15)
+                # findtext() returns the element text itself, so the string
+                # is parsed directly
+                mi.pubdate = parser.parse(d, default=default)
+        except Exception:
+            # Best effort: a date that cannot be parsed is ignored
+            pass
+        try:
+            rating = float(root.findtext('.//'+AWS('AverageRating')))
+            num_of_reviews = int(root.findtext('.//'+AWS('TotalReviews')))
+            # Only accept a rating that is backed by more than 4 reviews
+            # NOTE(review): `rating < 5` also rejects an average of exactly
+            # 5.0 -- confirm that this is intended
+            if num_of_reviews > 4 and rating > 0 and rating < 5:
+                mi.rating = rating
+        except (TypeError, ValueError):
+            # Rating or review count is missing or not a number
+            pass
+        tags = [x.text for x in root.findall('.//%s/%s'%(AWS('Subjects'),
+            AWS('Subject')))]
+        if tags:
+            mi.tags = []
+            for x in tags:
+                # A subject such as "A / B" yields the two tags "A" and "B"
+                mi.tags.extend([y.strip() for y in x.split('/')])
+        comments = root.find('.//%s/%s'%(AWS('EditorialReview'),
+            AWS('Content')))
+        if comments is not None:
+            mi.comments = etree.tostring(comments,
+                    method='text', encoding=unicode)
+            # Convert the markup in the review text to plain text
+            mi.comments = re.sub('<([pP]|DIV)>', '\n\n', mi.comments)
+            mi.comments = re.sub('</?[iI]>', '*', mi.comments)
+            mi.comments = re.sub('</?[bB]>', '**', mi.comments)
+            mi.comments = re.sub('<BR>', '\n\n', mi.comments)
+            mi.comments = re.sub('<[^>]+>', '', mi.comments)
+            mi.comments = mi.comments.strip()
+            mi.comments = _('EDITORIAL REVIEW')+':\n\n'+mi.comments
+
+        return mi


-BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=%(key)s&Operation=ItemLookup&ItemId=1416551727&ResponseGroup=%(group)s'
-
-import sys
-
-def get_rating(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='Reviews')
-    raw = br.open(url).read()
-    match = re.search(r'<AverageRating>([\d.]+)</AverageRating>', raw)
-    if match:
-        return float(match.group(1))
-    
-def get_cover_url(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='Images')
-    raw = br.open(url).read()
-    match = re.search(r'<LargeImage><URL>(.+?)</URL>', raw)
-    if match:
-        return match.group(1)
-
-def get_editorial_review(isbn, key):
-    br = browser()
-    url = BASE_URL%dict(key=key, group='EditorialReview')
-    raw = br.open(url).read()
-    match = re.compile(r'<EditorialReview>.*?<Content>(.+?)</Content>', re.DOTALL).search(raw)
-    if match:
-        return match.group(1)

 def main(args=sys.argv):
-    print 'Rating:', get_rating(args[1], args[2])
-    print 'Cover:', get_rating(args[1], args[2])
-    print 'EditorialReview:', get_editorial_review(args[1], args[2])
-    
+    print get_social_metadata(None, None, None, '9781416551720')
    return 0

 if __name__ == '__main__':
-    sys.exit(main())
+    sys.exit(main())
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en'
 import traceback, sys, textwrap, re
 from threading import Thread

-from calibre import preferred_encoding
+from calibre import prints
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log

@ -15,7 +15,14 @@ from calibre.customize import Plugin
 class MetadataSource(Plugin):

    author = 'Kovid Goyal'
+
    supported_platforms = ['windows', 'osx', 'linux']
+
+    #: The type of metadata fetched. 'basic' means basic metadata like
+    #: title/author/isbn/etc. 'social' means social metadata like
+    #: tags/rating/reviews/etc.
+    metadata_type = 'basic'
+
    type = _('Metadata download')

    def __call__(self, title, author, publisher, isbn, verbose, log=None,
@ -49,6 +56,7 @@ class MetadataSource(Plugin):
    def join(self):
        return self.worker.join()

+
 class GoogleBooks(MetadataSource):

    name = 'Google Books'
@ -104,6 +112,22 @@ class ISBNDB(MetadataSource):
            ans = ans.replace('%s', '')
        return ans

+class Amazon(MetadataSource):
+    '''
+    Metadata source of type 'social' that looks a book up by ISBN via
+    :func:`calibre.ebooks.metadata.amazon.get_social_metadata`.
+    '''
+
+    name = 'Amazon'
+    metadata_type = 'social'
+
+    def fetch(self):
+        '''
+        Store the metadata found for ``self.isbn`` in ``self.results``.
+        Nothing is done without an ISBN. A failed lookup is recorded in
+        ``self.exception`` and ``self.tb`` instead of being raised.
+        '''
+        if self.isbn:
+            from calibre.ebooks.metadata.amazon import get_social_metadata
+            try:
+                found = get_social_metadata(self.title, self.author,
+                        self.publisher, self.isbn)
+                self.results = found
+            except Exception, e:
+                self.exception = e
+                self.tb = traceback.format_exc()
+
 def result_index(source, result):
    if not result.isbn:
        return -1
@ -134,16 +158,56 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
        fetcher(title, author, publisher, isbn, verbose)
    for fetcher in fetchers:
        fetcher.join()
+    results = list(fetchers[0].results)
    for fetcher in fetchers[1:]:
-        merge_results(fetchers[0].results, fetcher.results)
+        merge_results(results, fetcher.results)

-    results = sorted(fetchers[0].results, cmp=lambda x, y : cmp(
+    results = sorted(results, cmp=lambda x, y : cmp(
            (x.comments.strip() if x.comments else ''),
            (y.comments.strip() if y.comments else '')
                                                  ), reverse=True)

    return results, [(x.name, x.exception, x.tb) for x in fetchers]

+def get_social_metadata(mi, verbose=0):
+    '''
+    Update `mi` in place with the results of all metadata sources of type
+    'social'.
+
+    Every source is called with the title, authors, publisher and isbn of
+    `mi` and then joined. The ratings found are averaged (and averaged again
+    with an existing rating of `mi`), the tags found are merged with the
+    tags of `mi`, the first publication date found is used if `mi` has none,
+    and the comments of `mi` are replaced by the comments found.
+
+    :param mi: Metadata object to look up and to update.
+    :param verbose: Passed on to every source.
+    :return: A list of ``(name, exception, traceback)`` tuples, one per
+             source.
+    '''
+    from calibre.customize.ui import metadata_sources
+    sources = list(metadata_sources(metadata_type='social'))
+    for source in sources:
+        source(mi.title, mi.authors, mi.publisher, mi.isbn, verbose)
+    for source in sources:
+        source.join()
+
+    ratings = []
+    tags = set([])
+    comments = set([])
+    for source in sources:
+        dmi = source.results
+        if not dmi:
+            continue
+        if dmi.rating is not None:
+            ratings.append(dmi.rating)
+        if dmi.tags:
+            tags.update(dmi.tags)
+        if mi.pubdate is None and dmi.pubdate is not None:
+            mi.pubdate = dmi.pubdate
+        if dmi.comments:
+            comments.add(dmi.comments)
+
+    if ratings:
+        average = sum(ratings)/float(len(ratings))
+        mi.rating = average if mi.rating is None else (mi.rating + average)/2.0
+    if tags:
+        if not mi.tags:
+            mi.tags = []
+        mi.tags += list(tags)
+        # Remove duplicates and sort
+        mi.tags = sorted(set(mi.tags))
+    if comments:
+        # Every comment is preceded by a blank line
+        mi.comments = ''.join(['\n\n'+x for x in comments])
+
+    return [(src.name, src.exception, src.tb) for src in sources]
+
+

 def option_parser():
    parser = OptionParser(textwrap.dedent(
@ -174,11 +238,13 @@ def main(args=sys.argv):
    opts, args = parser.parse_args(args)
    results, exceptions = search(opts.title, opts.author, opts.publisher,
                                 opts.isbn, opts.isbndb_key, opts.verbose)
+    social_exceptions = []
    for result in results:
-        print unicode(result).encode(preferred_encoding)
+        social_exceptions.extend(get_social_metadata(result, opts.verbose))
+        prints(unicode(result))
        print

-    for name, exception, tb in exceptions:
+    for name, exception, tb in exceptions+social_exceptions:
        if exception is not None:
            print 'WARNING: Fetching from', name, 'failed with error:'
            print exception
--- a/src/calibre/ebooks/metadata/google_books.py
+++ b/src/calibre/ebooks/metadata/google_books.py
@ -135,7 +135,11 @@ class ResultList(list):

    def get_tags(self, entry, verbose):
        try:
-            tags = [x.text for x in subject(entry)]
+            btags = [x.text for x in subject(entry)]
+            tags = []
+            for t in btags:
+                tags.extend([y.strip() for y in t.split('/')])
+            tags = list(sorted(list(set(tags))))
        except:
            report(verbose)
            tags = []
--- a/src/calibre/ebooks/metadata/isbndb.py
+++ b/src/calibre/ebooks/metadata/isbndb.py
@ -125,7 +125,16 @@ def create_books(opts, args, timeout=5.):
    if opts.verbose:
        print ('ISBNDB query: '+url)

-    return [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
+    tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
+    ans = []
+    for x in tans:
+        add = True
+        for y in ans:
+            if y.isbn == x.isbn:
+                add = False
+        if add:
+            ans.append(x)
+    return ans

 def main(args=sys.argv):
    parser = option_parser()
--- a/src/calibre/manual/plugins.rst
+++ b/src/calibre/manual/plugins.rst
@ -122,6 +122,8 @@ Metadata download plugins
    :class:`MetaInformation` objects. If there is an error, it should be stored
    in `self.exception` and `self.tb` (for the traceback).

+.. automember:: calibre.ebooks.metadata.fetch.MetadataSource.metadata_type
+
 .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.fetch

 .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.is_ok