Beginning of the new amazon metadata download plugin

2025-08-11 09:13:57 -04:00 · 2011-03-15 14:39:03 -06:00 · 2011-03-15 14:39:03 -06:00 · 2f4876f474
commit 2f4876f474
parent b7a92e7e3e
5 changed files with 260 additions and 10 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -3,7 +3,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import uuid, sys, os, re, logging, time, \
+import uuid, sys, os, re, logging, time, random, \
       __builtin__, warnings, multiprocessing
 from urllib import getproxies
 __builtin__.__dict__['dynamic_property'] = lambda(func): func(None)
@ -268,6 +268,17 @@ def get_parsed_proxy(typ='http', debug=True):
                    prints('Using http proxy', str(ans))
                return ans

+def random_user_agent():
+    '''
+    Return one of a fixed set of browser User-Agent strings, chosen at
+    random.
+    '''
+    # Every entry needs its trailing comma: adjacent string literals are
+    # concatenated by the parser, which would collapse the list into a
+    # single, meaningless User-Agent string.
+    choices = [
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+    ]
+    return random.choice(choices)
+

 def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
    '''
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -1031,7 +1031,8 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions,

 # New metadata download plugins {{{
 from calibre.ebooks.metadata.sources.google import GoogleBooks
+from calibre.ebooks.metadata.sources.amazon import Amazon

-plugins += [GoogleBooks]
+plugins += [GoogleBooks, Amazon]

 # }}}
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -7,16 +7,231 @@ __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

+import socket, time
+from urllib import urlencode
+from threading import Thread

+from lxml.html import soupparser, tostring
+
+from calibre import as_unicode
+from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.ebooks.chardet import xml_to_unicode
+
+class Worker(Thread):
+    '''
+    Thread that downloads a single Amazon details page, parses it into an
+    lxml tree and hands the tree to :meth:`parse_details`.
+    '''
+
+    def __init__(self, url, result_queue, browser, log, timeout=10):
+        '''
+        :param url: URL of the details page to fetch
+        :param result_queue: queue shared with the caller (stored only, not
+                             yet used by this class)
+        :param browser: browser object; a clone of it is used by this thread
+        :param log: log object used to report errors
+        :param timeout: timeout passed to the page download
+        '''
+        # A Thread subclass that overrides __init__ must call the base
+        # constructor, otherwise start() refuses to run the thread.
+        Thread.__init__(self)
+        self.url, self.result_queue = url, result_queue
+        self.log, self.timeout = log, timeout
+        self.browser = browser.clone_browser()
+        self.cover_url = self.amazon_id = None
+
+    def run(self):
+        try:
+            self.get_details()
+        except:
+            # Log with the traceback so unexpected failures can be diagnosed
+            self.log.exception('get_details failed for url: %r'%self.url)
+
+    def get_details(self):
+        '''
+        Download and parse the details page at ``self.url``. Every failure
+        is logged and causes an early return; on success the parsed tree is
+        passed to :meth:`parse_details`.
+        '''
+        try:
+            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
+        except Exception, e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                self.log.error('URL malformed: %r'%self.url)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                msg = 'Amazon timed out. Try again later.'
+                self.log.error(msg)
+            else:
+                msg = 'Failed to make details query: %r'%self.url
+                self.log.exception(msg)
+            return
+
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+
+        if '<title>404 - ' in raw:
+            self.log.error('URL malformed: %r'%self.url)
+            return
+
+        try:
+            root = soupparser.fromstring(clean_ascii_chars(raw))
+        except:
+            msg = 'Failed to parse amazon details page: %r'%self.url
+            self.log.exception(msg)
+            return
+
+        errmsg = root.xpath('//*[@id="errorMessage"]')
+        if errmsg:
+            msg = 'Failed to parse amazon details page: %r'%self.url
+            # xpath() returns a list of elements; serialize the first match
+            msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
+            self.log.error(msg)
+            return
+
+        self.parse_details(root)
+
+    def parse_details(self, root):
+        # Placeholder: extraction of metadata from the page is not
+        # implemented yet
+        pass
+

 class Amazon(Source):

    name = 'Amazon'
    description = _('Downloads metadata from Amazon')

-    capabilities = frozenset(['identify', 'cover'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate',
-        'comments', 'cover_data'])
+    capabilities = frozenset(['identify'])
+    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
+
+    # Amazon site domain suffixes mapped to translated country names
+    AMAZON_DOMAINS = {
+            'com': _('US'),
+            'fr' : _('France'),
+            'de' : _('Germany'),
+    }
+
+    def create_query(self, log, title=None, authors=None, identifiers={}):
+        '''
+        Build the Amazon search URL for the given metadata.
+
+        An ``amazon`` identifier is preferred, then a valid ``isbn``, then
+        title plus first author. Returns the URL as a string, or None when
+        there is not enough metadata to build a query.
+        '''
+        domain = self.prefs.get('domain', 'com')
+
+        # See the amazon detailed search page to get all options
+        q = {   'search-alias' : 'aps',
+                'unfiltered' : '1',
+            }
+
+        if domain == 'com':
+            q['sort'] = 'relevanceexprank'
+        else:
+            q['sort'] = 'relevancerank'
+
+        asin = identifiers.get('amazon', None)
+        isbn = check_isbn(identifiers.get('isbn', None))
+
+        if asin is not None:
+            q['field-keywords'] = asin
+        elif isbn is not None:
+            q['field-isbn'] = isbn
+        else:
+            # Only return book results
+            q['search-alias'] = 'stripbooks'
+            if title:
+                title_tokens = list(self.get_title_tokens(title))
+                if title_tokens:
+                    q['field-title'] = ' '.join(title_tokens)
+            if authors:
+                author_tokens = self.get_author_tokens(authors,
+                        only_first_author=True)
+                if author_tokens:
+                    q['field-author'] = ' '.join(author_tokens)
+
+        if not ('field-keywords' in q or 'field-isbn' in q or
+                ('field-title' in q and 'field-author' in q)):
+            # Insufficient metadata to make an identify query
+            return None
+
+        utf8q = dict([(x.encode('utf-8'), y.encode('utf-8')) for x, y in
+            q.iteritems()])
+        url = 'http://www.amazon.%s/s/?'%domain + urlencode(utf8q)
+        return url
+
+
+    def identify(self, log, result_queue, abort, title=None, authors=None,
+            identifiers={}, timeout=10):
+        '''
+        Run a search on Amazon and start one :class:`Worker` per matching
+        result (at most five) to fetch its details page, then wait until all
+        workers finish or ``abort`` is set.
+
+        Returns None when the search completed (including when nothing was
+        found) and an error message when the search request failed or its
+        result page could not be parsed.
+        '''
+        query = self.create_query(log, title=title, authors=authors,
+                identifiers=identifiers)
+        if query is None:
+            log.error('Insufficient metadata to construct query')
+            return
+        br = self.browser
+        try:
+            raw = br.open_novisit(query, timeout=timeout).read().strip()
+        except Exception, e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                log.error('Query malformed: %r'%query)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                msg = _('Amazon timed out. Try again later.')
+                log.error(msg)
+            else:
+                msg = 'Failed to make identify query: %r'%query
+                log.exception(msg)
+            return as_unicode(msg)
+
+
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+
+        if '<title>404 - ' in raw:
+            log.error('No matches found for query: %r'%query)
+            return
+
+        try:
+            root = soupparser.fromstring(clean_ascii_chars(raw))
+        except:
+            msg = 'Failed to parse amazon page for query: %r'%query
+            log.exception(msg)
+            return msg
+
+        errmsg = root.xpath('//*[@id="errorMessage"]')
+        if errmsg:
+            # xpath() returns a list of elements; serialize the first match
+            msg = tostring(errmsg[0], method='text', encoding=unicode).strip()
+            log.error(msg)
+            # The error is almost always a not found error
+            return
+
+        matches = []
+        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+            # Only the first title link of each result is considered
+            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                link_text = tostring(a, method='text', encoding=unicode).lower()
+                if 'bulk pack' not in link_text:
+                    matches.append(a.get('href'))
+                break
+
+        # Keep only the top 5 matches as the matches are sorted by relevance by
+        # Amazon so lower matches are not likely to be very relevant
+        matches = matches[:5]
+
+        if not matches:
+            log.error('No matches found with query: %r'%query)
+            return
+
+        # Forward the caller's timeout so the detail downloads honour it too
+        workers = [Worker(url, result_queue, br, log, timeout=timeout)
+                for url in matches]
+
+        for w in workers:
+            w.start()
+            # Don't send all requests at the same time
+            time.sleep(0.1)
+
+        # Poll the workers in short joins so that abort is noticed promptly
+        while not abort.is_set():
+            a_worker_is_alive = False
+            for w in workers:
+                w.join(0.2)
+                if abort.is_set():
+                    break
+                if w.is_alive():
+                    a_worker_is_alive = True
+            if not a_worker_is_alive:
+                break
+
+        return None
+
+
+if __name__ == '__main__':
+    # To run these tests use: calibre-debug -e
+    # src/calibre/ebooks/metadata/sources/amazon.py
+    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
+            title_test)
+    # Each entry pairs the identify() keyword arguments with the checks that
+    # are passed to the test harness for that query
+    tests = [
+        (
+            {'identifiers':{'isbn': '0743273567'}},
+            [title_test('The great gatsby', exact=True)]
+        ),
+    ]
+    test_identify_plugin(Amazon.name, tests)


--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -9,8 +9,12 @@ __docformat__ = 'restructuredtext en'

 import re, threading

+from calibre import browser, random_user_agent
 from calibre.customize import Plugin
 from calibre.utils.logging import ThreadSafeLog, FileStream
+from calibre.utils.config import JSONConfig
+
+msprefs = JSONConfig('metadata_sources.json')

 def create_log(ostream=None):
    log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@ -24,8 +28,6 @@ class Source(Plugin):

    supported_platforms = ['windows', 'osx', 'linux']

-    result_of_identify_is_complete = True
-
    capabilities = frozenset()

    touched_fields = frozenset()
@ -34,6 +36,27 @@ class Source(Plugin):
        Plugin.__init__(self, *args, **kwargs)
        self._isbn_to_identifier_cache = {}
        self.cache_lock = threading.RLock()
+        self._config_obj = None
+        self._browser = None
+
+    # Configuration {{{
+
+    @property
+    def prefs(self):
+        '''
+        Configuration object of this plugin: a JSONConfig named
+        ``metadata_sources/<plugin name>.json``, created on first access and
+        reused afterwards.
+        '''
+        conf = self._config_obj
+        if conf is None:
+            conf = self._config_obj = JSONConfig(
+                    'metadata_sources/%s.json'%self.name)
+        return conf
+    # }}}
+
+    # Browser {{{
+
+    @property
+    def browser(self):
+        '''
+        Browser object of this plugin, created on first access with a
+        randomly chosen user agent and reused afterwards.
+        '''
+        br = self._browser
+        if br is None:
+            br = self._browser = browser(user_agent=random_user_agent())
+        return br
+
+    # }}}

    # Utility functions {{{

--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@ -19,7 +19,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.date import parse_date, utcnow
 from calibre.utils.cleantext import clean_ascii_chars
-from calibre import browser, as_unicode
+from calibre import as_unicode

 NAMESPACES = {
              'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
@ -150,7 +150,7 @@ class GoogleBooks(Source):

    def create_query(self, log, title=None, authors=None, identifiers={}):
        BASE_URL = 'http://books.google.com/books/feeds/volumes?'
-        isbn = identifiers.get('isbn', None)
+        isbn = check_isbn(identifiers.get('isbn', None))
        q = ''
        if isbn is not None:
            q += 'isbn:'+isbn
@ -212,7 +212,7 @@ class GoogleBooks(Source):
            identifiers={}, timeout=5):
        query = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
-        br = browser()
+        # Source.browser is a property that returns the browser object; it
+        # must not be called
+        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception, e: