...

2025-07-09 03:04:10 -04:00 · 2011-03-16 17:42:42 -06:00 · 2011-03-16 17:42:42 -06:00 · cc14a6a657
commit cc14a6a657
parent 68f63e807a
4 changed files with 134 additions and 43 deletions
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@ -633,10 +633,6 @@ class Metadata(object):
            fmt('Publisher', self.publisher)
        if getattr(self, 'book_producer', False):
            fmt('Book Producer', self.book_producer)
-        if self.comments:
-            fmt('Comments', self.comments)
-        if self.isbn:
-            fmt('ISBN', self.isbn)
        if self.tags:
            fmt('Tags', u', '.join([unicode(t) for t in self.tags]))
        if self.series:
@ -654,6 +650,9 @@ class Metadata(object):
        if self.identifiers:
            fmt('Identifiers', u', '.join(['%s:%s'%(k, v) for k, v in
                self.identifiers.iteritems()]))
+        if self.comments:
+            fmt('Comments', self.comments)
+
        for key in self.custom_field_keys():
            val = self.get(key, None)
            if val:
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -20,6 +20,7 @@ from calibre.utils.cleantext import clean_ascii_chars
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.library.comments import sanitize_comments_html
+from calibre.utils.date import parse_date

 class Worker(Thread):

@ -28,10 +29,12 @@ class Worker(Thread):
    '''

    def __init__(self, url, result_queue, browser, log, timeout=20):
+        Thread.__init__(self)
+        self.daemon = True
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.browser = browser.clone_browser()
-        self.cover_url = self.amazon_id = None
+        self.cover_url = self.amazon_id = self.isbn = None

    def run(self):
        try:
@ -111,7 +114,7 @@ class Worker(Thread):
        self.amazon_id = asin

        try:
-            mi.rating = self.parse_ratings(root)
+            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

@ -125,6 +128,37 @@ class Worker(Thread):
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)

+        pd = root.xpath('//h2[text()="Product Details"]/../div[@class="content"]')
+        if pd:
+            pd = pd[0]
+
+            try:
+                isbn = self.parse_isbn(pd)
+                if isbn:
+                    self.isbn = mi.isbn = isbn
+            except:
+                self.log.exception('Error parsing ISBN for url: %r'%self.url)
+
+            try:
+                mi.publisher = self.parse_publisher(pd)
+            except:
+                self.log.exception('Error parsing publisher for url: %r'%self.url)
+
+            try:
+                mi.pubdate = self.parse_pubdate(pd)
+            except:
+                self.log.exception('Error parsing publish date for url: %r'%self.url)
+
+            try:
+                lang = self.parse_language(pd)
+                if lang:
+                    mi.language = lang
+            except:
+                self.log.exception('Error parsing language for url: %r'%self.url)
+
+        else:
+            self.log.warning('Failed to find product description for url: %r'%self.url)
+
        self.result_queue.put(mi)

    def parse_asin(self, root):
@ -140,27 +174,23 @@ class Worker(Thread):
                    method='text').strip()
        else:
            title = tostring(tdiv, encoding=unicode, method='text').strip()
-        return re.sub(r'[([].*[)]]', '', title).strip()
+        return re.sub(r'[(\[].*[)\]]', '', title).strip()

    def parse_authors(self, root):
-        bdiv = root.xpath('//div[@class="buying"]')[0]
-        aname = bdiv.xpath('descendant::span[@class="contributorNameTrigger"]')
+        aname = root.xpath('//span[@class="contributorNameTrigger"]')
        authors = [tostring(x, encoding=unicode, method='text').strip() for x
                in aname]
        return authors

-    def parse_ratings(self, root):
-        ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
+    def parse_rating(self, root):
+        ratings = root.xpath('//div[@class="jumpBar"]/descendant::span[@class="asinReviewsSummary"]')
        pat = re.compile(r'([0-9.]+) out of (\d+) stars')
        if ratings:
            for elem in ratings[0].xpath('descendant::*[@title]'):
-                t = elem.get('title')
+                t = elem.get('title').strip()
                m = pat.match(t)
                if m is not None:
-                    try:
-                        return float(m.group(1))/float(m.group(2)) * 5
-                    except:
-                        pass
+                    return float(m.group(1))/float(m.group(2)) * 5

    def parse_comments(self, root):
        desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
@ -193,6 +223,37 @@ class Worker(Thread):
                    bn = sparts[0] + sparts[-1]
                    return ('/'.join(parts[:-1]))+'/'+bn

+    def parse_isbn(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "ISBN")]')):
+            if x.tail:
+                ans = check_isbn(x.tail.strip())
+                if ans:
+                    return ans
+
+    def parse_publisher(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail.partition(';')[0]
+                return ans.partition('(')[0].strip()
+
+    def parse_pubdate(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Publisher:")]')):
+            if x.tail:
+                ans = x.tail
+                date = ans.partition('(')[-1].replace(')', '').strip()
+                return parse_date(date, assume_utc=True)
+
+    def parse_language(self, pd):
+        for x in reversed(pd.xpath(
+            'descendant::*[starts-with(text(), "Language:")]')):
+            if x.tail:
+                ans = x.tail.strip()
+                if ans == 'English':
+                    return 'en'
+

 class Amazon(Source):

@ -200,7 +261,8 @@ class Amazon(Source):
    description = _('Downloads metadata from Amazon')

    capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'pubdate', 'comments'])
+    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
+        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])

    AMAZON_DOMAINS = {
            'com': _('US'),
@ -254,6 +316,10 @@ class Amazon(Source):

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=20):
+        '''
+        Note this method will retry without identifiers automatically if no
+        match is found with identifiers.
+        '''
        query = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if query is None:
@ -281,37 +347,45 @@ class Amazon(Source):
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]

-        if '<title>404 - ' in raw:
-            log.error('No matches found for query: %r'%query)
-            return
-
-        try:
-            root = soupparser.fromstring(clean_ascii_chars(raw))
-        except:
-            msg = 'Failed to parse amazon page for query: %r'%query
-            log.exception(msg)
-            return msg
-
-        errmsg = root.xpath('//*[@id="errorMessage"]')
-        if errmsg:
-            msg = tostring(errmsg, method='text', encoding=unicode).strip()
-            log.error(msg)
-            # The error is almost always a not found error
-            return
-
        matches = []
-        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
-            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
-                title = tostring(a, method='text', encoding=unicode).lower()
-                if 'bulk pack' not in title:
-                    matches.append(a.get('href'))
-                break
+        found = '<title>404 - ' not in raw
+
+        if found:
+            try:
+                root = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                msg = 'Failed to parse amazon page for query: %r'%query
+                log.exception(msg)
+                return msg
+
+                errmsg = root.xpath('//*[@id="errorMessage"]')
+                if errmsg:
+                    msg = tostring(errmsg, method='text', encoding=unicode).strip()
+                    log.error(msg)
+                    # The error is almost always a not found error
+                    found = False
+
+        if found:
+            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+                for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                    title = tostring(a, method='text', encoding=unicode).lower()
+                    if 'bulk pack' not in title:
+                        matches.append(a.get('href'))
+                    break

        # Keep only the top 5 matches as the matches are sorted by relevance by
        # Amazon so lower matches are not likely to be very relevant
        matches = matches[:5]

+        if abort.is_set():
+            return
+
        if not matches:
+            if identifiers and title and authors:
+                self.log('No matches found with identifiers, retrying using only'
+                        ' title and authors')
+                return self.identify(log, result_queue, abort, title=title,
+                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r'%query)
            return

@ -333,6 +407,14 @@ class Amazon(Source):
            if not a_worker_is_alive:
                break

+        for w in workers:
+            if w.amazon_id:
+                if w.isbn:
+                    self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
+                if w.cover_url:
+                    self.cache_identifier_to_cover_url(w.amazon_id,
+                            w.cover_url)
+
        return None


--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -35,6 +35,7 @@ class Source(Plugin):
    def __init__(self, *args, **kwargs):
        Plugin.__init__(self, *args, **kwargs)
        self._isbn_to_identifier_cache = {}
+        self._identifier_to_cover_url_cache = {}
        self.cache_lock = threading.RLock()
        self._config_obj = None
        self._browser = None
@ -68,6 +69,14 @@ class Source(Plugin):
        with self.cache_lock:
            return self._isbn_to_identifier_cache.get(isbn, None)

+    def cache_identifier_to_cover_url(self, id_, url):
+        with self.cache_lock:
+            self._identifier_to_cover_url_cache[id_] = url
+
+    def cached_identifier_to_cover_url(self, id_):
+        with self.cache_lock:
+            return self._identifier_to_cover_url_cache.get(id_, None)
+
    def get_author_tokens(self, authors, only_first_author=True):
        '''
        Take a list of authors and return a list of tokens useful for an
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@ -145,8 +145,9 @@ class GoogleBooks(Source):
    description = _('Downloads metadata from Google Books')

    capabilities = frozenset(['identify'])
-    touched_fields = frozenset(['title', 'authors', 'isbn', 'tags', 'pubdate',
-        'comments', 'publisher', 'author_sort']) # language currently disabled
+    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
+        'comments', 'publisher', 'author_sort', 'identifier:isbn',
+        'identifier:google']) # language currently disabled

    def create_query(self, log, title=None, authors=None, identifiers={}):
        BASE_URL = 'http://books.google.com/books/feeds/volumes?'