From 3a2daf39e333e5dff40e214b1e1c04b96859eb62 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 13 Feb 2011 15:47:28 -0700
Subject: [PATCH] Replace LibraryThing cover download plugin with a new plugin
 to download covers from Amazon

---
 src/calibre/customize/builtins.py       |   4 +-
 src/calibre/ebooks/metadata/__init__.py |   2 +
 src/calibre/ebooks/metadata/amazon.py   | 105 +++++++++++++++++++++---
 src/calibre/ebooks/metadata/covers.py   |  60 +++-----------
 4 files changed, 110 insertions(+), 61 deletions(-)

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 3ccc07040b..1dd575f45b 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -511,14 +511,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-        LibraryThingCovers, DoubanCovers
+        AmazonCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
         KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+        Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers,
         NiceBooksCovers]
 plugins += [
     ComicInput,
diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py
index fcd4491fd3..6078a0aa94 100644
--- a/src/calibre/ebooks/metadata/__init__.py
+++ b/src/calibre/ebooks/metadata/__init__.py
@@ -271,6 +271,8 @@ def check_isbn13(isbn):
     return None
 
 def check_isbn(isbn):
+    if not isbn:
+        return None
     isbn = re.sub(r'[^0-9X]', '', isbn.upper())
     if len(isbn) == 10:
         return check_isbn10(isbn)
diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py
index cf96c9732c..98a2ac6d36 100644
--- a/src/calibre/ebooks/metadata/amazon.py
+++ b/src/calibre/ebooks/metadata/amazon.py
@@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
 Fetch metadata using Amazon AWS
 '''
 import sys, re
+from threading import RLock
 
 from lxml import html
 from lxml.html import soupparser
@@ -17,6 +18,10 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.library.comments import sanitize_comments_html
 
+asin_cache = {} # ISBN -> ASIN lookup cache used by to_asin(); False marks a failed lookup
+cover_url_cache = {} # ISBN -> cover URL cache used by get_cover_url(); False marks a miss
+cache_lock = RLock() # guards access to the two caches above
+
 def find_asin(br, isbn):
     q = 'http://www.amazon.com/s?field-keywords='+isbn
     raw = br.open_novisit(q).read()
@@ -29,6 +34,12 @@ def find_asin(br, isbn):
         return revs[0]
 
 def to_asin(br, isbn):
+    with cache_lock: # consult the cache before hitting the network
+        ans = asin_cache.get(isbn, None)
+    if ans:
+        return ans
+    if ans is False: # an earlier lookup for this ISBN already failed
+        return None
     if len(isbn) == 13:
         try:
             asin = find_asin(br, isbn)
@@ -38,8 +49,11 @@ def to_asin(br, isbn):
             asin = None
     else:
         asin = isbn
+    with cache_lock:
+        asin_cache[isbn] = asin if asin else False # store the resolved ASIN; False records a miss
     return asin
 
+
 def get_social_metadata(title, authors, publisher, isbn):
     mi = Metadata(title, authors)
     if not isbn:
@@ -58,6 +72,68 @@ def get_social_metadata(title, authors, publisher, isbn):
             return mi
     return mi
 
+def get_cover_url(isbn, br): # returns a cover URL for isbn, or None when none is found
+    isbn = check_isbn(isbn)
+    if not isbn:
+        return None
+    with cache_lock:
+        ans = cover_url_cache.get(isbn, None)
+    if ans:
+        return ans
+    if ans is False: # a previous lookup for this ISBN already came up empty
+        return None
+    asin = to_asin(br, isbn)
+    if asin:
+        ans = _get_cover_url(br, asin)
+        if ans:
+            with cache_lock:
+                cover_url_cache[isbn] = ans
+            return ans
+    from calibre.ebooks.metadata.xisbn import xisbn
+    for i in xisbn.get_associated_isbns(isbn): # fall back to the ISBNs xisbn associates with this one
+        asin = to_asin(br, i)
+        if asin:
+            ans = _get_cover_url(br, asin)
+            if ans:
+                with cache_lock: # cache under both the requested and the associated ISBN
+                    cover_url_cache[isbn] = ans
+                    cover_url_cache[i] = ans
+                return ans
+    with cache_lock:
+        cover_url_cache[isbn] = False # remember the miss so later calls return early
+    return None
+
+def _get_cover_url(br, asin):
+    '''Return the cover image URL scraped from the amzn.com page for asin, or
+    None if the page is a 404, cannot be parsed or has no usable image.'''
+    q = 'http://amzn.com/'+asin
+    try:
+        raw = br.open_novisit(q).read()
+    except Exception, e:
+        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
+            return None
+        raise
+    if '<title>404 - ' in raw:
+        return None
+    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
+    try:
+        root = soupparser.fromstring(raw)
+    except Exception: # do not swallow KeyboardInterrupt/SystemExit
+        return None # None, like every other "no cover" exit of this function
+
+    imgs = root.xpath('//img[@id="prodImage" and @src]')
+    if imgs:
+        src = imgs[0].get('src')
+        parts = src.split('/')
+        if len(parts) > 3:
+            bn = parts[-1]
+            sparts = bn.split('_')
+            if len(sparts) > 2:
+                bn = sparts[0] + sparts[-1] # drop the segments between the first and last '_'
+                return ('/'.join(parts[:-1]))+'/'+bn
+    return None
+
+
 def get_metadata(br, asin, mi):
     q = 'http://amzn.com/'+asin
     try:
@@ -111,18 +187,25 @@ def get_metadata(br, asin, mi):
 
 
 def main(args=sys.argv):
-    # Test xisbn
-    print get_social_metadata('Learning Python', None, None, '8324616489')
-    print
+    import tempfile, os
+    tdir = tempfile.gettempdir() # downloaded covers are written here
+    br = browser()
+    for title, isbn in [
+            ('Learning Python', '8324616489'), # Test xisbn
+            ('Angels & Demons', '9781416580829'), # Test sophisticated comment formatting
+            # Random tests
+            ('Star Trek: Destiny: Mere Mortals', '9781416551720'),
+            ('The Great Gatsby', '0743273567'),
+            ]:
+        cpath = os.path.join(tdir, title+'.jpg')
+        curl = get_cover_url(isbn, br)
+        if curl is None: # get_cover_url returns None when it finds no cover
+            print 'No cover found for', title
+        else:
+            open(cpath, 'wb').write(br.open_novisit(curl).read())
+            print 'Cover for', title, 'saved to', cpath
 
-    # Test sophisticated comment formatting
-    print get_social_metadata('Angels & Demons', None, None, '9781416580829')
-    print
-
-    # Random tests
-    print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')
-    print
-    print get_social_metadata('The Great Gatsby', None, None, '0743273567')
+        print get_social_metadata(title, None, None, isbn)
 
     return 0
 
diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py
index 3deb54da10..15e0a05c1e 100644
--- a/src/calibre/ebooks/metadata/covers.py
+++ b/src/calibre/ebooks/metadata/covers.py
@@ -5,7 +5,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import traceback, socket, re, sys
+import traceback, socket, sys
 from functools import partial
 from threading import Thread, Event
 from Queue import Queue, Empty
@@ -15,7 +15,6 @@ import mechanize
 
 from calibre.customize import Plugin
 from calibre import browser, prints
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.constants import preferred_encoding, DEBUG
 
 class CoverDownload(Plugin):
@@ -112,73 +111,38 @@ class OpenLibraryCovers(CoverDownload): # {{{
 
 # }}}
 
-class LibraryThingCovers(CoverDownload): # {{{
+class AmazonCovers(CoverDownload): # {{{
 
-    name = 'librarything.com covers'
-    description = _('Download covers from librarything.com')
+    name = 'amazon.com covers'
+    description = _('Download covers from amazon.com')
     author = 'Kovid Goyal'
 
-    LIBRARYTHING = 'http://www.librarything.com/isbn/'
-
-    def get_cover_url(self, isbn, br, timeout=5.):
-
-        try:
-            src = br.open_novisit('http://www.librarything.com/isbn/'+isbn,
-                    timeout=timeout).read().decode('utf-8', 'replace')
-        except Exception, err:
-            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
-                err = Exception(_('LibraryThing.com timed out. Try again later.'))
-            raise err
-        else:
-            if '/wiki/index.php/HelpThing:Verify' in src:
-                raise Exception('LibraryThing is blocking calibre.')
-            s = BeautifulSoup(src)
-            url = s.find('td', attrs={'class':'left'})
-            if url is None:
-                if s.find('div', attrs={'class':'highloadwarning'}) is not None:
-                    raise Exception(_('Could not fetch cover as server is experiencing high load. Please try again later.'))
-                raise Exception(_('ISBN: %s not found')%isbn)
-            url = url.find('img')
-            if url is None:
-                raise Exception(_('LibraryThing.com server error. Try again later.'))
-            url = re.sub(r'_S[XY]\d+', '', url['src'])
-            return url
 
     def has_cover(self, mi, ans, timeout=5.):
-        return False
-        if not mi.isbn or not self.site_customization:
+        if not mi.isbn: # nothing to look up without an ISBN
             return False
-        from calibre.ebooks.metadata.library_thing import get_browser, login
-        br = get_browser()
-        un, _, pw = self.site_customization.partition(':')
-        login(br, un, pw)
+        from calibre.ebooks.metadata.amazon import get_cover_url
+        br = browser()
         try:
-            self.get_cover_url(mi.isbn, br, timeout=timeout)
+            if not get_cover_url(mi.isbn, br): return False # None means no cover: do not set ans
             self.debug('cover for', mi.isbn, 'found')
             ans.set()
         except Exception, e:
             self.debug(e)
 
     def get_covers(self, mi, result_queue, abort, timeout=5.):
-        if not mi.isbn or not self.site_customization:
+        if not mi.isbn: # nothing to look up without an ISBN
             return
-        from calibre.ebooks.metadata.library_thing import get_browser, login
-        br = get_browser()
-        un, _, pw = self.site_customization.partition(':')
-        login(br, un, pw)
+        from calibre.ebooks.metadata.amazon import get_cover_url
+        br = browser()
         try:
-            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+            url = get_cover_url(mi.isbn, br) # NOTE(review): None when no cover is found; the open below then presumably raises
             cover_data = br.open_novisit(url).read()
             result_queue.put((True, cover_data, 'jpg', self.name))
         except Exception, e:
             result_queue.put((False, self.exception_to_string(e),
                 traceback.format_exc(), self.name))
 
-    def customization_help(self, gui=False):
-        ans = _('To use librarything.com you must sign up for a %sfree account%s '
-                'and enter your username and password separated by a : below.')
-        return '<p>'+ans%('<a href="http://www.librarything.com">', '</a>')
-
 # }}}
 
 def check_for_cover(mi, timeout=5.): # {{{