From 3a2daf39e333e5dff40e214b1e1c04b96859eb62 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 13 Feb 2011 15:47:28 -0700 Subject: [PATCH] Replace LibraryThing cover download plugin with a new plugin to download covers from Amazon --- src/calibre/customize/builtins.py | 4 +- src/calibre/ebooks/metadata/__init__.py | 2 + src/calibre/ebooks/metadata/amazon.py | 105 +++++++++++++++++++++--- src/calibre/ebooks/metadata/covers.py | 60 +++----------- 4 files changed, 110 insertions(+), 61 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 3ccc07040b..1dd575f45b 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -511,14 +511,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ - LibraryThingCovers, DoubanCovers + AmazonCovers, DoubanCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, + Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, NiceBooksCovers] plugins += [ ComicInput, diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index fcd4491fd3..6078a0aa94 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -271,6 +271,8 @@ def check_isbn13(isbn): return None def check_isbn(isbn): + if not isbn: + return None isbn = re.sub(r'[^0-9X]', '', isbn.upper()) if len(isbn) == 10: return check_isbn10(isbn) diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index cf96c9732c..98a2ac6d36 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en' Fetch metadata using Amazon AWS ''' import sys, re +from threading import RLock from lxml import html from lxml.html import soupparser @@ -17,6 +18,10 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode from calibre.library.comments import sanitize_comments_html +asin_cache = {} +cover_url_cache = {} +cache_lock = RLock() + def find_asin(br, isbn): q = 'http://www.amazon.com/s?field-keywords='+isbn raw = br.open_novisit(q).read() @@ -29,6 +34,12 @@ def find_asin(br, isbn): return revs[0] def to_asin(br, isbn): + with cache_lock: + ans = asin_cache.get(isbn, None) + if ans: + return ans + if ans is False: + return None if len(isbn) == 13: try: asin = find_asin(br, isbn) @@ -38,8 +49,11 @@ def to_asin(br, isbn): asin = None else: asin = isbn + with cache_lock: + asin_cache[isbn] = ans if ans else False return asin + def get_social_metadata(title, authors, publisher, isbn): mi = Metadata(title, authors) if not isbn: @@ -58,6 +72,68 @@ def get_social_metadata(title, authors, publisher, isbn): return mi return mi +def get_cover_url(isbn, br): + isbn = check_isbn(isbn) + if not isbn: + return None + with cache_lock: + ans = cover_url_cache.get(isbn, None) + if ans: + return ans + if ans is False: + return None + asin = to_asin(br, isbn) + if asin: + ans = _get_cover_url(br, asin) + if ans: + with cache_lock: + cover_url_cache[isbn] = ans + return ans + from calibre.ebooks.metadata.xisbn import xisbn + for i in xisbn.get_associated_isbns(isbn): + asin = to_asin(br, i) + if asin: + ans = _get_cover_url(br, asin) + if ans: + with cache_lock: + cover_url_cache[isbn] = ans + cover_url_cache[i] = ans + return ans + with cache_lock: + cover_url_cache[isbn] = False + return None + +def _get_cover_url(br, asin): + q = 'http://amzn.com/'+asin + try: + raw = br.open_novisit(q).read() + except Exception, e: + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + raise + if '404 - ' in raw: + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + root = soupparser.fromstring(raw) + except: + return False + + imgs = root.xpath('//img[@id="prodImage" and @src]') + if imgs: + src = imgs[0].get('src') + parts = src.split('/') + if len(parts) > 3: + bn = parts[-1] + sparts = bn.split('_') + if len(sparts) > 2: + bn = sparts[0] + sparts[-1] + return ('/'.join(parts[:-1]))+'/'+bn + return None + + def get_metadata(br, asin, mi): q = 'http://amzn.com/'+asin try: @@ -111,18 +187,25 @@ def get_metadata(br, asin, mi): def main(args=sys.argv): - # Test xisbn - print get_social_metadata('Learning Python', None, None, '8324616489') - print + import tempfile, os + tdir = tempfile.gettempdir() + br = browser() + for title, isbn in [ + ('Learning Python', '8324616489'), # Test xisbn + ('Angels & Demons', '9781416580829'), # Test sophisticated comment formatting + # Random tests + ('Star Trek: Destiny: Mere Mortals', '9781416551720'), + ('The Great Gatsby', '0743273567'), + ]: + cpath = os.path.join(tdir, title+'.jpg') + curl = get_cover_url(isbn, br) + if curl is None: + print 'No cover found for', title + else: + open(cpath, 'wb').write(br.open_novisit(curl).read()) + print 'Cover for', title, 'saved to', cpath - # Test sophisticated comment formatting - print get_social_metadata('Angels & Demons', None, None, '9781416580829') - print - - # Random tests - print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720') - print - print get_social_metadata('The Great Gatsby', None, None, '0743273567') + print get_social_metadata(title, None, None, isbn) return 0 diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py index 3deb54da10..15e0a05c1e 100644 --- a/src/calibre/ebooks/metadata/covers.py +++ b/src/calibre/ebooks/metadata/covers.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import traceback, socket, re, sys +import traceback, socket, sys from functools import partial from threading import Thread, Event from Queue import Queue, Empty @@ -15,7 +15,6 @@ import mechanize from calibre.customize import Plugin from calibre import browser, prints -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.constants import preferred_encoding, DEBUG class CoverDownload(Plugin): @@ -112,73 +111,38 @@ class OpenLibraryCovers(CoverDownload): # {{{ # }}} -class LibraryThingCovers(CoverDownload): # {{{ +class AmazonCovers(CoverDownload): # {{{ - name = 'librarything.com covers' - description = _('Download covers from librarything.com') + name = 'amazon.com covers' + description = _('Download covers from amazon.com') author = 'Kovid Goyal' - LIBRARYTHING = 'http://www.librarything.com/isbn/' - - def get_cover_url(self, isbn, br, timeout=5.): - - try: - src = br.open_novisit('http://www.librarything.com/isbn/'+isbn, - timeout=timeout).read().decode('utf-8', 'replace') - except Exception, err: - if isinstance(getattr(err, 'args', [None])[0], socket.timeout): - err = Exception(_('LibraryThing.com timed out. Try again later.')) - raise err - else: - if '/wiki/index.php/HelpThing:Verify' in src: - raise Exception('LibraryThing is blocking calibre.') - s = BeautifulSoup(src) - url = s.find('td', attrs={'class':'left'}) - if url is None: - if s.find('div', attrs={'class':'highloadwarning'}) is not None: - raise Exception(_('Could not fetch cover as server is experiencing high load. Please try again later.')) - raise Exception(_('ISBN: %s not found')%isbn) - url = url.find('img') - if url is None: - raise Exception(_('LibraryThing.com server error. Try again later.')) - url = re.sub(r'_S[XY]\d+', '', url['src']) - return url def has_cover(self, mi, ans, timeout=5.): - return False - if not mi.isbn or not self.site_customization: + if not mi.isbn: return False - from calibre.ebooks.metadata.library_thing import get_browser, login - br = get_browser() - un, _, pw = self.site_customization.partition(':') - login(br, un, pw) + from calibre.ebooks.metadata.amazon import get_cover_url + br = browser() try: - self.get_cover_url(mi.isbn, br, timeout=timeout) + get_cover_url(mi.isbn, br) self.debug('cover for', mi.isbn, 'found') ans.set() except Exception, e: self.debug(e) def get_covers(self, mi, result_queue, abort, timeout=5.): - if not mi.isbn or not self.site_customization: + if not mi.isbn: return - from calibre.ebooks.metadata.library_thing import get_browser, login - br = get_browser() - un, _, pw = self.site_customization.partition(':') - login(br, un, pw) + from calibre.ebooks.metadata.amazon import get_cover_url + br = browser() try: - url = self.get_cover_url(mi.isbn, br, timeout=timeout) + url = get_cover_url(mi.isbn, br) cover_data = br.open_novisit(url).read() result_queue.put((True, cover_data, 'jpg', self.name)) except Exception, e: result_queue.put((False, self.exception_to_string(e), traceback.format_exc(), self.name)) - def customization_help(self, gui=False): - ans = _('To use librarything.com you must sign up for a %sfree account%s ' - 'and enter your username and password separated by a : below.') - return '<p>'+ans%('<a href="http://www.librarything.com">', '</a>') - # }}} def check_for_cover(mi, timeout=5.): # {{{