Replace LibraryThing cover download plugin with a new plugin to download covers from Amazon

Kovid Goyal 2011-02-13 15:47:28 -07:00
parent 73119e2597
commit 3a2daf39e3
4 changed files with 110 additions and 61 deletions

View File

@@ -511,14 +511,14 @@ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
 from calibre.ebooks.metadata.douban import DoubanBooks
 from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-        LibraryThingCovers, DoubanCovers
+        AmazonCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
 from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck

 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
         KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+        Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers,
         NiceBooksCovers]
 plugins += [
     ComicInput,

View File

@@ -271,6 +271,8 @@ def check_isbn13(isbn):
     return None

 def check_isbn(isbn):
+    if not isbn:
+        return None
     isbn = re.sub(r'[^0-9X]', '', isbn.upper())
     if len(isbn) == 10:
         return check_isbn10(isbn)

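For context, the new guard only makes check_isbn() tolerate a missing ISBN before the usual ISBN-10/ISBN-13 checksum validation runs. A minimal standalone sketch of those checksum rules (illustrative only, not calibre's implementation):

def valid_isbn10(isbn):
    # ISBN-10: positions 1-10 weighted by position; 'X' counts as 10.
    # The weighted sum must be divisible by 11.
    if len(isbn) != 10:
        return False
    total = sum(pos * (10 if ch == 'X' else int(ch))
                for pos, ch in enumerate(isbn, 1))
    return total % 11 == 0

def valid_isbn13(isbn):
    # ISBN-13: digits weighted alternately 1 and 3; the sum must be
    # divisible by 10.
    if len(isbn) != 13:
        return False
    total = sum((1 if i % 2 == 0 else 3) * int(ch)
                for i, ch in enumerate(isbn))
    return total % 10 == 0

# e.g. valid_isbn10('0743273567') and valid_isbn13('9781416551720') are both True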
View File

@@ -7,6 +7,7 @@ __docformat__ = 'restructuredtext en'
 Fetch metadata using Amazon AWS
 '''
 import sys, re
+from threading import RLock

 from lxml import html
 from lxml.html import soupparser
@@ -17,6 +18,10 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.library.comments import sanitize_comments_html

+asin_cache = {}
+cover_url_cache = {}
+cache_lock = RLock()
+
 def find_asin(br, isbn):
     q = 'http://www.amazon.com/s?field-keywords='+isbn
     raw = br.open_novisit(q).read()
@@ -29,6 +34,12 @@ def find_asin(br, isbn):
     return revs[0]

 def to_asin(br, isbn):
+    with cache_lock:
+        ans = asin_cache.get(isbn, None)
+    if ans:
+        return ans
+    if ans is False:
+        return None
     if len(isbn) == 13:
         try:
             asin = find_asin(br, isbn)
@@ -38,8 +49,11 @@ def to_asin(br, isbn):
             asin = None
     else:
         asin = isbn
+    with cache_lock:
+        asin_cache[isbn] = asin if asin else False
     return asin

 def get_social_metadata(title, authors, publisher, isbn):
     mi = Metadata(title, authors)
     if not isbn:
@@ -58,6 +72,68 @@ def get_social_metadata(title, authors, publisher, isbn):
         return mi
     return mi

+def get_cover_url(isbn, br):
+    isbn = check_isbn(isbn)
+    if not isbn:
+        return None
+    with cache_lock:
+        ans = cover_url_cache.get(isbn, None)
+    if ans:
+        return ans
+    if ans is False:
+        return None
+    asin = to_asin(br, isbn)
+    if asin:
+        ans = _get_cover_url(br, asin)
+        if ans:
+            with cache_lock:
+                cover_url_cache[isbn] = ans
+            return ans
+    from calibre.ebooks.metadata.xisbn import xisbn
+    for i in xisbn.get_associated_isbns(isbn):
+        asin = to_asin(br, i)
+        if asin:
+            ans = _get_cover_url(br, asin)
+            if ans:
+                with cache_lock:
+                    cover_url_cache[isbn] = ans
+                    cover_url_cache[i] = ans
+                return ans
+    with cache_lock:
+        cover_url_cache[isbn] = False
+    return None
+
+def _get_cover_url(br, asin):
+    q = 'http://amzn.com/'+asin
+    try:
+        raw = br.open_novisit(q).read()
+    except Exception, e:
+        if callable(getattr(e, 'getcode', None)) and \
+                e.getcode() == 404:
+            return None
+        raise
+    if '<title>404 - ' in raw:
+        return None
+    raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            resolve_entities=True)[0]
+    try:
+        root = soupparser.fromstring(raw)
+    except:
+        return False
+    imgs = root.xpath('//img[@id="prodImage" and @src]')
+    if imgs:
+        src = imgs[0].get('src')
+        parts = src.split('/')
+        if len(parts) > 3:
+            bn = parts[-1]
+            sparts = bn.split('_')
+            if len(sparts) > 2:
+                bn = sparts[0] + sparts[-1]
+            return ('/'.join(parts[:-1]))+'/'+bn
+    return None
+
 def get_metadata(br, asin, mi):
     q = 'http://amzn.com/'+asin
     try:
@@ -111,18 +187,25 @@ def get_metadata(br, asin, mi):

 def main(args=sys.argv):
-    # Test xisbn
-    print get_social_metadata('Learning Python', None, None, '8324616489')
-    print
-
-    # Test sophisticated comment formatting
-    print get_social_metadata('Angels & Demons', None, None, '9781416580829')
-    print
-
-    # Random tests
-    print get_social_metadata('Star Trek: Destiny: Mere Mortals', None, None, '9781416551720')
-    print
-    print get_social_metadata('The Great Gatsby', None, None, '0743273567')
+    import tempfile, os
+    tdir = tempfile.gettempdir()
+    br = browser()
+    for title, isbn in [
+            ('Learning Python', '8324616489'), # Test xisbn
+            ('Angels & Demons', '9781416580829'), # Test sophisticated comment formatting
+            # Random tests
+            ('Star Trek: Destiny: Mere Mortals', '9781416551720'),
+            ('The Great Gatsby', '0743273567'),
+            ]:
+        cpath = os.path.join(tdir, title+'.jpg')
+        curl = get_cover_url(isbn, br)
+        if curl is None:
+            print 'No cover found for', title
+        else:
+            open(cpath, 'wb').write(br.open_novisit(curl).read())
+            print 'Cover for', title, 'saved to', cpath
+        print get_social_metadata(title, None, None, isbn)

     return 0

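The module-level asin_cache and cover_url_cache dictionaries above follow a simple thread-safe memoization pattern: reads and writes are guarded by an RLock, and False is stored as a sentinel so ISBNs that already failed are not looked up again. A minimal self-contained sketch of that pattern (names here are illustrative, not calibre's):

from threading import RLock

_cache = {}
_cache_lock = RLock()

def cached_lookup(key, fetch):
    # fetch(key) performs the expensive lookup and returns None on failure.
    with _cache_lock:
        ans = _cache.get(key, None)
    if ans:
        return ans
    if ans is False:   # a previous lookup failed; do not retry
        return None
    ans = fetch(key)
    with _cache_lock:
        _cache[key] = ans if ans else False
    return ans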
View File

@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import traceback, socket, re, sys
+import traceback, socket, sys
 from functools import partial
 from threading import Thread, Event
 from Queue import Queue, Empty
@@ -15,7 +15,6 @@ import mechanize
 from calibre.customize import Plugin
 from calibre import browser, prints
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.constants import preferred_encoding, DEBUG

 class CoverDownload(Plugin):
@@ -112,73 +111,38 @@ class OpenLibraryCovers(CoverDownload): # {{{
 # }}}

-class LibraryThingCovers(CoverDownload): # {{{
+class AmazonCovers(CoverDownload): # {{{

-    name = 'librarything.com covers'
-    description = _('Download covers from librarything.com')
+    name = 'amazon.com covers'
+    description = _('Download covers from amazon.com')
     author = 'Kovid Goyal'

-    LIBRARYTHING = 'http://www.librarything.com/isbn/'
-
-    def get_cover_url(self, isbn, br, timeout=5.):
-        try:
-            src = br.open_novisit('http://www.librarything.com/isbn/'+isbn,
-                    timeout=timeout).read().decode('utf-8', 'replace')
-        except Exception, err:
-            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
-                err = Exception(_('LibraryThing.com timed out. Try again later.'))
-            raise err
-        else:
-            if '/wiki/index.php/HelpThing:Verify' in src:
-                raise Exception('LibraryThing is blocking calibre.')
-            s = BeautifulSoup(src)
-            url = s.find('td', attrs={'class':'left'})
-            if url is None:
-                if s.find('div', attrs={'class':'highloadwarning'}) is not None:
-                    raise Exception(_('Could not fetch cover as server is experiencing high load. Please try again later.'))
-                raise Exception(_('ISBN: %s not found')%isbn)
-            url = url.find('img')
-            if url is None:
-                raise Exception(_('LibraryThing.com server error. Try again later.'))
-            url = re.sub(r'_S[XY]\d+', '', url['src'])
-            return url
-
     def has_cover(self, mi, ans, timeout=5.):
-        return False
-        if not mi.isbn or not self.site_customization:
+        if not mi.isbn:
             return False
-        from calibre.ebooks.metadata.library_thing import get_browser, login
-        br = get_browser()
-        un, _, pw = self.site_customization.partition(':')
-        login(br, un, pw)
+        from calibre.ebooks.metadata.amazon import get_cover_url
+        br = browser()
         try:
-            self.get_cover_url(mi.isbn, br, timeout=timeout)
+            get_cover_url(mi.isbn, br)
             self.debug('cover for', mi.isbn, 'found')
             ans.set()
         except Exception, e:
             self.debug(e)

     def get_covers(self, mi, result_queue, abort, timeout=5.):
-        if not mi.isbn or not self.site_customization:
+        if not mi.isbn:
             return
-        from calibre.ebooks.metadata.library_thing import get_browser, login
-        br = get_browser()
-        un, _, pw = self.site_customization.partition(':')
-        login(br, un, pw)
+        from calibre.ebooks.metadata.amazon import get_cover_url
+        br = browser()
         try:
-            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+            url = get_cover_url(mi.isbn, br)
             cover_data = br.open_novisit(url).read()
             result_queue.put((True, cover_data, 'jpg', self.name))
         except Exception, e:
             result_queue.put((False, self.exception_to_string(e),
                 traceback.format_exc(), self.name))

-    def customization_help(self, gui=False):
-        ans = _('To use librarything.com you must sign up for a %sfree account%s '
-                'and enter your username and password separated by a : below.')
-        return '<p>'+ans%('<a href="http://www.librarything.com">', '</a>')
-
 # }}}

 def check_for_cover(mi, timeout=5.): # {{{
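For reference, a rough sketch (not calibre code, based only on the signatures visible in this diff) of how the CoverDownload queue protocol used by AmazonCovers.get_covers() can be driven: the plugin puts (ok, data, fmt, plugin_name) tuples on the result queue, where data holds either the cover bytes or an error message.

from Queue import Queue, Empty   # Python 2, matching the code above
from threading import Event

def fetch_cover(plugin, mi, timeout=5.):
    result_queue, abort = Queue(), Event()
    plugin.get_covers(mi, result_queue, abort, timeout=timeout)
    try:
        ok, data, fmt, name = result_queue.get(True, timeout)
    except Empty:
        return None, None        # plugin produced nothing (e.g. no ISBN)
    if not ok:
        raise Exception('%s failed: %s' % (name, data))
    return data, fmt             # raw cover bytes and its format ('jpg')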