Open library covers plugin migrated. Google plugin adds ratings and can now detect when an entry has a cover

This commit is contained in:
Kovid Goyal 2011-03-23 19:10:22 -06:00
parent 2848e0d2f1
commit d8e1dcf8e5
6 changed files with 104 additions and 42 deletions

View File

@ -1032,7 +1032,8 @@ plugins += [LookAndFeel, Behavior, Columns, Toolbar, Search, InputOptions,
# New metadata download plugins {{{ # New metadata download plugins {{{
from calibre.ebooks.metadata.sources.google import GoogleBooks from calibre.ebooks.metadata.sources.google import GoogleBooks
from calibre.ebooks.metadata.sources.amazon import Amazon from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
plugins += [GoogleBooks, Amazon] plugins += [GoogleBooks, Amazon, OpenLibrary]
# }}} # }}}

View File

@ -468,7 +468,7 @@ class Amazon(Source):
if cached_url is not None: if cached_url is not None:
break break
if cached_url is None: if cached_url is None:
log.info('No cover found for') log.info('No cover found')
return return
if abort.is_set(): if abort.is_set():

View File

@ -47,12 +47,12 @@ class InternalMetadataCompareKeyGen(object):
The algorithm is: The algorithm is:
1. Prefer results that have the same ISBN as specified in the query * Prefer results that have the same ISBN as specified in the query
2. Prefer results with all available fields filled in * Prefer results with all available fields filled in
3. Prefer results that are an exact title match to the query * Prefer results that are an exact title match to the query
4. Prefer results with longer comments (greater than 10 % longer) * Prefer results with a cached cover URL
5. Prefer results with a cached cover URL * Prefer results with longer comments (greater than 10 % longer)
6. Use the relevance of the result as reported by the metadata source's search * Use the relevance of the result as reported by the metadata source's search
engine engine
''' '''
@ -67,9 +67,9 @@ class InternalMetadataCompareKeyGen(object):
has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers)\ has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers)\
is None else 1 is None else 1
self.base = (isbn, all_fields, exact_title) self.base = (isbn, all_fields, exact_title, has_cover)
self.comments_len = len(mi.comments.strip() if mi.comments else '') self.comments_len = len(mi.comments.strip() if mi.comments else '')
self.extra = (has_cover, getattr(mi, 'source_relevance', 0)) self.extra = (getattr(mi, 'source_relevance', 0), )
def __cmp__(self, other): def __cmp__(self, other):
result = cmp(self.base, other.base) result = cmp(self.base, other.base)
@ -130,6 +130,12 @@ class Source(Plugin):
# Utility functions {{{ # Utility functions {{{
def get_related_isbns(self, id_):
with self.cache_lock:
for isbn, q in self._isbn_to_identifier_cache.iteritems():
if q == id_:
yield isbn
def cache_isbn_to_identifier(self, isbn, identifier): def cache_isbn_to_identifier(self, isbn, identifier):
with self.cache_lock: with self.cache_lock:
self._isbn_to_identifier_cache[isbn] = identifier self._isbn_to_identifier_cache[isbn] = identifier

View File

@ -25,7 +25,8 @@ from calibre import as_unicode
NAMESPACES = { NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
'atom' : 'http://www.w3.org/2005/Atom', 'atom' : 'http://www.w3.org/2005/Atom',
'dc': 'http://purl.org/dc/terms' 'dc' : 'http://purl.org/dc/terms',
'gd' : 'http://schemas.google.com/g/2005'
} }
XPath = partial(etree.XPath, namespaces=NAMESPACES) XPath = partial(etree.XPath, namespaces=NAMESPACES)
@ -42,6 +43,7 @@ publisher = XPath('descendant::dc:publisher')
subject = XPath('descendant::dc:subject') subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description') description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language') language = XPath('descendant::dc:language')
rating = XPath('descendant::gd:rating[@average]')
def get_details(browser, url, timeout): # {{{ def get_details(browser, url, timeout): # {{{
try: try:
@ -114,8 +116,10 @@ def to_metadata(browser, log, entry_, timeout): # {{{
btags = [x.text for x in subject(extra) if x.text] btags = [x.text for x in subject(extra) if x.text]
tags = [] tags = []
for t in btags: for t in btags:
tags.extend([y.strip() for y in t.split('/')]) atags = [y.strip() for y in t.split('/')]
tags = list(sorted(list(set(tags)))) for tag in atags:
if tag not in tags:
tags.append(tag)
except: except:
log.exception('Failed to parse tags:') log.exception('Failed to parse tags:')
tags = [] tags = []
@ -131,6 +135,18 @@ def to_metadata(browser, log, entry_, timeout): # {{{
except: except:
log.exception('Failed to parse pubdate') log.exception('Failed to parse pubdate')
# Ratings
for x in rating(extra):
try:
mi.rating = float(x.get('average'))
if mi.rating > 5:
mi.rating /= 2
except:
log.exception('Failed to parse rating')
# Cover
mi.has_google_cover = len(extra.xpath(
'//*[@rel="http://schemas.google.com/books/2008/thumbnail"]')) > 0
return mi return mi
# }}} # }}}
@ -142,9 +158,11 @@ class GoogleBooks(Source):
capabilities = frozenset(['identify', 'cover']) capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn', 'comments', 'publisher', 'identifier:isbn', 'rating',
'identifier:google']) # language currently disabled 'identifier:google']) # language currently disabled
GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
BASE_URL = 'http://books.google.com/books/feeds/volumes?' BASE_URL = 'http://books.google.com/books/feeds/volumes?'
isbn = check_isbn(identifiers.get('isbn', None)) isbn = check_isbn(identifiers.get('isbn', None))
@ -175,18 +193,9 @@ class GoogleBooks(Source):
}) })
# }}} # }}}
def cover_url_from_identifiers(self, identifiers):
goog = identifiers.get('google', None)
if goog is None:
isbn = identifiers.get('isbn', None)
goog = self.cached_isbn_to_identifier(isbn)
if goog is not None:
return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1' %
goog)
def download_cover(self, log, result_queue, abort, # {{{ def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30): title=None, authors=None, identifiers={}, timeout=30):
cached_url = self.cover_url_from_identifiers(identifiers) cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None: if cached_url is None:
log.info('No cached cover found, running identify') log.info('No cached cover found, running identify')
rq = Queue() rq = Queue()
@ -215,32 +224,38 @@ class GoogleBooks(Source):
br = self.browser br = self.browser
try: try:
cdata = br.open_novisit(cached_url, timeout=timeout).read() cdata = br.open_novisit(cached_url, timeout=timeout).read()
if self.is_cover_image_valid(cdata):
result_queue.put(cdata) result_queue.put(cdata)
else:
log.error('No cover found for %r'%identifiers)
except: except:
log.exception('Failed to download cover from:', cached_url) log.exception('Failed to download cover from:', cached_url)
# }}} # }}}
def get_cached_cover_url(self, identifiers): # {{{
url = None
goog = identifiers.get('google', None)
if goog is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
goog = self.cached_isbn_to_identifier(isbn)
if goog is not None:
url = self.cached_identifier_to_cover_url(goog)
def is_cover_image_valid(self, raw): return url
# When no cover is present, returns a PNG saying image not available # }}}
# Try for example google identifier llNqPwAACAAJ
# I have yet to see an actual cover in PNG format
return raw and len(raw) > 17000 and raw[1:4] != b'PNG'
def get_all_details(self, br, log, entries, abort, result_queue, timeout): def get_all_details(self, br, log, entries, abort, # {{{
result_queue, timeout):
for relevance, i in enumerate(entries): for relevance, i in enumerate(entries):
try: try:
ans = to_metadata(br, log, i, timeout) ans = to_metadata(br, log, i, timeout)
if isinstance(ans, Metadata): if isinstance(ans, Metadata):
ans.source_relevance = relevance ans.source_relevance = relevance
goog = ans.identifiers['google']
for isbn in getattr(ans, 'all_isbns', []): for isbn in getattr(ans, 'all_isbns', []):
self.cache_isbn_to_identifier(isbn, self.cache_isbn_to_identifier(isbn, goog)
ans.identifiers['google']) if ans.has_google_cover:
self.cache_identifier_to_cover_url(goog,
self.GOOGLE_COVER%goog)
result_queue.put(ans) result_queue.put(ans)
except: except:
log.exception( log.exception(
@ -248,6 +263,7 @@ class GoogleBooks(Source):
etree.tostring(i)) etree.tostring(i))
if abort.is_set(): if abort.is_set():
break break
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30): identifiers={}, timeout=30):
@ -281,7 +297,7 @@ class GoogleBooks(Source):
return None return None
# }}} # }}}
if __name__ == '__main__': if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin, from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test) title_test, authors_test)
@ -296,8 +312,10 @@ if __name__ == '__main__':
authors_test(['Francis Scott Fitzgerald'])] authors_test(['Francis Scott Fitzgerald'])]
), ),
#( (
# {'title': 'Great Expectations', 'authors':['Charles Dickens']}, {'title': 'Flatland', 'authors':['Abbott']},
# [title_test('Great Expectations', exact=True)] [title_test('Flatland', exact=False)]
#), ),
]) ])
# }}}

View File

@ -0,0 +1,35 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.metadata.sources.base import Source
class OpenLibrary(Source):
    '''
    Cover-only metadata source that fetches book covers from the Open
    Library covers service, keyed by ISBN.
    '''

    name = 'Open Library'
    description = _('Downloads metadata from The Open Library')

    capabilities = frozenset(['cover'])

    # default=false makes the service return HTTP 404 instead of a
    # placeholder image when no cover exists for the ISBN.
    OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30):
        '''
        Download the cover for the book identified by ``identifiers`` and put
        the raw image data onto ``result_queue``. The covers API is keyed only
        by ISBN, so nothing is done when no ISBN is present.
        '''
        isbn = identifiers.get('isbn', None)
        if isbn is None:
            return
        # Respect a cancelled job before doing any network I/O, matching the
        # behaviour of the other metadata source plugins.
        if abort.is_set():
            return
        br = self.browser
        try:
            ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()
            result_queue.put(ans)
        except Exception as e:
            # A 404 is the service's documented "no cover" answer, not an
            # error worth a traceback.
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                log.error('No cover for ISBN: %r found'%isbn)
            else:
                log.exception('Failed to download cover for ISBN:', isbn)

View File

@ -99,6 +99,8 @@ def test_identify_plugin(name, tests):
for i, mi in enumerate(results): for i, mi in enumerate(results):
prints('*'*30, 'Relevance:', i, '*'*30) prints('*'*30, 'Relevance:', i, '*'*30)
prints(mi) prints(mi)
prints('\nCached cover URL :',
plugin.get_cached_cover_url(mi.identifiers))
prints('*'*75, '\n\n') prints('*'*75, '\n\n')
possibles = [] possibles = []