mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
caching of google identifiers and logic to get cover url from google identifier
This commit is contained in:
parent
6460f08b7f
commit
52c19d7b9b
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re, threading
|
||||||
|
|
||||||
from calibre.customize import Plugin
|
from calibre.customize import Plugin
|
||||||
from calibre.utils.logging import ThreadSafeLog, FileStream
|
from calibre.utils.logging import ThreadSafeLog, FileStream
|
||||||
@ -30,7 +30,21 @@ class Source(Plugin):
|
|||||||
|
|
||||||
touched_fields = frozenset()
|
touched_fields = frozenset()
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
    '''
    Initialise the underlying Plugin and create the per-instance
    ISBN -> identifier cache together with the lock that guards it.

    All positional and keyword arguments are forwarded unchanged to
    Plugin.__init__.
    '''
    Plugin.__init__(self, *args, **kwargs)
    # The cache accessors acquire this lock around every read and write
    # of the cache dictionary.
    self.cache_lock = threading.RLock()
    # Maps an ISBN to the identifier the metadata source uses for that book.
    self._isbn_to_identifier_cache = {}
|
||||||
|
|
||||||
# Utility functions {{{
|
# Utility functions {{{
|
||||||
|
|
||||||
|
def cache_isbn_to_identifier(self, isbn, identifier):
    '''
    Record that `isbn` corresponds to `identifier`.

    The cache dictionary is updated while holding self.cache_lock. An
    existing entry for the same ISBN is overwritten.
    '''
    with self.cache_lock:
        cache = self._isbn_to_identifier_cache
        cache[isbn] = identifier
|
||||||
|
|
||||||
|
def cached_isbn_to_identifier(self, isbn):
    '''
    Look up the identifier previously cached for `isbn`.

    Returns the cached identifier, or None when nothing has been cached
    for that ISBN. The lookup is performed while holding self.cache_lock.
    '''
    with self.cache_lock:
        cache = self._isbn_to_identifier_cache
        return cache.get(isbn)
|
||||||
|
|
||||||
def get_author_tokens(self, authors, only_first_author=True):
|
def get_author_tokens(self, authors, only_first_author=True):
|
||||||
'''
|
'''
|
||||||
Take a list of authors and return a list of tokens useful for an
|
Take a list of authors and return a list of tokens useful for an
|
||||||
|
@ -13,6 +13,7 @@ from functools import partial
|
|||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre.ebooks.metadata import check_isbn
|
||||||
from calibre.ebooks.metadata.sources.base import Source
|
from calibre.ebooks.metadata.sources.base import Source
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
@ -69,6 +70,7 @@ def to_metadata(browser, log, entry_, timeout):
|
|||||||
|
|
||||||
|
|
||||||
id_url = entry_id(entry_)[0].text
|
id_url = entry_id(entry_)[0].text
|
||||||
|
google_id = id_url.split('/')[-1]
|
||||||
title_ = ': '.join([x.text for x in title(entry_)]).strip()
|
title_ = ': '.join([x.text for x in title(entry_)]).strip()
|
||||||
authors = [x.text.strip() for x in creator(entry_) if x.text]
|
authors = [x.text.strip() for x in creator(entry_) if x.text]
|
||||||
if not authors:
|
if not authors:
|
||||||
@ -78,6 +80,7 @@ def to_metadata(browser, log, entry_, timeout):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
mi = Metadata(title_, authors)
|
mi = Metadata(title_, authors)
|
||||||
|
mi.identifiers = {'google':google_id}
|
||||||
try:
|
try:
|
||||||
raw = get_details(browser, id_url, timeout)
|
raw = get_details(browser, id_url, timeout)
|
||||||
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||||
@ -103,9 +106,12 @@ def to_metadata(browser, log, entry_, timeout):
|
|||||||
t = str(x.text).strip()
|
t = str(x.text).strip()
|
||||||
if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
|
if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
|
||||||
if t[:5].upper() == 'ISBN:':
|
if t[:5].upper() == 'ISBN:':
|
||||||
isbns.append(t[5:])
|
t = check_isbn(t[5:])
|
||||||
|
if t:
|
||||||
|
isbns.append(t)
|
||||||
if isbns:
|
if isbns:
|
||||||
mi.isbn = sorted(isbns, key=len)[-1]
|
mi.isbn = sorted(isbns, key=len)[-1]
|
||||||
|
mi.all_isbns = isbns
|
||||||
|
|
||||||
# Tags
|
# Tags
|
||||||
try:
|
try:
|
||||||
@ -133,20 +139,6 @@ def to_metadata(browser, log, entry_, timeout):
|
|||||||
return mi
|
return mi
|
||||||
|
|
||||||
|
|
||||||
def get_all_details(br, log, entries, abort, result_queue, timeout):
    '''
    Convert each feed entry to metadata and put the results on a queue.

    :param br: browser object handed through to to_metadata
    :param log: logger; log.exception is called when an entry fails
    :param entries: iterable of feed entries to process, in order
    :param abort: event-like object; processing stops once abort.is_set()
                  returns True (checked after every entry)
    :param result_queue: queue receiving every Metadata object produced
    :param timeout: timeout handed through to to_metadata
    '''
    for entry in entries:
        try:
            ans = to_metadata(br, log, entry, timeout)
            if isinstance(ans, Metadata):
                result_queue.put(ans)
        except Exception:
            # Best effort: one broken entry must not stop the remaining
            # ones. Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            log.exception(
                'Failed to get metadata for identify entry:',
                etree.tostring(entry))
        if abort.is_set():
            break
|
|
||||||
|
|
||||||
|
|
||||||
class GoogleBooks(Source):
|
class GoogleBooks(Source):
|
||||||
|
|
||||||
name = 'Google Books'
|
name = 'Google Books'
|
||||||
@ -185,6 +177,36 @@ class GoogleBooks(Source):
|
|||||||
'min-viewability':'none',
|
'min-viewability':'none',
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def cover_url_from_identifiers(self, identifiers):
    '''
    Build the Google Books front-cover image URL for a book.

    :param identifiers: mapping of identifier type to value. The 'google'
                        key is used directly when present; otherwise the
                        'isbn' key is resolved through the ISBN cache via
                        self.cached_isbn_to_identifier.
    :return: the cover URL as a string, or None when no Google identifier
             can be determined.
    '''
    goog = identifiers.get('google', None)
    if goog is None:
        isbn = identifiers.get('isbn', None)
        # Without an ISBN there is nothing to look up, so skip the
        # (locked) cache access entirely.
        if isbn is not None:
            goog = self.cached_isbn_to_identifier(isbn)
    if goog is None:
        return None
    return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1' %
            goog)
|
||||||
|
|
||||||
|
def is_cover_image_valid(self, raw):
    '''
    Decide whether downloaded cover data is a real cover.

    When no cover is present, Google returns a PNG saying the image is not
    available (try for example google identifier llNqPwAACAAJ). The
    original author had yet to see an actual cover in PNG format, so any
    PNG payload is rejected, as is anything of 17000 bytes or less.

    :param raw: the downloaded image data (bytes or str), or None
    :return: True if the data looks like a usable cover, False otherwise
    '''
    if not raw or len(raw) <= 17000:
        return False
    # A PNG file carries the marker 'PNG' at offsets 1-3. Compare against
    # both the bytes and the text form so the check works whichever type
    # the payload arrives as.
    return raw[1:4] not in (b'PNG', 'PNG')
|
||||||
|
|
||||||
|
def get_all_details(self, br, log, entries, abort, result_queue, timeout):
    '''
    Convert each feed entry to metadata, queue the results and remember
    which Google identifier every ISBN found belongs to.

    :param br: browser object handed through to to_metadata
    :param log: logger; log.exception is called when an entry fails
    :param entries: iterable of feed entries to process, in order
    :param abort: event-like object; processing stops once abort.is_set()
                  returns True (checked after every entry)
    :param result_queue: queue receiving every Metadata object produced
    :param timeout: timeout handed through to to_metadata
    '''
    for entry in entries:
        try:
            ans = to_metadata(br, log, entry, timeout)
            if isinstance(ans, Metadata):
                result_queue.put(ans)
                # NOTE(review): to_metadata appears to set all_isbns only
                # when at least one ISBN was found - confirm. Read it
                # defensively so that a result without ISBNs is not
                # reported as a failure after it was already queued.
                isbns = getattr(ans, 'all_isbns', None) or ()
                if isbns:
                    goog = ans.identifiers['google']
                    for isbn in isbns:
                        self.cache_isbn_to_identifier(isbn, goog)
        except Exception:
            # Best effort: one broken entry must not stop the remaining
            # ones. Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            log.exception(
                'Failed to get metadata for identify entry:',
                etree.tostring(entry))
        if abort.is_set():
            break
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None,
|
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||||
identifiers={}, timeout=5):
|
identifiers={}, timeout=5):
|
||||||
@ -207,8 +229,8 @@ class GoogleBooks(Source):
|
|||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
# There is no point running these queries in threads as google
|
# There is no point running these queries in threads as google
|
||||||
# throttles requests returning Forbidden errors
|
# throttles requests returning 403 Forbidden errors
|
||||||
get_all_details(br, log, entries, abort, result_queue, timeout)
|
self.get_all_details(br, log, entries, abort, result_queue, timeout)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -218,8 +240,14 @@ if __name__ == '__main__':
|
|||||||
title_test)
|
title_test)
|
||||||
test_identify_plugin(GoogleBooks.name,
|
test_identify_plugin(GoogleBooks.name,
|
||||||
[
|
[
|
||||||
|
|
||||||
(
|
(
|
||||||
{'title': 'Great Expectations', 'authors':['Charles Dickens']},
|
{'identifiers':{'isbn': '0743273567'}},
|
||||||
[title_test('Great Expectations', exact=True)]
|
[title_test('The great gatsby', exact=True)]
|
||||||
),
|
),
|
||||||
|
|
||||||
|
#(
|
||||||
|
# {'title': 'Great Expectations', 'authors':['Charles Dickens']},
|
||||||
|
# [title_test('Great Expectations', exact=True)]
|
||||||
|
#),
|
||||||
])
|
])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user