mirror of https://github.com/kovidgoyal/calibre.git
pep8

This commit is contained in:
commit 4ae11fa295
parent 4f5155d190
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -1,37 +1,34 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
+# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import absolute_import, division, print_function, unicode_literals
 
-__license__ = 'GPL v3'
-__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import time, hashlib
+import hashlib
+import time
 from functools import partial
-from Queue import Queue, Empty
+from Queue import Empty, Queue
 
-from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import Source
-from calibre.ebooks.metadata.book.base import Metadata
+from calibre import as_unicode
 from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.cleantext import clean_ascii_chars
 from calibre.utils.localization import canonicalize_lang
-from calibre import as_unicode
 
 NAMESPACES = {
-    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
-    'atom' : 'http://www.w3.org/2005/Atom',
-    'dc' : 'http://purl.org/dc/terms',
-    'gd' : 'http://schemas.google.com/g/2005'
+    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
+    'atom': 'http://www.w3.org/2005/Atom',
+    'dc': 'http://purl.org/dc/terms',
+    'gd': 'http://schemas.google.com/g/2005'
 }
 
 
-def get_details(browser, url, timeout): # {{{
+def get_details(browser, url, timeout):  # {{{
     try:
         raw = browser.open_novisit(url, timeout=timeout).read()
     except Exception as e:
-        gc = getattr(e, 'getcode', lambda : -1)
+        gc = getattr(e, 'getcode', lambda: -1)
         if gc() != 403:
             raise
         # Google is throttling us, wait a little
@@ -39,6 +36,8 @@ def get_details(browser, url, timeout): # {{{
         raw = browser.open_novisit(url, timeout=timeout).read()
 
     return raw
+
+
 # }}}
 
 
@@ -49,17 +48,17 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     # total_results = XPath('//openSearch:totalResults')
     # start_index = XPath('//openSearch:startIndex')
     # items_per_page = XPath('//openSearch:itemsPerPage')
-    entry          = XPath('//atom:entry')
-    entry_id       = XPath('descendant::atom:id')
-    creator        = XPath('descendant::dc:creator')
-    identifier     = XPath('descendant::dc:identifier')
-    title          = XPath('descendant::dc:title')
-    date           = XPath('descendant::dc:date')
-    publisher      = XPath('descendant::dc:publisher')
-    subject        = XPath('descendant::dc:subject')
-    description    = XPath('descendant::dc:description')
-    language       = XPath('descendant::dc:language')
-    rating         = XPath('descendant::gd:rating[@average]')
+    entry = XPath('//atom:entry')
+    entry_id = XPath('descendant::atom:id')
+    creator = XPath('descendant::dc:creator')
+    identifier = XPath('descendant::dc:identifier')
+    title = XPath('descendant::dc:title')
+    date = XPath('descendant::dc:date')
+    publisher = XPath('descendant::dc:publisher')
+    subject = XPath('descendant::dc:subject')
+    description = XPath('descendant::dc:description')
+    language = XPath('descendant::dc:language')
+    rating = XPath('descendant::gd:rating[@average]')
 
     def get_text(extra, x):
         try:
@@ -83,11 +82,12 @@ def to_metadata(browser, log, entry_, timeout): # {{{
         return None
 
     mi = Metadata(title_, authors)
-    mi.identifiers = {'google':google_id}
+    mi.identifiers = {'google': google_id}
     try:
         raw = get_details(browser, id_url, timeout)
-        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
-            strip_encoding_pats=True)[0])
+        feed = etree.fromstring(
+            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]
+        )
         extra = entry(feed)[0]
     except:
         log.exception('Failed to get additional details for', mi.title)
@@ -135,7 +135,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
             default = utcnow().replace(day=15)
             mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
         except:
-            log.error('Failed to parse pubdate %r'%pubdate)
+            log.error('Failed to parse pubdate %r' % pubdate)
 
     # Ratings
     for x in rating(extra):
@@ -149,11 +149,14 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     # Cover
     mi.has_google_cover = None
     for x in extra.xpath(
-            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
+        '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'
+    ):
         mi.has_google_cover = x.get('href')
         break
 
     return mi
+
+
 # }}}
 
 
@@ -162,21 +165,23 @@ class GoogleBooks(Source):
     name = 'Google'
     description = _('Downloads metadata and covers from Google Books')
 
-    capabilities = frozenset(['identify', 'cover'])
-    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
-        'comments', 'publisher', 'identifier:isbn', 'rating',
-        'identifier:google', 'languages'])
+    capabilities = frozenset({'identify', 'cover'})
+    touched_fields = frozenset({
+        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
+        'identifier:isbn', 'rating', 'identifier:google', 'languages'
+    })
     supports_gzip_transfer_encoding = True
     cached_cover_url_is_reliable = False
 
     GOOGLE_COVER = 'https://books.google.com/books?id=%s&printsec=frontcover&img=1'
 
-    DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])
+    DUMMY_IMAGE_MD5 = frozenset({'0de4383ebad0adad5eeb8975cd796657'})
 
-    def get_book_url(self, identifiers): # {{{
+    def get_book_url(self, identifiers):  # {{{
         goog = identifiers.get('google', None)
         if goog is not None:
-            return ('google', goog, 'https://books.google.com/books?id=%s'%goog)
+            return ('google', goog, 'https://books.google.com/books?id=%s' % goog)
+
     # }}}
 
-    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
+    def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
@@ -185,39 +190,55 @@ class GoogleBooks(Source):
         isbn = check_isbn(identifiers.get('isbn', None))
         q = ''
         if isbn is not None:
-            q += 'isbn:'+isbn
+            q += 'isbn:' + isbn
         elif title or authors:
+
             def build_term(prefix, parts):
-                return ' '.join('in'+prefix + ':' + x for x in parts)
+                return ' '.join('in' + prefix + ':' + x for x in parts)
+
             title_tokens = list(self.get_title_tokens(title))
             if title_tokens:
                 q += build_term('title', title_tokens)
-            author_tokens = self.get_author_tokens(authors,
-                    only_first_author=True)
+            author_tokens = self.get_author_tokens(authors, only_first_author=True)
             if author_tokens:
-                q += ('+' if q else '') + build_term('author',
-                        author_tokens)
+                q += ('+' if q else '') + build_term('author', author_tokens)
 
         if isinstance(q, unicode):
             q = q.encode('utf-8')
         if not q:
             return None
-        return BASE_URL+urlencode({
-            'q':q,
-            'max-results':20,
-            'start-index':1,
-            'min-viewability':'none',
+        return BASE_URL + urlencode({
+            'q': q,
+            'max-results': 20,
+            'start-index': 1,
+            'min-viewability': 'none',
         })
+
     # }}}
 
-    def download_cover(self, log, result_queue, abort, # {{{
-            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
+    def download_cover(  # {{{
+        self,
+        log,
+        result_queue,
+        abort,
+        title=None,
+        authors=None,
+        identifiers={},
+        timeout=30,
+        get_best_cover=False
+    ):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:
             log.info('No cached cover found, running identify')
             rq = Queue()
-            self.identify(log, rq, abort, title=title, authors=authors,
-                    identifiers=identifiers)
+            self.identify(
+                log,
+                rq,
+                abort,
+                title=title,
+                authors=authors,
+                identifiers=identifiers
+            )
             if abort.is_set():
                 return
             results = []
@@ -226,8 +247,11 @@ class GoogleBooks(Source):
                     results.append(rq.get_nowait())
                 except Empty:
                     break
-            results.sort(key=self.identify_results_keygen(
-                title=title, authors=authors, identifiers=identifiers))
+            results.sort(
+                key=self.identify_results_keygen(
+                    title=title, authors=authors, identifiers=identifiers
+                )
+            )
             for mi in results:
                 cached_url = self.get_cached_cover_url(mi.identifiers)
                 if cached_url is not None:
@@ -263,10 +287,18 @@ class GoogleBooks(Source):
            url = self.cached_identifier_to_cover_url(goog)
 
        return url
+
    # }}}
 
-    def get_all_details(self, br, log, entries, abort, # {{{
-            result_queue, timeout):
+    def get_all_details(  # {{{
+        self,
+        br,
+        log,
+        entries,
+        abort,
+        result_queue,
+        timeout
+    ):
         from lxml import etree
         for relevance, i in enumerate(entries):
             try:
@@ -277,26 +309,37 @@ class GoogleBooks(Source):
                     for isbn in getattr(ans, 'all_isbns', []):
                         self.cache_isbn_to_identifier(isbn, goog)
                     if getattr(ans, 'has_google_cover', False):
-                        self.cache_identifier_to_cover_url(goog,
-                                self.GOOGLE_COVER%goog)
+                        self.cache_identifier_to_cover_url(
+                            goog, self.GOOGLE_COVER % goog
+                        )
                     self.clean_downloaded_metadata(ans)
                     result_queue.put(ans)
             except:
                 log.exception(
-                    'Failed to get metadata for identify entry:',
-                    etree.tostring(i))
+                    'Failed to get metadata for identify entry:', etree.tostring(i)
+                )
             if abort.is_set():
                 break
+
     # }}}
 
-    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
-            identifiers={}, timeout=30):
+    def identify(  # {{{
+        self,
+        log,
+        result_queue,
+        abort,
+        title=None,
+        authors=None,
+        identifiers={},
+        timeout=30
+    ):
         from lxml import etree
         XPath = partial(etree.XPath, namespaces=NAMESPACES)
         entry = XPath('//atom:entry')
 
-        query = self.create_query(log, title=title, authors=authors,
-                identifiers=identifiers)
+        query = self.create_query(
+            log, title=title, authors=authors, identifiers=identifiers
+        )
         if not query:
             log.error('Insufficient metadata to construct query')
             return
@@ -304,13 +347,15 @@ class GoogleBooks(Source):
         try:
             raw = br.open_novisit(query, timeout=timeout).read()
         except Exception as e:
-            log.exception('Failed to make identify query: %r'%query)
+            log.exception('Failed to make identify query: %r' % query)
             return as_unicode(e)
 
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
-            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
-                strip_encoding_pats=True)[0], parser=parser)
+            feed = etree.fromstring(
+                xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
+                parser=parser
+            )
             entries = entry(feed)
         except Exception as e:
             log.exception('Failed to parse identify results')
@@ -318,34 +363,45 @@ class GoogleBooks(Source):
 
         if not entries and identifiers and title and authors and \
                 not abort.is_set():
-            return self.identify(log, result_queue, abort, title=title,
-                    authors=authors, timeout=timeout)
+            return self.identify(
+                log,
+                result_queue,
+                abort,
+                title=title,
+                authors=authors,
+                timeout=timeout
+            )
 
         # There is no point running these queries in threads as google
         # throttles requests returning 403 Forbidden errors
         self.get_all_details(br, log, entries, abort, result_queue, timeout)
 
         return None
 
     # }}}
 
 
-if __name__ == '__main__': # tests {{{
+if __name__ == '__main__':  # tests {{{
     # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
-    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            title_test, authors_test)
-    test_identify_plugin(GoogleBooks.name,
-        [
-            (
-                {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby',
-                    'authors':['Fitzgerald']},
-                [title_test('The great gatsby', exact=True),
-                    authors_test(['F. Scott Fitzgerald'])]
-            ),
-
-            (
-                {'title': 'Flatland', 'authors':['Abbott']},
-                [title_test('Flatland', exact=False)]
-            ),
-    ])
+    from calibre.ebooks.metadata.sources.test import (
+        test_identify_plugin, title_test, authors_test
+    )
+    test_identify_plugin(
+        GoogleBooks.name, [
+            ({
+                'identifiers': {
+                    'isbn': '0743273567'
+                },
+                'title': 'Great Gatsby',
+                'authors': ['Fitzgerald']
+            }, [
+                title_test('The great gatsby', exact=True),
+                authors_test(['F. Scott Fitzgerald'])
+            ]),
+            ({
+                'title': 'Flatland',
+                'authors': ['Abbott']
+            }, [title_test('Flatland', exact=False)]),
+        ]
+    )
 # }}}
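
For context: the reformatted create_query() above still builds the same Google Books feed URL; only the layout of the code changed. The URL it returns for the ISBN used in the first test can be previewed outside calibre. This is a minimal sketch, not the plugin itself; the value of BASE_URL is an assumption here, since the hunks use that name but never show its definition.

# Standalone sketch of the URL create_query() returns for an ISBN lookup.
try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode  # Python 2, matching the file's shebang

BASE_URL = 'https://books.google.com/books/feeds/volumes?'  # assumed value
q = 'isbn:' + '0743273567'  # the ISBN from the 'Great Gatsby' test above
print(BASE_URL + urlencode({
    'q': q,
    'max-results': 20,
    'start-index': 1,
    'min-viewability': 'none',
}))
# Prints something like:
# https://books.google.com/books/feeds/volumes?q=isbn%3A0743273567&max-results=20&start-index=1&min-viewability=none
# (parameter order may vary on Python 2, where dicts are unordered)

The tests at the bottom of the file can be run with the command quoted in the diff itself: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py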