mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Remove threading from the Google metadata download plugin, as Google throttles requests
This commit is contained in:
parent
5fe546c70c
commit
1976ca3663
@ -85,7 +85,8 @@ class Source(Plugin):
|
|||||||
|
|
||||||
# Metadata API {{{
|
# Metadata API {{{
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
|
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||||
|
identifiers={}, timeout=5):
|
||||||
'''
|
'''
|
||||||
Identify a book by its title/author/isbn/etc.
|
Identify a book by its title/author/isbn/etc.
|
||||||
|
|
||||||
@ -98,6 +99,8 @@ class Source(Plugin):
|
|||||||
:param authors: A list of authors of the book, can be None
|
:param authors: A list of authors of the book, can be None
|
||||||
:param identifiers: A dictionary of other identifiers, most commonly
|
:param identifiers: A dictionary of other identifiers, most commonly
|
||||||
{'isbn':'1234...'}
|
{'isbn':'1234...'}
|
||||||
|
:param timeout: Timeout in seconds; no network request should hang for
|
||||||
|
longer than this.
|
||||||
:return: None if no errors occurred, otherwise a unicode representation
|
:return: None if no errors occurred, otherwise a unicode representation
|
||||||
of the error suitable for showing to the user
|
of the error suitable for showing to the user
|
||||||
|
|
||||||
|
@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import time
|
import time
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from threading import Thread
|
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
@ -18,6 +17,7 @@ from calibre.ebooks.metadata.sources.base import Source
|
|||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.utils.date import parse_date, utcnow
|
from calibre.utils.date import parse_date, utcnow
|
||||||
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from calibre import browser, as_unicode
|
from calibre import browser, as_unicode
|
||||||
|
|
||||||
NAMESPACES = {
|
NAMESPACES = {
|
||||||
@ -41,20 +41,20 @@ subject = XPath('descendant::dc:subject')
|
|||||||
description = XPath('descendant::dc:description')
|
description = XPath('descendant::dc:description')
|
||||||
language = XPath('descendant::dc:language')
|
language = XPath('descendant::dc:language')
|
||||||
|
|
||||||
def get_details(browser, url):
|
def get_details(browser, url, timeout):
|
||||||
try:
|
try:
|
||||||
raw = browser.open_novisit(url).read()
|
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
gc = getattr(e, 'getcode', lambda : -1)
|
gc = getattr(e, 'getcode', lambda : -1)
|
||||||
if gc() != 403:
|
if gc() != 403:
|
||||||
raise
|
raise
|
||||||
# Google is throttling us, wait a little
|
# Google is throttling us, wait a little
|
||||||
time.sleep(2)
|
time.sleep(1)
|
||||||
raw = browser.open_novisit(url).read()
|
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||||
|
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
def to_metadata(browser, log, entry_):
|
def to_metadata(browser, log, entry_, timeout):
|
||||||
|
|
||||||
def get_text(extra, x):
|
def get_text(extra, x):
|
||||||
try:
|
try:
|
||||||
@ -79,8 +79,9 @@ def to_metadata(browser, log, entry_):
|
|||||||
|
|
||||||
mi = Metadata(title_, authors)
|
mi = Metadata(title_, authors)
|
||||||
try:
|
try:
|
||||||
raw = get_details(browser, id_url)
|
raw = get_details(browser, id_url, timeout)
|
||||||
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0])
|
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||||
|
strip_encoding_pats=True)[0])
|
||||||
extra = entry(feed)[0]
|
extra = entry(feed)[0]
|
||||||
except:
|
except:
|
||||||
log.exception('Failed to get additional details for', mi.title)
|
log.exception('Failed to get additional details for', mi.title)
|
||||||
@ -131,26 +132,19 @@ def to_metadata(browser, log, entry_):
|
|||||||
|
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
class Worker(Thread):
|
|
||||||
|
|
||||||
def __init__(self, log, entries, abort, result_queue):
|
def get_all_details(br, log, entries, abort, result_queue, timeout):
|
||||||
self.browser, self.log, self.entries = browser(), log, entries
|
for i in entries:
|
||||||
self.abort, self.result_queue = abort, result_queue
|
try:
|
||||||
Thread.__init__(self)
|
ans = to_metadata(br, log, i, timeout)
|
||||||
self.daemon = True
|
if isinstance(ans, Metadata):
|
||||||
|
result_queue.put(ans)
|
||||||
def run(self):
|
except:
|
||||||
for i in self.entries:
|
log.exception(
|
||||||
try:
|
'Failed to get metadata for identify entry:',
|
||||||
ans = to_metadata(self.browser, self.log, i)
|
etree.tostring(i))
|
||||||
if isinstance(ans, Metadata):
|
if abort.is_set():
|
||||||
self.result_queue.put(ans)
|
break
|
||||||
except:
|
|
||||||
self.log.exception(
|
|
||||||
'Failed to get metadata for identify entry:',
|
|
||||||
etree.tostring(i))
|
|
||||||
if self.abort.is_set():
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
class GoogleBooks(Source):
|
class GoogleBooks(Source):
|
||||||
@ -192,54 +186,40 @@ class GoogleBooks(Source):
|
|||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
|
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||||
|
identifiers={}, timeout=5):
|
||||||
query = self.create_query(log, title=title, authors=authors,
|
query = self.create_query(log, title=title, authors=authors,
|
||||||
identifiers=identifiers)
|
identifiers=identifiers)
|
||||||
|
br = browser()
|
||||||
try:
|
try:
|
||||||
raw = browser().open_novisit(query).read()
|
raw = br.open_novisit(query, timeout=timeout).read()
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
log.exception('Failed to make identify query: %r'%query)
|
log.exception('Failed to make identify query: %r'%query)
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
parser = etree.XMLParser(recover=True, no_network=True)
|
||||||
feed = etree.fromstring(xml_to_unicode(raw,
|
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||||
strip_encoding_pats=True)[0], parser=parser)
|
strip_encoding_pats=True)[0], parser=parser)
|
||||||
entries = entry(feed)
|
entries = entry(feed)
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
log.exception('Failed to parse identify results')
|
log.exception('Failed to parse identify results')
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
|
# There is no point running these queries in threads, as Google
|
||||||
groups = self.split_jobs(entries, 5) # At most 5 threads
|
# throttles requests, returning Forbidden (403) errors
|
||||||
if not groups:
|
get_all_details(br, log, entries, abort, result_queue, timeout)
|
||||||
return None
|
|
||||||
workers = [Worker(log, entries, abort, result_queue) for entries in
|
|
||||||
groups]
|
|
||||||
|
|
||||||
if abort.is_set():
|
|
||||||
return None
|
|
||||||
|
|
||||||
for worker in workers: worker.start()
|
|
||||||
|
|
||||||
has_alive_worker = True
|
|
||||||
while has_alive_worker and not abort.is_set():
|
|
||||||
time.sleep(0.1)
|
|
||||||
has_alive_worker = False
|
|
||||||
for worker in workers:
|
|
||||||
if worker.is_alive():
|
|
||||||
has_alive_worker = True
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
|
# To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
|
||||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||||
isbn_test)
|
title_test)
|
||||||
test_identify_plugin(GoogleBooks.name,
|
test_identify_plugin(GoogleBooks.name,
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
{'title': 'Great Expectations', 'authors':['Charles Dickens']},
|
{'title': 'Great Expectations', 'authors':['Charles Dickens']},
|
||||||
[isbn_test('9781607541592')]
|
[title_test('Great Expectations', exact=True)]
|
||||||
),
|
),
|
||||||
])
|
])
|
||||||
|
@ -26,6 +26,17 @@ def isbn_test(isbn):
|
|||||||
|
|
||||||
return test
|
return test
|
||||||
|
|
||||||
|
def title_test(title, exact=False):
|
||||||
|
|
||||||
|
title = title.lower()
|
||||||
|
|
||||||
|
def test(mi):
|
||||||
|
mt = mi.title.lower()
|
||||||
|
return (exact and mt == title) or \
|
||||||
|
(not exact and title in mt)
|
||||||
|
|
||||||
|
return test
|
||||||
|
|
||||||
def test_identify_plugin(name, tests):
|
def test_identify_plugin(name, tests):
|
||||||
'''
|
'''
|
||||||
:param name: Plugin name
|
:param name: Plugin name
|
||||||
|
Loading…
x
Reference in New Issue
Block a user