remove threading from google metadata download plugin as google throttles requests

This commit is contained in:
Kovid Goyal 2011-02-20 12:47:31 -07:00
parent 5fe546c70c
commit 1976ca3663
3 changed files with 46 additions and 52 deletions

View File

@ -85,7 +85,8 @@ class Source(Plugin):
# Metadata API {{{ # Metadata API {{{
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}): def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=5):
''' '''
Identify a book by its title/author/isbn/etc. Identify a book by its title/author/isbn/etc.
@ -98,6 +99,8 @@ class Source(Plugin):
:param authors: A list of authors of the book, can be None :param authors: A list of authors of the book, can be None
:param identifiers: A dictionary of other identifiers, most commonly :param identifiers: A dictionary of other identifiers, most commonly
{'isbn':'1234...'} {'isbn':'1234...'}
:param timeout: Timeout in seconds, no network request should hang for
longer than timeout.
:return: None if no errors occurred, otherwise a unicode representation :return: None if no errors occurred, otherwise a unicode representation
of the error suitable for showing to the user of the error suitable for showing to the user

View File

@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import time import time
from urllib import urlencode from urllib import urlencode
from functools import partial from functools import partial
from threading import Thread
from lxml import etree from lxml import etree
@ -18,6 +17,7 @@ from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars
from calibre import browser, as_unicode from calibre import browser, as_unicode
NAMESPACES = { NAMESPACES = {
@ -41,20 +41,20 @@ subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description') description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language') language = XPath('descendant::dc:language')
def get_details(browser, url, timeout):
    '''
    Download the raw response body for ``url``.

    Google throttles aggressive clients by answering with HTTP 403
    (Forbidden).  On a 403 this waits briefly and retries exactly once;
    any other error propagates to the caller.

    :param browser: a browser object providing ``open_novisit(url, timeout=...)``
    :param url: the URL to fetch
    :param timeout: per-request network timeout in seconds
    :return: the raw bytes/string returned by the server
    '''
    try:
        raw = browser.open_novisit(url, timeout=timeout).read()
    except Exception as e:
        # Only a 403 is interpreted as throttling; anything else is fatal.
        gc = getattr(e, 'getcode', lambda : -1)
        if gc() != 403:
            raise
        # Google is throttling us, wait a little
        time.sleep(1)
        raw = browser.open_novisit(url, timeout=timeout).read()
    return raw
def to_metadata(browser, log, entry_): def to_metadata(browser, log, entry_, timeout):
def get_text(extra, x): def get_text(extra, x):
try: try:
@ -79,8 +79,9 @@ def to_metadata(browser, log, entry_):
mi = Metadata(title_, authors) mi = Metadata(title_, authors)
try: try:
raw = get_details(browser, id_url) raw = get_details(browser, id_url, timeout)
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True)[0]) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
strip_encoding_pats=True)[0])
extra = entry(feed)[0] extra = entry(feed)[0]
except: except:
log.exception('Failed to get additional details for', mi.title) log.exception('Failed to get additional details for', mi.title)
@ -131,26 +132,19 @@ def to_metadata(browser, log, entry_):
return mi return mi
def get_all_details(br, log, entries, abort, result_queue, timeout):
    '''
    Fetch full metadata for every identify entry, serially.

    This replaces the old ``Worker`` thread pool: Google throttles
    parallel requests with Forbidden errors, so running these queries
    in threads buys nothing.

    :param br: shared browser used for all detail requests
    :param log: logger with an ``exception`` method
    :param entries: parsed feed entry elements to resolve
    :param abort: threading.Event-like object; when set, stop early
    :param result_queue: queue receiving successful Metadata objects
    :param timeout: per-request network timeout in seconds, passed on
                    to :func:`to_metadata`
    '''
    for i in entries:
        try:
            ans = to_metadata(br, log, i, timeout)
            if isinstance(ans, Metadata):
                result_queue.put(ans)
        except:
            # Best effort: a single bad entry must not kill the rest.
            log.exception(
                'Failed to get metadata for identify entry:',
                etree.tostring(i))
        if abort.is_set():
            break
class GoogleBooks(Source): class GoogleBooks(Source):
@ -192,54 +186,40 @@ class GoogleBooks(Source):
}) })
def identify(self, log, result_queue, abort, title=None, authors=None,
        identifiers={}, timeout=5):
    '''
    Identify a book via the Google Books feed and put results on
    ``result_queue``.

    :param log: logger with ``exception`` method
    :param result_queue: queue receiving Metadata objects
    :param abort: threading.Event-like object checked between entries
    :param title: book title, may be None
    :param authors: list of author names, may be None
    :param identifiers: dict of identifiers, e.g. ``{'isbn': '...'}``
    :param timeout: timeout in seconds; no network request should hang
                    longer than this
    :return: None on success, otherwise a unicode error description
             suitable for showing to the user
    '''
    query = self.create_query(log, title=title, authors=authors,
            identifiers=identifiers)
    br = browser()
    try:
        raw = br.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r'%query)
        return as_unicode(e)

    try:
        # recover=True: Google occasionally returns slightly broken XML;
        # clean_ascii_chars strips control characters that break lxml.
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0], parser=parser)
        entries = entry(feed)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)

    # There is no point running these queries in threads as google
    # throttles requests returning Forbidden errors
    get_all_details(br, log, entries, abort, result_queue, timeout)

    return None
if __name__ == '__main__':
    # Manual smoke test for the plugin; run with:
    #   calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
    # Uses title_test (not isbn_test) because the throttled, serial
    # lookups may return editions with differing ISBNs.
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test)
    test_identify_plugin(GoogleBooks.name,
        [
            (
                {'title': 'Great Expectations',
                    'authors': ['Charles Dickens']},
                [title_test('Great Expectations', exact=True)]
            ),
    ])

View File

@ -26,6 +26,17 @@ def isbn_test(isbn):
return test return test
def title_test(title, exact=False):
    '''
    Build a test callable that checks a metadata object's title.

    :param title: the expected title; comparison is case-insensitive
    :param exact: when True require an exact (lowercased) match,
                  otherwise a substring match suffices
    :return: a function taking a metadata object ``mi`` and returning
             True when ``mi.title`` matches
    '''
    title = title.lower()

    def test(mi):
        mt = mi.title.lower()
        return (exact and mt == title) or \
                (not exact and title in mt)

    return test
def test_identify_plugin(name, tests): def test_identify_plugin(name, tests):
''' '''
:param name: Plugin name :param name: Plugin name