From 4d20351e8b583e883cdfa4695c987ed70fc7d6bc Mon Sep 17 00:00:00 2001 From: Sengian Date: Wed, 8 Dec 2010 06:47:25 +0100 Subject: [PATCH] Add threading to nicebooks.py --- src/calibre/ebooks/metadata/nicebooks.py | 142 ++++++++++++++--------- 1 file changed, 85 insertions(+), 57 deletions(-) diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 8914e2d985..7beececd7e 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -3,7 +3,8 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian ' __docformat__ = 'restructuredtext en' -import sys, textwrap, re, traceback, socket +import sys, textwrap, re, traceback, socket, threading +from Queue import Queue from urllib import urlencode from math import ceil from copy import deepcopy @@ -23,7 +24,7 @@ from calibre.utils.config import OptionParser class NiceBooks(MetadataSource): name = 'Nicebooks' - description = _('Downloads metadata from french Nicebooks') + description = _('Downloads metadata from French Nicebooks') supported_platforms = ['windows', 'osx', 'linux'] author = 'Sengian' version = (1, 0, 0) @@ -78,10 +79,50 @@ class NiceBooksError(Exception): class ISBNNotFound(NiceBooksError): pass +class BrowserThread(threading.Thread): + + def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): + self.url = url + self.ex = ex + self.name = name + self.verbose = verbose + self.timeout = timeout + self.result = None + threading.Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + try: + raw = browser().open_novisit(self.url, timeout=self.timeout).read() + except Exception, e: + report(self.verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + self.result = None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise self.ex(_('%s timed out. Try again later.') % self.name) + raise self.ex(_('%s encountered an error.') % self.name) + if '404 - ' in raw: + report(self.verbose) + self.result = None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + self.result = soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + self.result = soupparser.fromstring(clean_ascii_chars(raw)) + except: + self.result = None + def report(verbose): if verbose: traceback.print_exc() + class Query(object): BASE_URL = 'http://fr.nicebooks.com/' @@ -224,68 +265,53 @@ class ResultList(list): report(verbose) return mi - def fill_MI(self, entry, title, authors, verbose): + def fill_MI(self, data, verbose): + '''create and return an mi if possible, None otherwise''' + try: + entry = data.xpath("//div[@id='container']/div[@id='book-info']")[0] + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return None mi = MetaInformation(title, authors) mi.author_sort = authors_to_sort_string(authors) mi.comments = self.get_description(entry, verbose) return self.get_book_info(entry, mi, verbose) - def get_individual_metadata(self, browser, linkdata, verbose): - try: - raw = browser.open_novisit(self.BASE_URL + linkdata).read() - except Exception, e: - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise NiceBooksError(_('Nicebooks timed out. Try again later.')) - raise NiceBooksError(_('Nicebooks encountered an error.')) - if '<title>404 - ' in raw: - report(verbose) - return - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - feed = soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - feed = soupparser.fromstring(clean_ascii_chars(raw)) - except: - return None + def producer(self, q, data, verbose=False): + for x in data: + thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=NiceBooksError, + name='Nicebooks') + thread.start() + q.put(thread, True) - # get results - return feed.xpath("//div[@id='container']")[0] + def consumer(self, q, total_entries, verbose=False): + while len(self) < total_entries: + thread = q.get(True) + thread.join() + mi, order = thread.get_result() + if mi is None: + self.append(None) + self.append(self.fill_MI(mi, verbose)) - def populate(self, entries, browser, verbose=False): - #single entry + def populate(self, entries, verbose=False, brcall=3): if len(entries) == 1 and not isinstance(entries[0], str): - try: - entry = entries[0].xpath("//div[@id='container']")[0] - entry = entry.find("div[@id='book-info']") - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - return - self.append(self.fill_MI(entry, title, authors, verbose)) + #single entry + mi = self.fill_MI(entries[0], verbose) + if mi: + self.append(mi) else: - #multiple entries - for x in entries: - try: - entry = self.get_individual_metadata(browser, x, verbose) - entry = entry.find("div[@id='book-info']") - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - continue - self.append(self.fill_MI(entry, title, authors, verbose)) + #multiple entries + q = Queue(brcall) + prod_thread = threading.Thread(target=self.producer, args=(q, entries, verbose)) + cons_thread = threading.Thread(target=self.consumer, args=(q, len(entries), verbose)) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() class Covers(object): @@ -328,14 +354,14 @@ def search(title=None, author=None, publisher=None, isbn=None, max_results=5, verbose=False, keywords=None): br = browser() entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, - keywords=keywords, max_results=max_results)(br, verbose,timeout = 10.) + keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.) if entries is None or len(entries) == 0: return None #List of entry ans = ResultList() - ans.populate(entries, br, verbose) + ans.populate(entries, verbose) return ans def check_for_cover(isbn): @@ -409,3 +435,5 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) + +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\nicebooks.py" -m 5 -a mankel >data.html \ No newline at end of file