From f766eb871c54fa249f2c0e6b71067ee9517b5de8 Mon Sep 17 00:00:00 2001
From: Sengian
Date: Wed, 8 Dec 2010 22:50:57 +0100
Subject: [PATCH] Add threading to Amazon (still lagging like hell)

---
 src/calibre/ebooks/metadata/amazonfr.py    | 151 +++++++++++++++------
 src/calibre/ebooks/metadata/fictionwise.py |  21 +-
 src/calibre/ebooks/metadata/nicebooks.py   |   8 +-
 3 files changed, 128 insertions(+), 52 deletions(-)

diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py
index 156fff3d75..6d8c2e407c 100644
--- a/src/calibre/ebooks/metadata/amazonfr.py
+++ b/src/calibre/ebooks/metadata/amazonfr.py
@@ -3,11 +3,12 @@ __license__ = 'GPL 3'
 __copyright__ = '2010, sengian '
 
 import sys, textwrap, re, traceback
+from threading import Thread
+from Queue import Queue
 from urllib import urlencode
 from math import ceil
 
-from lxml import html
-from lxml.html import soupparser
+from lxml.html import soupparser, tostring
 
 from calibre.utils.date import parse_date, utcnow, replace_months
 from calibre.utils.cleantext import clean_ascii_chars
@@ -116,6 +117,48 @@ def report(verbose):
     if verbose:
         traceback.print_exc()
 
+class AmazonError(Exception):
+    pass
+
+class BrowserThread(Thread):
+
+    def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'):
+        self.url = url
+        self.ex = ex
+        self.plugname = name
+        self.verbose = verbose
+        self.timeout = timeout
+        self.result = None
+        Thread.__init__(self)
+
+    def get_result(self):
+        return self.result
+
+    def run(self):
+        try:
+            raw = browser().open_novisit(self.url, timeout=self.timeout).read()
+        except Exception, e:
+            report(self.verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                self.result = None
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise self.ex(_('%s timed out. Try again later.') % self.plugname)
+            raise self.ex(_('%s encountered an error.') % self.plugname)
+        if '<title>404 - ' in raw:
+            report(self.verbose)
+            self.result = None
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            resolve_entities=True)[0]
+        try:
+            self.result = soupparser.fromstring(raw)
+        except:
+            try:
+                #remove ASCII invalid chars
+                self.result = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                self.result = None
+
 
 class Query(object):
 
@@ -189,7 +232,7 @@ class Query(object):
 
     def __call__(self, browser, verbose, timeout = 5.):
         if verbose:
-            print 'Query:', self.urldata
+            print _('Query: %s') % self.urldata
 
         try:
             raw = browser.open_novisit(self.urldata, timeout=timeout).read()
@@ -197,10 +240,12 @@
             report(verbose)
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
-                return
-            raise
+                return None, self.urldata
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise AmazonError(_('Amazon timed out. Try again later.'))
+            raise AmazonError(_('Amazon encountered an error.'))
         if '<title>404 - ' in raw:
-            return
+            return None, self.urldata
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
             resolve_entities=True)[0]
 
@@ -315,7 +360,7 @@ class ResultList(list):
         inv_class = ('seeAll', 'emptyClear')
         inv_tags ={'img': True, 'a': False}
         self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
-        description = html.tostring(description, method='html', encoding=unicode).strip()
+        description = tostring(description, method='html', encoding=unicode).strip()
         # remove all attributes from tags
         description = self.reattr.sub(r'<\1>', description)
         # Remove the notice about text referring to out of print editions
@@ -327,7 +372,7 @@ class ResultList(list):
             report(verbose)
             return None
 
-    def get_tags(self, entry, browser, verbose):
+    def get_tags(self, entry, verbose):
         try:
             tags = entry.get_element_by_id('tagContentHolder')
             testptag = tags.find_class('see-all')
@@ -338,7 +383,7 @@ class ResultList(list):
                 if alink[0].get('class') == 'tgJsActive':
                     continue
                 link = self.baseurl + alink[0].get('href')
-                entry = self.get_individual_metadata(browser, link, verbose)
+                entry = self.get_individual_metadata(link, verbose)
                 tags = entry.get_element_by_id('tagContentHolder')
                 break
             tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
@@ -402,26 +447,41 @@ class ResultList(list):
             mi.rating = float(ratings[0])/float(ratings[1]) * 5
         return mi
 
-    def fill_MI(self, entry, title, authors, browser, verbose):
+    def fill_MI(self, entry, verbose):
+        try:
+            title = self.get_title(entry)
+            authors = self.get_authors(entry)
+        except Exception, e:
+            if verbose:
+                print _('Failed to get all details for an entry')
+                print e
+                print _('Could not get title or authors for this entry')
+            report(verbose)
+            return None
         mi = MetaInformation(title, authors)
         mi.author_sort = authors_to_sort_string(authors)
-        mi.comments = self.get_description(entry, verbose)
-        mi = self.get_book_info(entry, mi, verbose)
-        mi.tags = self.get_tags(entry, browser, verbose)
+        try:
+            mi.comments = self.get_description(entry, verbose)
+            mi = self.get_book_info(entry, mi, verbose)
+            mi.tags = self.get_tags(entry, verbose)
+        except:
+            pass
         return mi
 
-    def get_individual_metadata(self, browser, linkdata, verbose):
+    def get_individual_metadata(self, url, verbose):
         try:
-            raw = browser.open_novisit(linkdata).read()
+            raw = browser().open_novisit(url).read()
         except Exception, e:
             report(verbose)
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
-                return
-            raise
+                return None
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise AmazonError(_('Amazon timed out. Try again later.'))
+            raise AmazonError(_('Amazon encountered an error.'))
         if '<title>404 - ' in raw:
             report(verbose)
-            return
+            return None
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
             resolve_entities=True)[0]
         try:
@@ -432,27 +492,34 @@ class ResultList(list):
                 return soupparser.fromstring(clean_ascii_chars(raw))
             except:
                 report(verbose)
-                return
+                return None
 
-    def populate(self, entries, browser, verbose=False):
-        for x in entries:
-            try:
-                entry = self.get_individual_metadata(browser, x, verbose)
-                # clean results
-                # inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop')
-                # inv_class = ('buyingDetailsGrid', 'productImageGrid')
-                # inv_tags ={'script': True, 'style': True, 'form': False}
-                # self.clean_entry(entry, invalid_id=inv_ids)
-                title = self.get_title(entry)
-                authors = self.get_authors(entry)
-            except Exception, e:
-                if verbose:
-                    print 'Failed to get all details for an entry'
-                    print e
-                    print 'URL who failed:', x
-                report(verbose)
-                continue
-            self.append(self.fill_MI(entry, title, authors, browser, verbose))
+    def producer(self, q, data, verbose=False):
+        for x in data:
+            thread = BrowserThread(x, verbose=verbose, ex=AmazonError,
+                name='Amazon')
+            thread.start()
+            q.put(thread, True)
+
+    def consumer(self, q, total_entries, verbose=False):
+        while len(self) < total_entries:
+            thread = q.get(True)
+            thread.join()
+            mi = thread.get_result()
+            if mi is None:
+                self.append(None)
+            else:
+                self.append(self.fill_MI(mi, verbose))
+
+    def populate(self, entries, verbose=False, brcall=5):
+        #multiple entries
+        q = Queue(brcall)
+        prod_thread = Thread(target=self.producer, args=(q, entries, verbose))
+        cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose))
+        prod_thread.start()
+        cons_thread.start()
+        prod_thread.join()
+        cons_thread.join()
 
 
 def search(title=None, author=None, publisher=None, isbn=None,
@@ -466,8 +533,8 @@ def search(title=None, author=None, publisher=None, isbn=None,
 
     #List of entry
     ans = ResultList(baseurl, lang)
-    ans.populate(entries, br, verbose)
-    return ans
+    ans.populate(entries, verbose)
+    return [x for x in ans if x is not None]
 
 def option_parser():
     parser = OptionParser(textwrap.dedent(\
@@ -506,7 +573,7 @@ def main(args=sys.argv):
         parser.print_help()
         return 1
     if results is None or len(results) == 0:
-        print 'No result found for this search!'
+        print _('No result found for this search!')
         return 0
     for result in results:
         print unicode(result).encode(preferred_encoding, 'replace')
@@ -514,3 +581,5 @@
 
 if __name__ == '__main__':
     sys.exit(main())
+
+# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html
\ No newline at end of file
diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py
index a06516c7dc..892e286810 100644
--- a/src/calibre/ebooks/metadata/fictionwise.py
+++ b/src/calibre/ebooks/metadata/fictionwise.py
@@ -80,11 +80,11 @@ class BrowserThread(Thread):
             except:
                 self.result = None
 
-
 def report(verbose):
     if verbose:
         traceback.print_exc()
 
+
 class Query(object):
 
     BASE_URL = 'http://www.fictionwise.com/servlet/mw'
@@ -322,15 +322,18 @@ class ResultList(list):
                 print e
             return None
         mi = MetaInformation(title, authors)
-        ratings = entry.xpath("./p/table")
-        if len(ratings) >= 2:
-            mi.rating = self.get_rating(ratings[1], verbose)
-        mi.comments = self.get_description(entry)
-        mi.publisher = self.get_publisher(entry)
-        mi.tags = self.get_tags(entry)
-        mi.pubdate = self.get_date(entry, verbose)
-        mi.isbn = self.get_ISBN(entry)
         mi.author_sort = authors_to_sort_string(authors)
+        try:
+            ratings = entry.xpath("./p/table")
+            if len(ratings) >= 2:
+                mi.rating = self.get_rating(ratings[1], verbose)
+            mi.comments = self.get_description(entry)
+            mi.publisher = self.get_publisher(entry)
+            mi.tags = self.get_tags(entry)
+            mi.pubdate = self.get_date(entry, verbose)
+            mi.isbn = self.get_ISBN(entry)
+        except:
+            pass
         return mi
 
     def producer(self, q, data, verbose=False):
diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py
index 5bd360ed6c..8911b31c08 100644
--- a/src/calibre/ebooks/metadata/nicebooks.py
+++ b/src/calibre/ebooks/metadata/nicebooks.py
@@ -279,8 +279,12 @@ class ResultList(list):
             return None
         mi = MetaInformation(title, authors)
         mi.author_sort = authors_to_sort_string(authors)
-        mi.comments = self.get_description(entry, verbose)
-        return self.get_book_info(entry, mi, verbose)
+        try:
+            mi.comments = self.get_description(entry, verbose)
+            mi = self.get_book_info(entry, mi, verbose)
+        except:
+            pass
+        return mi
 
     def producer(self, q, data, verbose=False):
         for x in data:
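
The patch replaces the serial ResultList.populate() loop with a bounded producer/consumer pair: populate() creates Queue(brcall), the producer starts one BrowserThread per result URL and blocks on q.put() once brcall threads are already queued, and the consumer q.get()s the threads in submission order, join()s each one and appends the MetaInformation built from the parsed page (or None when the fetch failed). The sketch below shows the same pattern in isolation; FetchThread, fetch_all and the dummy run() body are illustrative stand-ins, not code from the patch.

from threading import Thread
from Queue import Queue

class FetchThread(Thread):
    # Stand-in for BrowserThread: fetches and parses one URL in run().
    def __init__(self, url):
        Thread.__init__(self)
        self.url = url
        self.result = None

    def run(self):
        # Placeholder for browser().open_novisit(url).read() + soupparser parsing.
        self.result = 'parsed page for %s' % self.url

def producer(q, urls):
    for url in urls:
        t = FetchThread(url)
        t.start()
        q.put(t, True)      # blocks while brcall threads are already queued

def consumer(q, total, results):
    while len(results) < total:
        t = q.get(True)     # threads come back in submission (FIFO) order
        t.join()            # wait for that fetch to finish
        results.append(t.result)

def fetch_all(urls, brcall=5):
    # Mirrors the new ResultList.populate(): the bounded queue caps the
    # number of fetch threads in flight at roughly brcall.
    q = Queue(brcall)
    results = []
    prod = Thread(target=producer, args=(q, urls))
    cons = Thread(target=consumer, args=(q, len(urls), results))
    prod.start()
    cons.start()
    prod.join()
    cons.join()
    return results

Because the consumer joins threads in FIFO order, the collected results keep the order of the input entries, which is what lets the reworked search() simply drop the None placeholders at the end.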