Add threading to Amazon (still lagging like hell)

Sengian 2010-12-08 22:50:57 +01:00
parent bb2d2a6641
commit f766eb871c
3 changed files with 128 additions and 52 deletions

src/calibre/ebooks/metadata/amazonfr.py (View File)

@@ -3,11 +3,12 @@ __license__ = 'GPL 3'
 __copyright__ = '2010, sengian <sengian1@gmail.com>'
 
 import sys, textwrap, re, traceback
+from threading import Thread
+from Queue import Queue
 from urllib import urlencode
 from math import ceil
 
-from lxml import html
-from lxml.html import soupparser
+from lxml.html import soupparser, tostring
 
 from calibre.utils.date import parse_date, utcnow, replace_months
 from calibre.utils.cleantext import clean_ascii_chars
@@ -116,6 +117,48 @@ def report(verbose):
     if verbose:
         traceback.print_exc()
 
+class AmazonError(Exception):
+    pass
+
+class BrowserThread(Thread):
+
+    def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'):
+        self.url = url
+        self.ex = ex
+        self.plugname = name
+        self.verbose = verbose
+        self.timeout = timeout
+        self.result = None
+        Thread.__init__(self)
+
+    def get_result(self):
+        return self.result
+
+    def run(self):
+        try:
+            raw = browser().open_novisit(self.url, timeout=self.timeout).read()
+        except Exception, e:
+            report(self.verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                self.result = None
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise self.ex(_('%s timed out. Try again later.') % self.plugname)
+            raise self.ex(_('%s encountered an error.') % self.plugname)
+
+        if '<title>404 - ' in raw:
+            report(self.verbose)
+            self.result = None
+
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            self.result = soupparser.fromstring(raw)
+        except:
+            try:
+                #remove ASCII invalid chars
+                self.result = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                self.result = None
 
 class Query(object):
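
BrowserThread above is the classic "worker thread with a result slot" pattern: run() stores its outcome in self.result and the caller reads it back after join(). Note that the raise self.ex(...) calls fire inside the worker thread, so they never propagate to the code that started the thread; they only surface as a traceback on stderr. A minimal, self-contained sketch of the same pattern (urllib2 stands in for calibre's browser(); the URLs are placeholders):

    import urllib2
    from threading import Thread

    class FetchThread(Thread):
        # fetch one URL in the background, stash the payload in self.result
        def __init__(self, url, timeout=10.):
            Thread.__init__(self)
            self.url = url
            self.timeout = timeout
            self.result = None

        def run(self):
            try:
                self.result = urllib2.urlopen(self.url, timeout=self.timeout).read()
            except Exception:
                self.result = None   # caller treats None as "fetch failed"

    threads = [FetchThread(u) for u in ('http://example.com/a', 'http://example.com/b')]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                     # block until that fetch is done
        print t.result is not None

This naive start-all-then-join form launches every fetch at once; the producer/consumer code later in this diff bounds the number in flight with a Queue.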
@@ -189,7 +232,7 @@ class Query(object):
 
     def __call__(self, browser, verbose, timeout = 5.):
         if verbose:
-            print 'Query:', self.urldata
+            print _('Query: %s') % self.urldata
 
         try:
             raw = browser.open_novisit(self.urldata, timeout=timeout).read()
@@ -197,10 +240,12 @@ class Query(object):
             report(verbose)
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
-                return
-            raise
+                return None, self.urldata
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise AmazonError(_('Amazon timed out. Try again later.'))
+            raise AmazonError(_('Amazon encountered an error.'))
 
         if '<title>404 - ' in raw:
-            return
+            return None, self.urldata
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
                 resolve_entities=True)[0]
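
The widened except branch now distinguishes three outcomes instead of two: a 404 means "no result" and returns (None, self.urldata), a timeout gets its own message, and anything else becomes a generic AmazonError. It leans on two urllib2/mechanize conventions: HTTPError-like objects expose getcode(), and a timed-out connection arrives as an error whose args[0] is a socket.timeout instance. A sketch of the same classification under those assumptions (with a guard for an empty args tuple, which the diff's getattr(e, 'args', [None])[0] would not survive):

    import socket, urllib2

    def classify_network_error(e):
        # HTTPError-like objects expose the HTTP status via getcode()
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            return 'not-found'
        # urllib2.URLError wraps the low-level cause in args[0]
        args = getattr(e, 'args', None)
        if args and isinstance(args[0], socket.timeout):
            return 'timeout'
        return 'other'

    try:
        urllib2.urlopen('http://example.invalid/', timeout=2.).read()
    except Exception, e:
        print classify_network_error(e)   # -> 'other' (DNS failure, not a timeout)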
@@ -315,7 +360,7 @@ class ResultList(list):
             inv_class = ('seeAll', 'emptyClear')
             inv_tags ={'img': True, 'a': False}
             self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
-            description = html.tostring(description, method='html', encoding=unicode).strip()
+            description = tostring(description, method='html', encoding=unicode).strip()
             # remove all attributes from tags
             description = self.reattr.sub(r'<\1>', description)
             # Remove the notice about text referring to out of print editions
@@ -327,7 +372,7 @@ class ResultList(list):
             report(verbose)
             return None
 
-    def get_tags(self, entry, browser, verbose):
+    def get_tags(self, entry, verbose):
         try:
             tags = entry.get_element_by_id('tagContentHolder')
             testptag = tags.find_class('see-all')
@@ -338,7 +383,7 @@ class ResultList(list):
                 if alink[0].get('class') == 'tgJsActive':
                     continue
                 link = self.baseurl + alink[0].get('href')
-                entry = self.get_individual_metadata(browser, link, verbose)
+                entry = self.get_individual_metadata(link, verbose)
                 tags = entry.get_element_by_id('tagContentHolder')
                 break
         tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
@@ -402,26 +447,41 @@ class ResultList(list):
             mi.rating = float(ratings[0])/float(ratings[1]) * 5
         return mi
 
-    def fill_MI(self, entry, title, authors, browser, verbose):
+    def fill_MI(self, entry, verbose):
+        try:
+            title = self.get_title(entry)
+            authors = self.get_authors(entry)
+        except Exception, e:
+            if verbose:
+                print _('Failed to get all details for an entry')
+                print e
+                print _('URL who failed: %s') % x
+            report(verbose)
+            return None
         mi = MetaInformation(title, authors)
         mi.author_sort = authors_to_sort_string(authors)
-        mi.comments = self.get_description(entry, verbose)
-        mi = self.get_book_info(entry, mi, verbose)
-        mi.tags = self.get_tags(entry, browser, verbose)
+        try:
+            mi.comments = self.get_description(entry, verbose)
+            mi = self.get_book_info(entry, mi, verbose)
+            mi.tags = self.get_tags(entry, verbose)
+        except:
+            pass
         return mi
 
-    def get_individual_metadata(self, browser, linkdata, verbose):
+    def get_individual_metadata(self, url, verbose):
         try:
-            raw = browser.open_novisit(linkdata).read()
+            raw = browser().open_novisit(url).read()
         except Exception, e:
             report(verbose)
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
-                return
-            raise
+                return None
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise AmazonError(_('Amazon timed out. Try again later.'))
+            raise AmazonError(_('Amazon encountered an error.'))
         if '<title>404 - ' in raw:
             report(verbose)
-            return
+            return None
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
                 resolve_entities=True)[0]
         try:
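
fill_MI now applies a two-tier policy: title and authors are mandatory (any failure returns None, which search() filters out at the end), while comments, book info and tags are best-effort extras that may silently be skipped. One bug worth flagging: the verbose branch prints x, which is not bound inside fill_MI, so enabling verbose on a failing entry would itself raise a NameError. A runnable sketch of the intended policy, using a plain dict in place of MetaInformation and with the URL passed in explicitly:

    def build_record(entry, url, verbose=False):
        # mandatory fields: abort so the caller can drop this entry
        try:
            record = {'title': entry['title'], 'authors': entry['authors']}
        except (KeyError, TypeError):
            if verbose:
                print 'Failed to get all details for an entry'
                print 'URL that failed: %s' % url
            return None
        # optional fields: best effort, never fatal
        try:
            record['comments'] = entry['comments']
        except (KeyError, TypeError):
            pass
        return record

    print build_record({'title': 'T', 'authors': ['A']}, 'http://example.com/1')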
@@ -432,27 +492,34 @@ class ResultList(list):
             return soupparser.fromstring(clean_ascii_chars(raw))
         except:
             report(verbose)
-            return
+            return None
 
-    def populate(self, entries, browser, verbose=False):
-        for x in entries:
-            try:
-                entry = self.get_individual_metadata(browser, x, verbose)
-                # clean results
-                # inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop')
-                # inv_class = ('buyingDetailsGrid', 'productImageGrid')
-                # inv_tags ={'script': True, 'style': True, 'form': False}
-                # self.clean_entry(entry, invalid_id=inv_ids)
-                title = self.get_title(entry)
-                authors = self.get_authors(entry)
-            except Exception, e:
-                if verbose:
-                    print 'Failed to get all details for an entry'
-                    print e
-                    print 'URL who failed:', x
-                    report(verbose)
-                continue
-            self.append(self.fill_MI(entry, title, authors, browser, verbose))
+    def producer(self, q, data, verbose=False):
+        for x in data:
+            thread = BrowserThread(x, verbose=verbose, ex=AmazonError,
+                        name='Amazon')
+            thread.start()
+            q.put(thread, True)
+
+    def consumer(self, q, total_entries, verbose=False):
+        while len(self) < total_entries:
+            thread = q.get(True)
+            thread.join()
+            mi = thread.get_result()
+            if mi is None:
+                self.append(None)
+            else:
+                self.append(self.fill_MI(mi, verbose))
+
+    def populate(self, entries, verbose=False, brcall=5):
+        #multiple entries
+        q = Queue(brcall)
+        prod_thread = Thread(target=self.producer, args=(q, entries, verbose))
+        cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose))
+        prod_thread.start()
+        cons_thread.start()
+        prod_thread.join()
+        cons_thread.join()
 
 def search(title=None, author=None, publisher=None, isbn=None,
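
This producer/consumer pair is the heart of the commit. The producer starts one BrowserThread per URL and pushes it into a Queue bounded at brcall items, so q.put(thread, True) blocks once roughly brcall fetches are in flight; the consumer pops threads in FIFO order and joins each one, which keeps results aligned with the input order (failed fetches become None placeholders so len(self) still reaches total_entries). A self-contained skeleton of the same design, with a sleeping worker standing in for the Amazon fetch:

    import time
    from Queue import Queue
    from threading import Thread

    class SleepThread(Thread):
        # stand-in for BrowserThread: pretend to fetch, then record a result
        def __init__(self, n):
            Thread.__init__(self)
            self.n = n
            self.result = None
        def run(self):
            time.sleep(0.1)
            self.result = self.n * self.n

    def producer(q, data):
        for n in data:
            t = SleepThread(n)
            t.start()
            q.put(t, True)              # blocks while the queue is full

    def consumer(q, total, results):
        while len(results) < total:
            t = q.get(True)
            t.join()                    # wait for this fetch before the next
            results.append(t.result)

    def run_all(data, maxpar=5):
        q, results = Queue(maxpar), []
        prod = Thread(target=producer, args=(q, data))
        cons = Thread(target=consumer, args=(q, len(data), results))
        prod.start(); cons.start()
        prod.join(); cons.join()
        return results

    print run_all(range(10))            # -> [0, 1, 4, 9, ..., 81]

Because the consumer joins threads strictly in submission order, one slow fetch at the head of the queue stalls everything behind it, which may be part of the "still lagging" the commit message complains about.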
@@ -466,8 +533,8 @@ def search(title=None, author=None, publisher=None, isbn=None,
     #List of entry
     ans = ResultList(baseurl, lang)
-    ans.populate(entries, br, verbose)
-    return ans
+    ans.populate(entries, verbose)
+    return [x for x in ans if x is not None]
 
 def option_parser():
     parser = OptionParser(textwrap.dedent(\
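
As the hunk above shows, search() now strips the consumer's None placeholders before returning, so callers only ever see real MetaInformation objects:

    ans = ['mi1', None, 'mi2']                  # toy stand-ins for MetaInformation
    print [x for x in ans if x is not None]     # -> ['mi1', 'mi2']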
@@ -506,7 +573,7 @@ def main(args=sys.argv):
         parser.print_help()
         return 1
     if results is None or len(results) == 0:
-        print 'No result found for this search!'
+        print _('No result found for this search!')
         return 0
     for result in results:
         print unicode(result).encode(preferred_encoding, 'replace')
@@ -514,3 +581,5 @@ def main(args=sys.argv):
 
 if __name__ == '__main__':
     sys.exit(main())
+
+# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html

src/calibre/ebooks/metadata/fictionwise.py (View File)

@@ -80,11 +80,11 @@ class BrowserThread(Thread):
             except:
                 self.result = None
 
 def report(verbose):
     if verbose:
         traceback.print_exc()
 
 class Query(object):
     BASE_URL = 'http://www.fictionwise.com/servlet/mw'
@@ -322,15 +322,18 @@ class ResultList(list):
                 print e
             return None
         mi = MetaInformation(title, authors)
-        ratings = entry.xpath("./p/table")
-        if len(ratings) >= 2:
-            mi.rating = self.get_rating(ratings[1], verbose)
-        mi.comments = self.get_description(entry)
-        mi.publisher = self.get_publisher(entry)
-        mi.tags = self.get_tags(entry)
-        mi.pubdate = self.get_date(entry, verbose)
-        mi.isbn = self.get_ISBN(entry)
         mi.author_sort = authors_to_sort_string(authors)
+        try:
+            ratings = entry.xpath("./p/table")
+            if len(ratings) >= 2:
+                mi.rating = self.get_rating(ratings[1], verbose)
+            mi.comments = self.get_description(entry)
+            mi.publisher = self.get_publisher(entry)
+            mi.tags = self.get_tags(entry)
+            mi.pubdate = self.get_date(entry, verbose)
+            mi.isbn = self.get_ISBN(entry)
+        except:
+            pass
         return mi
 
     def producer(self, q, data, verbose=False):
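
The fictionwise change wraps all the optional getters in one try, so the first getter that raises silently skips every field after it: a failing get_rating would also drop comments, publisher, tags, pubdate and ISBN. If that ever matters, a per-field guard isolates failures instead; a sketch with a hypothetical helper:

    def set_optional(obj, attr, getter, *args):
        # best effort per field: one bad getter no longer masks the rest
        try:
            setattr(obj, attr, getter(*args))
        except Exception:
            pass

    class Box(object):
        pass

    b = Box()
    set_optional(b, 'rating', lambda: 1 / 0)    # raises, silently skipped
    set_optional(b, 'publisher', lambda: 'X')   # still set
    print getattr(b, 'rating', None), b.publisher   # -> None X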

View File

@@ -279,8 +279,12 @@ class ResultList(list):
             return None
         mi = MetaInformation(title, authors)
         mi.author_sort = authors_to_sort_string(authors)
-        mi.comments = self.get_description(entry, verbose)
-        return self.get_book_info(entry, mi, verbose)
+        try:
+            mi.comments = self.get_description(entry, verbose)
+            mi = self.get_book_info(entry, mi, verbose)
+        except:
+            pass
+        return mi
 
     def producer(self, q, data, verbose=False):
         for x in data: