Add threading to Amazon (still lagging like hell)

This commit is contained in:
Sengian 2010-12-08 22:50:57 +01:00
parent bb2d2a6641
commit f766eb871c
3 changed files with 128 additions and 52 deletions

View File

@ -3,11 +3,12 @@ __license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>' __copyright__ = '2010, sengian <sengian1@gmail.com>'
import sys, textwrap, re, traceback import sys, textwrap, re, traceback
from threading import Thread
from Queue import Queue
from urllib import urlencode from urllib import urlencode
from math import ceil from math import ceil
from lxml import html from lxml.html import soupparser, tostring
from lxml.html import soupparser
from calibre.utils.date import parse_date, utcnow, replace_months from calibre.utils.date import parse_date, utcnow, replace_months
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
@ -116,6 +117,48 @@ def report(verbose):
if verbose: if verbose:
traceback.print_exc() traceback.print_exc()
class AmazonError(Exception):
    '''Error raised by this plugin when talking to Amazon fails.'''
class BrowserThread(Thread):
    '''
    Thread that downloads a single URL and parses the page.

    After the thread has finished, get_result() returns the tree produced
    by soupparser.fromstring(), or None when the page is a 404 or could not
    be parsed.

    url: address opened with browser().open_novisit()
    verbose: passed to report() so that tracebacks are printed on failure
    timeout: timeout handed to open_novisit()
    ex: exception class raised from run() on timeouts and other download
        errors
    name: label inserted into the error messages
    '''

    def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'):
        Thread.__init__(self)
        self.url = url
        self.ex = ex
        # Stored as plugname, kept apart from Thread's own ``name`` attribute
        self.plugname = name
        self.verbose = verbose
        self.timeout = timeout
        self.result = None

    def get_result(self):
        '''Return the parsed page, or None if nothing usable was fetched.'''
        return self.result

    def run(self):
        '''
        Fetch self.url and store the parsed tree in self.result.

        Raises self.ex on a timeout or any download error other than a 404.
        NOTE: an exception raised here ends this thread only; code that
        join()s the thread still just sees get_result() return None.
        '''
        # Local import: socket is not among the imports visible for this
        # module and is only needed to recognise timeouts.
        import socket
        try:
            raw = browser().open_novisit(self.url, timeout=self.timeout).read()
        except Exception as e:
            report(self.verbose)
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                # A missing page is not an error: leave the result empty
                # and stop here instead of falling through to the raise.
                self.result = None
                return
            # e.args may be missing or empty, never index it blindly
            args = getattr(e, 'args', None) or [None]
            if isinstance(args[0], socket.timeout):
                raise self.ex(_('%s timed out. Try again later.') % self.plugname)
            raise self.ex(_('%s encountered an error.') % self.plugname)
        if '<title>404 - ' in raw:
            # Error page served with a normal status: do not parse it
            report(self.verbose)
            self.result = None
            return
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            self.result = soupparser.fromstring(raw)
        except Exception:
            try:
                #remove ASCII invalid chars
                self.result = soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                self.result = None
class Query(object): class Query(object):
@ -189,7 +232,7 @@ class Query(object):
def __call__(self, browser, verbose, timeout = 5.): def __call__(self, browser, verbose, timeout = 5.):
if verbose: if verbose:
print 'Query:', self.urldata print _('Query: %s') % self.urldata
try: try:
raw = browser.open_novisit(self.urldata, timeout=timeout).read() raw = browser.open_novisit(self.urldata, timeout=timeout).read()
@ -197,10 +240,12 @@ class Query(object):
report(verbose) report(verbose)
if callable(getattr(e, 'getcode', None)) and \ if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404: e.getcode() == 404:
return return None, self.urldata
raise if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
raise AmazonError(_('Amazon timed out. Try again later.'))
raise AmazonError(_('Amazon encountered an error.'))
if '<title>404 - ' in raw: if '<title>404 - ' in raw:
return return None, self.urldata
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
@ -315,7 +360,7 @@ class ResultList(list):
inv_class = ('seeAll', 'emptyClear') inv_class = ('seeAll', 'emptyClear')
inv_tags ={'img': True, 'a': False} inv_tags ={'img': True, 'a': False}
self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class) self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
description = html.tostring(description, method='html', encoding=unicode).strip() description = tostring(description, method='html', encoding=unicode).strip()
# remove all attributes from tags # remove all attributes from tags
description = self.reattr.sub(r'<\1>', description) description = self.reattr.sub(r'<\1>', description)
# Remove the notice about text referring to out of print editions # Remove the notice about text referring to out of print editions
@ -327,7 +372,7 @@ class ResultList(list):
report(verbose) report(verbose)
return None return None
def get_tags(self, entry, browser, verbose): def get_tags(self, entry, verbose):
try: try:
tags = entry.get_element_by_id('tagContentHolder') tags = entry.get_element_by_id('tagContentHolder')
testptag = tags.find_class('see-all') testptag = tags.find_class('see-all')
@ -338,7 +383,7 @@ class ResultList(list):
if alink[0].get('class') == 'tgJsActive': if alink[0].get('class') == 'tgJsActive':
continue continue
link = self.baseurl + alink[0].get('href') link = self.baseurl + alink[0].get('href')
entry = self.get_individual_metadata(browser, link, verbose) entry = self.get_individual_metadata(link, verbose)
tags = entry.get_element_by_id('tagContentHolder') tags = entry.get_element_by_id('tagContentHolder')
break break
tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag'] tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
@ -402,26 +447,41 @@ class ResultList(list):
mi.rating = float(ratings[0])/float(ratings[1]) * 5 mi.rating = float(ratings[0])/float(ratings[1]) * 5
return mi return mi
def fill_MI(self, entry, verbose):
    '''
    Build a MetaInformation object from a parsed book page.

    entry: parsed page passed to the get_* extraction helpers
    verbose: when true, print diagnostics and tracebacks on failure

    Returns None when the title or the authors cannot be extracted.
    Comments, book info and tags are best effort: if one of them fails,
    the metadata gathered up to that point is returned.
    '''
    try:
        title = self.get_title(entry)
        authors = self.get_authors(entry)
    except Exception as e:
        if verbose:
            # The URL of the page is not known in this method, so only the
            # error itself can be shown.
            print(_('Failed to get all details for an entry'))
            print(e)
        report(verbose)
        return None
    mi = MetaInformation(title, authors)
    mi.author_sort = authors_to_sort_string(authors)
    try:
        mi.comments = self.get_description(entry, verbose)
        mi = self.get_book_info(entry, mi, verbose)
        mi.tags = self.get_tags(entry, verbose)
    except Exception:
        # Keep whatever was filled in, but do not hide the failure
        report(verbose)
    return mi
def get_individual_metadata(self, browser, linkdata, verbose): def get_individual_metadata(self, url, verbose):
try: try:
raw = browser.open_novisit(linkdata).read() raw = browser().open_novisit(url).read()
except Exception, e: except Exception, e:
report(verbose) report(verbose)
if callable(getattr(e, 'getcode', None)) and \ if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404: e.getcode() == 404:
return return None
raise if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
raise AmazonError(_('Amazon timed out. Try again later.'))
raise AmazonError(_('Amazon encountered an error.'))
if '<title>404 - ' in raw: if '<title>404 - ' in raw:
report(verbose) report(verbose)
return return None
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
try: try:
@ -432,27 +492,34 @@ class ResultList(list):
return soupparser.fromstring(clean_ascii_chars(raw)) return soupparser.fromstring(clean_ascii_chars(raw))
except: except:
report(verbose) report(verbose)
return return None
def producer(self, q, data, verbose=False):
    '''
    Start one BrowserThread per URL in data and put it on the queue q.

    q.put() blocks while the queue is full. If a worker cannot be created
    or started, None is queued in its place so that the consumer still
    receives exactly one item per URL.
    '''
    for x in data:
        try:
            thread = BrowserThread(x, verbose=verbose, ex=AmazonError,
                    name='Amazon')
            thread.start()
        except Exception:
            report(verbose)
            thread = None
        q.put(thread, True)

def consumer(self, q, total_entries, verbose=False):
    '''
    Take total_entries items from the queue q and append one result each.

    For every item the worker is joined and its page handed to fill_MI();
    None is appended when there is no worker, no page, or fill_MI() fails.
    '''
    # Count the items taken from the queue rather than len(self), so that
    # entries already in the list do not end the loop too early.
    for unused in range(total_entries):
        thread = q.get(True)
        entry = None
        if thread is not None:
            thread.join()
            entry = thread.get_result()
        if entry is None:
            self.append(None)
            continue
        try:
            self.append(self.fill_MI(entry, verbose))
        except Exception:
            # Keep consuming: if this loop stopped, the producer could
            # block forever on a full queue.
            report(verbose)
            self.append(None)

def populate(self, entries, verbose=False, brcall=5):
    '''
    Download and parse all entries, appending one result per entry.

    entries: URLs of the individual book pages
    verbose: forwarded to the producer and the consumer
    brcall: size of the queue between producer and consumer

    Returns once both the producer and the consumer thread have finished.
    '''
    #multiple entries
    q = Queue(brcall)
    prod_thread = Thread(target=self.producer, args=(q, entries, verbose))
    cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose))
    prod_thread.start()
    cons_thread.start()
    prod_thread.join()
    cons_thread.join()
def search(title=None, author=None, publisher=None, isbn=None, def search(title=None, author=None, publisher=None, isbn=None,
@ -466,8 +533,8 @@ def search(title=None, author=None, publisher=None, isbn=None,
#List of entry #List of entry
ans = ResultList(baseurl, lang) ans = ResultList(baseurl, lang)
ans.populate(entries, br, verbose) ans.populate(entries, verbose)
return ans return [x for x in ans if x is not None]
def option_parser(): def option_parser():
parser = OptionParser(textwrap.dedent(\ parser = OptionParser(textwrap.dedent(\
@ -506,7 +573,7 @@ def main(args=sys.argv):
parser.print_help() parser.print_help()
return 1 return 1
if results is None or len(results) == 0: if results is None or len(results) == 0:
print 'No result found for this search!' print _('No result found for this search!')
return 0 return 0
for result in results: for result in results:
print unicode(result).encode(preferred_encoding, 'replace') print unicode(result).encode(preferred_encoding, 'replace')
@ -514,3 +581,5 @@ def main(args=sys.argv):
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())
# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html

View File

@ -80,11 +80,11 @@ class BrowserThread(Thread):
except: except:
self.result = None self.result = None
def report(verbose): def report(verbose):
if verbose: if verbose:
traceback.print_exc() traceback.print_exc()
class Query(object): class Query(object):
BASE_URL = 'http://www.fictionwise.com/servlet/mw' BASE_URL = 'http://www.fictionwise.com/servlet/mw'
@ -322,15 +322,18 @@ class ResultList(list):
print e print e
return None return None
mi = MetaInformation(title, authors) mi = MetaInformation(title, authors)
ratings = entry.xpath("./p/table")
if len(ratings) >= 2:
mi.rating = self.get_rating(ratings[1], verbose)
mi.comments = self.get_description(entry)
mi.publisher = self.get_publisher(entry)
mi.tags = self.get_tags(entry)
mi.pubdate = self.get_date(entry, verbose)
mi.isbn = self.get_ISBN(entry)
mi.author_sort = authors_to_sort_string(authors) mi.author_sort = authors_to_sort_string(authors)
try:
ratings = entry.xpath("./p/table")
if len(ratings) >= 2:
mi.rating = self.get_rating(ratings[1], verbose)
mi.comments = self.get_description(entry)
mi.publisher = self.get_publisher(entry)
mi.tags = self.get_tags(entry)
mi.pubdate = self.get_date(entry, verbose)
mi.isbn = self.get_ISBN(entry)
except:
pass
return mi return mi
def producer(self, q, data, verbose=False): def producer(self, q, data, verbose=False):

View File

@ -279,8 +279,12 @@ class ResultList(list):
return None return None
mi = MetaInformation(title, authors) mi = MetaInformation(title, authors)
mi.author_sort = authors_to_sort_string(authors) mi.author_sort = authors_to_sort_string(authors)
mi.comments = self.get_description(entry, verbose) try:
return self.get_book_info(entry, mi, verbose) mi.comments = self.get_description(entry, verbose)
mi = self.get_book_info(entry, mi, verbose)
except:
pass
return mi
def producer(self, q, data, verbose=False): def producer(self, q, data, verbose=False):
for x in data: for x in data: