mirror of https://github.com/kovidgoyal/calibre.git

commit 8aa50c106e
parent a743464987

Amazon threading
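For orientation, here is a minimal, self-contained sketch (not the calibre code) of the pattern the diff below introduces: a bounded Queue acts as a pool of browser objects, each fetch runs in its own thread that borrows a browser from the pool, and a finished thread announces itself by putting its index on a second queue so results can be collected in completion order. The original module is Python 2 (Queue, xrange, old-style except); the sketch uses the Python 3 equivalents, and make_browser()/FetchThread/fetch_all are illustrative stand-ins, not calibre's browser() or BrowserThread.

import queue
import threading
import urllib.request

def make_browser():
    # Illustrative stand-in for calibre's mechanize-based browser().
    return urllib.request.build_opener()

class FetchThread(threading.Thread):
    # Rough analogue of BrowserThread: borrow a browser from the pool,
    # fetch one URL, hand the browser back, then signal completion by
    # putting this thread's index on the sync queue.
    def __init__(self, url, pool, sync, index, timeout=10.0):
        super().__init__()
        self.url, self.pool, self.sync, self.index = url, pool, sync, index
        self.timeout = timeout
        self.result = None

    def run(self):
        browser = self.pool.get(True)        # block until a browser is free
        try:
            self.result = browser.open(self.url, timeout=self.timeout).read()
        except Exception:
            self.result = None
        finally:
            self.pool.put(browser, True)     # return the browser to the pool
            self.sync.put(self.index, True)  # tell the collector we are done

def fetch_all(urls, pool_size=3):
    pool = queue.Queue(pool_size)
    for _ in range(pool_size):
        pool.put(make_browser(), True)
    sync = queue.Queue(len(urls))

    threads = [FetchThread(u, pool, sync, i) for i, u in enumerate(urls)]
    results = [None] * len(urls)
    for t in threads:                        # producer side: start every fetch
        t.start()
    for _ in range(len(urls)):               # consumer side: collect as fetches finish
        i = sync.get(True)
        results[i] = threads[i].result
    return results

if __name__ == '__main__':
    print([len(r) if r else None for r in fetch_all(['https://example.com'] * 3)])

The pool size bounds how many requests run at once, which is the role the brcall argument plays in the new populate() further down.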
--- a/src/calibre/ebooks/metadata/amazonfr.py
+++ b/src/calibre/ebooks/metadata/amazonfr.py
@@ -3,7 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2010, sengian <sengian1@gmail.com>'
 
 import sys, textwrap, re, traceback
-from threading import Thread
+from threading import Thread, Lock
 from Queue import Queue
 from urllib import urlencode
 from math import ceil
@@ -122,9 +122,12 @@ class AmazonError(Exception):
 
 class BrowserThread(Thread):
 
-    def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'):
+    def __init__(self, url, qbr, qsync, nb, verbose=False, timeout=10., ex=Exception, name='Meta'):
         self.url = url
         self.ex = ex
+        self.qbr = qbr
+        self.qsync = qsync
+        self.nb = nb
         self.plugname = name
         self.verbose = verbose
         self.timeout = timeout
@@ -133,10 +136,11 @@ class BrowserThread(Thread):
         Thread.__init__(self)
 
     def get_result(self):
-        return self.result, self.br
+        return self.result
 
     def run(self):
         try:
+            browser = self.qbr.get(True)
             raw = self.br.open_novisit(self.url, timeout=self.timeout).read()
         except Exception, e:
             report(self.verbose)
@@ -146,9 +150,13 @@ class BrowserThread(Thread):
             if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
                 raise self.ex(_('%s timed out. Try again later.') % self.plugname)
             raise self.ex(_('%s encountered an error.') % self.plugname)
+        finally:
+            self.qbr.put(browser, True)
+
         if '<title>404 - ' in raw:
             report(self.verbose)
             self.result = None
+            return None
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
             resolve_entities=True)[0]
         try:
@@ -159,6 +167,8 @@ class BrowserThread(Thread):
                 self.result = soupparser.fromstring(clean_ascii_chars(raw))
             except:
                 self.result = None
+        finally:
+            self.qsync.put(self.nb, True)
 
 
 class Query(object):
@@ -174,7 +184,7 @@ class Query(object):
         assert (max_results < 21)
 
         self.max_results = int(max_results)
-        self.renbres = re.compile(u'\s*(\d+)\s*')
+        self.renbres = re.compile(u'\s*([0-9.,]+)\s*')
 
         q = {   'search-alias' : 'stripbooks' ,
                 'unfiltered' : '1',
@@ -262,6 +272,7 @@ class Query(object):
         #nb of page
         try:
             nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text)
+            nbresults = [re.sub(r'[.,]', '', x) for x in nbresults]
         except:
             return None, self.urldata
 
@@ -294,11 +305,14 @@ class Query(object):
                                 for i in x.xpath("//a/span[@class='srTitle']")])
         return results[:self.max_results], self.baseurl
 
-class ResultList(list):
+class ResultList(object):
 
     def __init__(self, baseurl, lang = 'all'):
         self.baseurl = baseurl
         self.lang = lang
+        self.thread = []
+        self.res = []
+        self.nbtag = 0
         self.repub = re.compile(u'\((.*)\)')
         self.rerat = re.compile(u'([0-9.]+)')
         self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>')
@@ -383,15 +397,12 @@ class ResultList(list):
             if alink:
                 if alink[0].get('class') == 'tgJsActive':
                     continue
-                link = self.baseurl + alink[0].get('href')
-                entry = self.get_individual_metadata(link, verbose)
-                tags = entry.get_element_by_id('tagContentHolder')
-                break
+                return self.baseurl + alink[0].get('href'), True
             tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
         except:
             report(verbose)
-            tags = []
-        return tags
+            tags = [], False
+        return tags, False
 
     def get_book_info(self, entry, mi, verbose):
         try:
@@ -429,9 +440,12 @@ class ResultList(list):
             if check_isbn(isbn):
                 mi.isbn = unicode(isbn)
             elif len(elt) > 1:
-                isbn = elt[1].find('b').tail.replace('-', '').strip()
-                if check_isbn(isbn):
-                    mi.isbn = unicode(isbn)
+                isbnone = elt[1].find('b').tail.replace('-', '').strip()
+                if check_isbn(isbnone):
+                    mi.isbn = unicode(isbnone)
+                else:
+                    #assume ASIN-> find a check for asin
+                    mi.isbn = unicode(isbn)
         #Langue
         elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
         if elt:
@@ -448,7 +462,7 @@ class ResultList(list):
                 mi.rating = float(ratings[0])/float(ratings[1]) * 5
         return mi
 
-    def fill_MI(self, entry, br, verbose):
+    def fill_MI(self, entry, verbose):
         try:
             title = self.get_title(entry)
             authors = self.get_authors(entry)
@@ -464,63 +478,65 @@ class ResultList(list):
         try:
             mi.comments = self.get_description(entry, verbose)
             mi = self.get_book_info(entry, mi, verbose)
-            mi.tags = self.get_tags(entry, br, verbose)
         except:
             pass
         return mi
 
-    def get_individual_metadata(self, url, br, verbose):
-        try:
-            raw = br.open_novisit(url).read()
-        except Exception, e:
-            report(verbose)
-            if callable(getattr(e, 'getcode', None)) and \
-                    e.getcode() == 404:
-                return None
-            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
-                raise AmazonError(_('Amazon timed out. Try again later.'))
-            raise AmazonError(_('Amazon encountered an error.'))
-        if '<title>404 - ' in raw:
-            report(verbose)
-            return None
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-            resolve_entities=True)[0]
-        try:
-            return soupparser.fromstring(raw)
-        except:
-            try:
-                #remove ASCII invalid chars
-                return soupparser.fromstring(clean_ascii_chars(raw))
-            except:
-                report(verbose)
-                return None
-
-    def producer(self, q, data, verbose=False):
-        for x in data:
-            thread = BrowserThread(x, verbose=verbose, ex=AmazonError,
+    def producer(self, sync, data, br, verbose=False):
+        for i in xrange(len(data)):
+            thread = BrowserThread(data[i], br, sync, i, verbose=verbose, ex=AmazonError,
                         name='Amazon')
             thread.start()
-            q.put(thread, True)
+            self.thread.append(thread)
 
-    def consumer(self, q, total_entries, verbose=False):
-        while len(self) < total_entries:
-            thread = q.get(True)
-            thread.join()
-            mi, br = thread.get_result()
-            if mi is None:
-                self.append(None)
-            else:
-                self.append(self.fill_MI(mi, br, verbose))
+    def consumer(self, sync, syncbis, br, total_entries, verbose=False):
+        i=0
+        while i < total_entries:
+            nb = int(sync.get(True))
+            entry = self.thread[nb].get_result()
+            i+=1
+            if entry is not None:
+                mi = self.fill_MI(entry, verbose)
+                if mi is not None:
+                    mi.tags, atag = self.get_tags(entry, verbose)
+                    self.res[nb] = mi
+                    if atag:
+                        threadbis = BrowserThread(mi.tags, br, syncbis, nb, verbose=verbose, ex=AmazonError,
+                                        name='Amazon')
+                        self.thread[nb] = threadbis
+                        self.nbtag +=1
+                        threadbis.start()
 
-    def populate(self, entries, verbose=False, brcall=5):
+    def populate(self, entries, ibr, verbose=False, brcall=3):
         #multiple entries
-        q = Queue(brcall)
-        prod_thread = Thread(target=self.producer, args=(q, entries, verbose))
-        cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose))
+        br = Queue(brcall)
+        cbr = Queue(brcall-1)
+
+        syncp = Queue(1)
+        syncc = Queue(len(entries))
+
+        for i in xrange(brcall-1):
+            br.put(browser(), True)
+            cbr.put(browser(), True)
+        br.put(ibr, True)
+
+        self.res = [None]*len(entries)
+
+        prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose))
+        cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose))
         prod_thread.start()
         cons_thread.start()
         prod_thread.join()
         cons_thread.join()
+
+        #finish processing
+        for i in xrange(self.nbtag):
+            nb = int(syncc.get(True))
+            tags = self.thread[nb].get_result()
+            if tags is not None:
+                self.res[nb].tags = self.get_tags(tags, verbose)[0]
+
+        return self.res
 
 
 def search(title=None, author=None, publisher=None, isbn=None,
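The consumer in the hunk above also starts a second round of fetches for entries whose tags live on a separate page: it reuses the entry's slot in self.thread for the follow-up thread, counts those threads in self.nbtag, and populate() drains a second sync queue once the first producer/consumer pair has joined. A simplified, self-contained sketch of that two-round handshake follows; the fetches are faked with sleeps, and names such as Worker, fake_fetch and two_round_fetch are illustrative, not calibre's.

import queue
import random
import threading
import time

def fake_fetch(url):
    # Placeholder for a network request.
    time.sleep(random.random() / 10)
    return 'page for %s' % url

class Worker(threading.Thread):
    # Signals completion by putting its slot index on `sync`,
    # the same handshake BrowserThread uses with qsync.
    def __init__(self, url, sync, index):
        super().__init__()
        self.url, self.sync, self.index = url, sync, index
        self.result = None

    def run(self):
        try:
            self.result = fake_fetch(self.url)
        finally:
            self.sync.put(self.index, True)

def two_round_fetch(urls):
    sync1 = queue.Queue(len(urls))
    sync2 = queue.Queue(len(urls))
    threads = [Worker(u, sync1, i) for i, u in enumerate(urls)]
    results = [None] * len(urls)
    followups = 0

    for t in threads:
        t.start()
    for _ in range(len(urls)):
        nb = sync1.get(True)                     # a first-round fetch finished
        results[nb] = threads[nb].result
        if urls[nb].endswith('/book'):           # pretend this page needs a tag page too
            threads[nb] = Worker(urls[nb] + '/tags', sync2, nb)  # reuse the slot
            threads[nb].start()
            followups += 1

    for _ in range(followups):                   # second round, like the xrange(self.nbtag) loop
        nb = sync2.get(True)
        results[nb] += ' + ' + threads[nb].result
    return results

if __name__ == '__main__':
    print(two_round_fetch(['a/book', 'b', 'c/book']))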
@@ -534,8 +550,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
 
     #List of entry
     ans = ResultList(baseurl, lang)
-    ans.populate(entries, verbose)
-    return [x for x in ans if x is not None]
+    return [x for x in ans.populate(entries, br, verbose) if x is not None]
 
 def option_parser():
     parser = OptionParser(textwrap.dedent(\
@@ -581,9 +596,9 @@ def main(args=sys.argv):
         print
 
 if __name__ == '__main__':
-    sys.exit(main())
-    # import cProfile
+    # sys.exit(main())
+    import cProfile
     # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()"))
-    # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp"))
+    sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonfr; calibre.ebooks.metadata.amazonfr.main()", "profile_tmp_threading_1"))
 
 # calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html