Fix threading in amazon

This commit is contained in:
Sengian 2010-12-12 11:57:00 +01:00
parent ae781ae614
commit 5c89b576e3

View File

@ -4,6 +4,7 @@ __copyright__ = '2010, sengian <sengian1@gmail.com>'
import sys, textwrap, re, traceback, socket import sys, textwrap, re, traceback, socket
from threading import Thread from threading import Thread
from Queue import Queue
from urllib import urlencode from urllib import urlencode
from math import ceil from math import ceil
@ -21,57 +22,6 @@ from calibre.utils.config import OptionParser
from calibre.library.comments import sanitize_comments_html from calibre.library.comments import sanitize_comments_html
# class AmazonFr(MetadataSource):
# name = 'Amazon French'
# description = _('Downloads metadata from amazon.fr')
# supported_platforms = ['windows', 'osx', 'linux']
# author = 'Sengian'
# version = (1, 0, 0)
# has_html_comments = True
# def fetch(self):
# try:
# self.results = search(self.title, self.book_author, self.publisher,
# self.isbn, max_results=10, verbose=self.verbose, lang='fr')
# except Exception, e:
# self.exception = e
# self.tb = traceback.format_exc()
# class AmazonEs(MetadataSource):
# name = 'Amazon Spanish'
# description = _('Downloads metadata from amazon.com in spanish')
# supported_platforms = ['windows', 'osx', 'linux']
# author = 'Sengian'
# version = (1, 0, 0)
# has_html_comments = True
# def fetch(self):
# try:
# self.results = search(self.title, self.book_author, self.publisher,
# self.isbn, max_results=10, verbose=self.verbose, lang='es')
# except Exception, e:
# self.exception = e
# self.tb = traceback.format_exc()
# class AmazonDe(MetadataSource):
# name = 'Amazon German'
# description = _('Downloads metadata from amazon.de')
# supported_platforms = ['windows', 'osx', 'linux']
# author = 'Sengian'
# version = (1, 0, 0)
# has_html_comments = True
# def fetch(self):
# try:
# self.results = search(self.title, self.book_author, self.publisher,
# self.isbn, max_results=10, verbose=self.verbose, lang='de')
# except Exception, e:
# self.exception = e
# self.tb = traceback.format_exc()
class Amazon(MetadataSource): class Amazon(MetadataSource):
name = 'Amazon' name = 'Amazon'
@ -83,8 +33,33 @@ class Amazon(MetadataSource):
def fetch(self):
    """Search Amazon and store the matches in ``self.results``.

    The interface language returned by ``get_lang()`` selects the strategy:
    for French or German locales a localized search and a generic ('all')
    search run in two parallel threads and the localized matches are listed
    first, followed by the generic ones.  For every other locale a single
    generic search is done.

    Any exception is stored in ``self.exception`` together with its
    formatted traceback in ``self.tb`` instead of being raised.
    """
    try:
        lang = get_lang()
        lang = lang[:2] if re.match(r'(fr.*|de.*)', lang) else 'all'
        if lang == 'all':
            self.results = search(self.title, self.book_author, self.publisher,
                                  self.isbn, max_results=5,
                                  verbose=self.verbose, lang='all')
        else:
            tmploc = ThreadwithResults(search, self.title, self.book_author,
                                       self.publisher, self.isbn, max_results=5,
                                       verbose=self.verbose, lang=lang)
            tmpnoloc = ThreadwithResults(search, self.title, self.book_author,
                                         self.publisher, self.isbn, max_results=5,
                                         verbose=self.verbose, lang='all')
            tmploc.start()
            tmpnoloc.start()
            tmploc.join()
            tmpnoloc.join()
            # A worker that failed leaves its result at None.
            tmploc = tmploc.get_result()
            tmpnoloc = tmpnoloc.get_result()

            tempres = None
            if tmpnoloc is not None:
                tempres = tmpnoloc
            if tmploc is not None:
                # Localized matches take precedence; generic ones follow.
                tempres = tmploc
                if tmpnoloc is not None:
                    tempres.extend(tmpnoloc)
            # The original assigned the undefined name ``tmpres`` here, so
            # every localized search ended in a NameError.
            self.results = tempres
    except Exception as e:
        self.exception = e
        self.tb = traceback.format_exc()
@ -109,10 +84,10 @@ class AmazonSocial(MetadataSource):
self.results = get_social_metadata(self.title, self.book_author, self.publisher, self.results = get_social_metadata(self.title, self.book_author, self.publisher,
self.isbn, verbose=self.verbose, lang='all')[0] self.isbn, verbose=self.verbose, lang='all')[0]
else: else:
tmploc = ThreadwithResults(AmazonError, self.verbose, get_social_metadata, self.title, tmploc = ThreadwithResults(get_social_metadata, self.title, self.book_author,
self.book_author, self.publisher,self.isbn, verbose=self.verbose, lang=lang) self.publisher,self.isbn, verbose=self.verbose, lang=lang)
tmpnoloc = ThreadwithResults(AmazonError, self.verbose, get_social_metadata, self.title, tmpnoloc = ThreadwithResults(get_social_metadata, self.title, self.book_author,
self.book_author, self.publisher, self.isbn, verbose=self.verbose, lang='all') self.publisher, self.isbn, verbose=self.verbose, lang='all')
tmploc.start() tmploc.start()
tmpnoloc.start() tmpnoloc.start()
tmploc.join() tmploc.join()
@ -123,9 +98,7 @@ class AmazonSocial(MetadataSource):
tmpnoloc= tmpnoloc.get_result() tmpnoloc= tmpnoloc.get_result()
if tmpnoloc is not None: if tmpnoloc is not None:
tmpnoloc = tmpnoloc[0] tmpnoloc = tmpnoloc[0]
print tmpnoloc if tmpnoloc is not None:
if tmploc is not None and tmpnoloc is not None:
if tmploc.rating is None: if tmploc.rating is None:
tmploc.rating = tmpnoloc.rating tmploc.rating = tmpnoloc.rating
if tmploc.comments is not None: if tmploc.comments is not None:
@ -146,12 +119,10 @@ class AmazonError(Exception):
pass pass
class ThreadwithResults(Thread):
    """Thread that runs ``func(*args, **kargs)`` and keeps its return value.

    The value is available through ``get_result()`` once the thread has
    finished; it stays ``None`` until then, or if ``func`` raised.
    """

    def __init__(self, func, *args, **kargs):
        Thread.__init__(self)
        self.result = None
        self.func, self.args, self.kargs = func, args, kargs

    def get_result(self):
        """Return whatever ``func`` returned (``None`` if not available)."""
        return self.result

    def run(self):
        """Call the wrapped function and store its return value."""
        outcome = self.func(*self.args, **self.kargs)
        self.result = outcome
class Query(object): class Query(object):
@ -172,10 +140,10 @@ class Query(object):
BASE_URL_DE = 'http://www.amazon.de' BASE_URL_DE = 'http://www.amazon.de'
def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None,
max_results=10, rlang='all'): max_results=20, rlang='all'):
assert not(title is None and author is None and publisher is None \ assert not(title is None and author is None and publisher is None \
and isbn is None and keywords is None) and isbn is None and keywords is None)
assert (max_results < 11) assert (max_results < 21)
self.max_results = int(max_results) self.max_results = int(max_results)
self.renbres = re.compile(u'\s*([0-9.,]+)\s*') self.renbres = re.compile(u'\s*([0-9.,]+)\s*')
@ -304,6 +272,9 @@ class ResultList(object):
def __init__(self, baseurl, lang = 'all'): def __init__(self, baseurl, lang = 'all'):
self.baseurl = baseurl self.baseurl = baseurl
self.lang = lang self.lang = lang
self.thread = []
self.res = []
self.nbtag = 0
self.repub = re.compile(u'\((.*)\)') self.repub = re.compile(u'\((.*)\)')
self.rerat = re.compile(u'([0-9.]+)') self.rerat = re.compile(u'([0-9.]+)')
self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>')
@ -499,20 +470,72 @@ class ResultList(object):
report(verbose) report(verbose)
return None return None
def fetchdatathread(self, qbr, qsync, nb, url, verbose):
    """Fetch one page with a browser borrowed from a pool (thread body).

    qbr     -- queue used as a pool of browser objects; one is taken for
               the duration of the fetch and then put back
    qsync   -- queue on which ``nb`` is posted once the fetch is over,
               whether it succeeded or not
    nb      -- index identifying this job to whoever reads ``qsync``
    url     -- passed as is to ``get_individual_metadata``
    verbose -- passed to ``get_individual_metadata`` and ``report``

    Returns what ``get_individual_metadata`` returned, or None on error.
    """
    entry = None
    br = None
    try:
        br = qbr.get(True)
        entry = self.get_individual_metadata(url, br, verbose)
    except Exception:
        report(verbose)
        entry = None
    finally:
        # Only give back a browser that was really taken from the pool, and
        # always signal completion: the reader of qsync blocks until it
        # receives one index per job.
        if br is not None:
            qbr.put(br, True)
        qsync.put(nb, True)
    return entry
def producer(self, sync, urls, br, verbose=False):
    """Start one ``fetchdatathread`` worker per url.

    sync    -- queue on which each worker posts its index when done
    urls    -- sequence of urls; the position of a url is its job index
    br      -- queue of browsers shared by the workers
    verbose -- passed to the workers

    Workers are stored in ``self.thread`` at the position of their url.
    """
    for i, url in enumerate(urls):
        thread = ThreadwithResults(self.fetchdatathread, br, sync,
                                   i, url, verbose)
        # Register the worker before starting it: as soon as it runs it can
        # post ``i`` on ``sync`` and the consumer then reads self.thread[i].
        self.thread.append(thread)
        thread.start()
def consumer(self, sync, syncbis, br, total_entries, verbose=False):
    """Turn finished page fetches into metadata, in completion order.

    sync          -- queue delivering the index of each finished worker
    syncbis       -- queue given to the tag-page workers started here
    br            -- queue of browsers given to the tag-page workers
    total_entries -- number of indexes to read from ``sync``
    verbose       -- passed to the helpers

    For every index the worker in ``self.thread`` is joined and its page is
    converted with ``fill_MI``; the result goes in ``self.res`` at the same
    index.  When ``get_tags`` reports that a separate page is needed, a new
    worker replaces the finished one in ``self.thread`` and ``self.nbtag``
    is incremented.
    """
    for _ in range(total_entries):
        nb = int(sync.get(True))
        worker = self.thread[nb]
        worker.join()
        entry = worker.get_result()
        if entry is None:
            continue
        mi = self.fill_MI(entry, verbose)
        if mi is None:
            continue
        mi.tags, atag = self.get_tags(entry, verbose)
        self.res[nb] = mi
        if not atag:
            continue
        # mi.tags is handed to the new worker as the url to fetch.
        threadbis = ThreadwithResults(self.fetchdatathread,
                                      br, syncbis, nb, mi.tags, verbose)
        self.thread[nb] = threadbis
        self.nbtag += 1
        threadbis.start()
def populate(self, entries, ibr, verbose=False, brcall=3):
    """Fetch and parse all result pages concurrently.

    entries -- sequence of urls, one per search result
    ibr     -- an existing browser, added to the pool used for result pages
    verbose -- passed to the helpers
    brcall  -- size of the result-page browser pool (``ibr`` included)

    Returns a list with one slot per entry, in the order of ``entries``;
    a slot is None when the page could not be fetched or parsed.

    NOTE: the pool used for tag pages only receives ``brcall - 1`` browsers,
    so with ``brcall=1`` a tag-page worker would wait for a browser forever.
    """
    br = Queue(brcall)
    cbr = Queue(brcall-1)
    syncp = Queue(1)
    syncc = Queue(len(entries))

    for _ in range(brcall-1):
        br.put(browser(), True)
        cbr.put(browser(), True)
    br.put(ibr, True)

    # Start from a clean state so that a second call on the same object does
    # not index workers or count tag pages left over from the previous one.
    self.thread = []
    self.nbtag = 0
    self.res = [None]*len(entries)

    prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose))
    cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose))
    prod_thread.start()
    cons_thread.start()
    prod_thread.join()
    cons_thread.join()

    # Finish processing: merge the tags fetched from separate pages.
    for _ in range(self.nbtag):
        nb = int(syncc.get(True))
        worker = self.thread[nb]
        # The index is posted from inside the worker, before its result is
        # stored; wait for it to end so the result is really available.
        worker.join()
        tags = worker.get_result()
        if tags is not None:
            self.res[nb].tags = self.get_tags(tags, verbose)[0]
    return self.res
def search(title=None, author=None, publisher=None, isbn=None, def search(title=None, author=None, publisher=None, isbn=None,
@ -561,7 +584,7 @@ def option_parser():
%prog [options] %prog [options]
Fetch book metadata from Amazon. You must specify one of title, author, Fetch book metadata from Amazon. You must specify one of title, author,
ISBN, publisher or keywords. Will fetch a maximum of 10 matches, ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
so you should make your query as specific as possible. so you should make your query as specific as possible.
You can chose the language for metadata retrieval: You can chose the language for metadata retrieval:
english & french & german english & french & german