Threading optimisation (the last one, I hope): now faster than light (at least Pratchett's) for Amazon

This commit is contained in:
Sengian 2010-12-14 00:34:25 +01:00
parent 81af8382d6
commit 99921673d6
4 changed files with 53 additions and 133 deletions

View File

@ -121,20 +121,6 @@ def report(verbose):
class AmazonError(Exception): class AmazonError(Exception):
pass pass
class ThreadwithResults(Thread):
def __init__(self, func, *args, **kargs):
self.func = func
self.args = args
self.kargs = kargs
self.result = None
Thread.__init__(self)
def get_result(self):
return self.result
def run(self):
self.result = self.func(*self.args, **self.kargs)
class Query(object): class Query(object):
@ -269,14 +255,11 @@ class Query(object):
for i in x.xpath("//a/span[@class='srTitle']")]) for i in x.xpath("//a/span[@class='srTitle']")])
return results[:self.max_results], self.baseurl return results[:self.max_results], self.baseurl
class ResultList(object): class ResultList(list):
def __init__(self, baseurl, lang = 'all'): def __init__(self, baseurl, lang = 'all'):
self.baseurl = baseurl self.baseurl = baseurl
self.lang = lang self.lang = lang
self.thread = []
self.res = []
self.nbtag = 0
self.repub = re.compile(u'\((.*)\)') self.repub = re.compile(u'\((.*)\)')
self.rerat = re.compile(u'([0-9.]+)') self.rerat = re.compile(u'([0-9.]+)')
self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>') self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>')
@ -484,63 +467,65 @@ class ResultList(object):
entry = None entry = None
finally: finally:
qbr.put(browser, True) qbr.put(browser, True)
qsync.put(nb, True) qsync.put((nb, entry), True)
return entry
def producer(self, sync, urls, br, verbose=False): def producer(self, sync, urls, br, verbose=False):
for i in xrange(len(urls)): for i in xrange(len(urls)):
thread = ThreadwithResults(self.fetchdatathread, br, sync, thread = Thread(target=self.fetchdatathread,
i, urls[i], verbose) args=(br, sync, i, urls[i], verbose))
thread.start() thread.start()
self.thread.append(thread)
def consumer(self, sync, syncbis, br, total_entries, verbose=False): def consumer(self, sync, syncbis, br, total_entries, verbose=False):
i=0 i=0
self.extend([None]*total_entries)
while i < total_entries: while i < total_entries:
nb = int(sync.get(True)) rq = sync.get(True)
self.thread[nb].join() nb = int(rq[0])
entry = self.thread[nb].get_result() entry = rq[1]
i+=1 i+=1
if entry is not None: if entry is not None:
mi = self.fill_MI(entry, verbose) mi = self.fill_MI(entry, verbose)
if mi is not None: if mi is not None:
mi.tags, atag = self.get_tags(entry, verbose) mi.tags, atag = self.get_tags(entry, verbose)
self.res[nb] = mi self[nb] = mi
if atag: if atag:
threadbis = ThreadwithResults(self.fetchdatathread, thread = Thread(target=self.fetchdatathread,
br, syncbis, nb, mi.tags, verbose) args=(br, syncbis, nb, mi.tags, verbose))
self.thread[nb] = threadbis thread.start()
self.nbtag +=1 else:
threadbis.start() syncbis.put((nb, None), True)
def final(self, sync, total_entries, verbose):
i=0
while i < total_entries:
rq = sync.get(True)
nb = int(rq[0])
tags = rq[1]
i+=1
if tags is not None:
self[nb].tags = self.get_tags(tags, verbose)[0]
def populate(self, entries, ibr, verbose=False, brcall=3): def populate(self, entries, ibr, verbose=False, brcall=3):
br = Queue(brcall) br = Queue(brcall)
cbr = Queue(brcall-1) cbr = Queue(brcall-1)
syncp = Queue(1) syncp = Queue(1)
syncc = Queue(len(entries)) syncc = Queue(1)
for i in xrange(brcall-1): for i in xrange(brcall-1):
br.put(browser(), True) br.put(browser(), True)
cbr.put(browser(), True) cbr.put(browser(), True)
br.put(ibr, True) br.put(ibr, True)
self.res = [None]*len(entries)
prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose))
cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose)) cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose))
fin_thread = Thread(target=self.final, args=(syncc, len(entries), verbose))
prod_thread.start() prod_thread.start()
cons_thread.start() cons_thread.start()
fin_thread.start()
prod_thread.join() prod_thread.join()
cons_thread.join() cons_thread.join()
fin_thread.join()
#finish processing
for i in xrange(self.nbtag):
nb = int(syncc.get(True))
tags = self.thread[nb].get_result()
if tags is not None:
self.res[nb].tags = self.get_tags(tags, verbose)[0]
return self.res
def search(title=None, author=None, publisher=None, isbn=None, def search(title=None, author=None, publisher=None, isbn=None,
@ -554,7 +539,8 @@ def search(title=None, author=None, publisher=None, isbn=None,
#List of entry #List of entry
ans = ResultList(baseurl, lang) ans = ResultList(baseurl, lang)
return [x for x in ans.populate(entries, br, verbose) if x is not None] ans.populate(entries, br, verbose)
return [x for x in ans if x is not None]
def get_social_metadata(title, authors, publisher, isbn, verbose=False, def get_social_metadata(title, authors, publisher, isbn, verbose=False,
max_results=1, lang='all'): max_results=1, lang='all'):

View File

@ -41,20 +41,6 @@ class Fictionwise(MetadataSource):
class FictionwiseError(Exception): class FictionwiseError(Exception):
pass pass
class ThreadwithResults(Thread):
def __init__(self, func, *args, **kargs):
self.func = func
self.args = args
self.kargs = kargs
self.result = None
Thread.__init__(self)
def get_result(self):
return self.result
def run(self):
self.result = self.func(*self.args, **self.kargs)
def report(verbose): def report(verbose):
if verbose: if verbose:
import traceback import traceback
@ -155,7 +141,6 @@ class ResultList(list):
def __init__(self, islink): def __init__(self, islink):
self.islink = islink self.islink = islink
self.thread = []
self.retitle = re.compile(r'\[[^\[\]]+\]') self.retitle = re.compile(r'\[[^\[\]]+\]')
self.rechkauth = re.compile(r'.*book\s*by', re.I) self.rechkauth = re.compile(r'.*book\s*by', re.I)
self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I) self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I)
@ -361,27 +346,21 @@ class ResultList(list):
entry = None entry = None
finally: finally:
qbr.put(browser, True) qbr.put(browser, True)
qsync.put(nb, True) qsync.put((nb, entry), True)
return entry
def producer(self, sync, urls, br, verbose=False): def producer(self, sync, urls, br, verbose=False):
for i in xrange(len(urls)): for i in xrange(len(urls)):
thread = ThreadwithResults(self.fetchdatathread, br, sync, thread = Thread(target=self.fetchdatathread,
i, self.BASE_URL+urls[i], verbose) args=(br, sync, i, self.BASE_URL+urls[i], verbose))
thread.start() thread.start()
self.thread.append(thread)
def consumer(self, sync, total_entries, verbose=False): def consumer(self, sync, total_entries, verbose=False):
res=[None]*total_entries self.extend([None]*total_entries)
i=0 i=0
while i < total_entries: while i < total_entries:
nb = int(sync.get(True)) rq = sync.get(True)
self.thread[nb].join() self[int(rq[0])] = self.fill_MI(rq[1], verbose)
entry = self.thread[nb].get_result()
i+=1 i+=1
if entry is not None:
res[nb] = self.fill_MI(entry, verbose)
return res
def populate(self, entries, br, verbose=False, brcall=3): def populate(self, entries, br, verbose=False, brcall=3):
if not self.islink: if not self.islink:
@ -396,12 +375,11 @@ class ResultList(list):
pbr.put(br, True) pbr.put(br, True)
prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose))
cons_thread = ThreadwithResults(self.consumer, sync, len(entries), verbose) cons_thread = Thread(target=self.consumer, args=(sync, len(entries), verbose))
prod_thread.start() prod_thread.start()
cons_thread.start() cons_thread.start()
prod_thread.join() prod_thread.join()
cons_thread.join() cons_thread.join()
self.extend(cons_thread.get_result())
def search(title=None, author=None, publisher=None, isbn=None, def search(title=None, author=None, publisher=None, isbn=None,

View File

@ -61,20 +61,6 @@ class GoogleBooks(MetadataSource):
class GoogleBooksError(Exception): class GoogleBooksError(Exception):
pass pass
class ThreadwithResults(Thread):
def __init__(self, func, *args, **kargs):
self.func = func
self.args = args
self.kargs = kargs
self.result = None
Thread.__init__(self)
def get_result(self):
return self.result
def run(self):
self.result = self.func(*self.args, **self.kargs)
def report(verbose): def report(verbose):
if verbose: if verbose:
import traceback import traceback
@ -173,8 +159,6 @@ class Query(object):
return entries return entries
class ResultList(list): class ResultList(list):
def __init__(self):
self.thread = []
def get_description(self, entry, verbose): def get_description(self, entry, verbose):
try: try:
@ -206,8 +190,7 @@ class ResultList(list):
return val return val
def get_identifiers(self, entry, mi): def get_identifiers(self, entry, mi):
isbns = [str(x.text).strip() for x in identifier(entry)] isbns = [t[5:] for t in [str(x.text).strip() for x in identifier(entry)] \
isbns = [t[5:] for t in isbns \
if t[:5].upper() == 'ISBN:' and check_isbn(t[5:])] if t[:5].upper() == 'ISBN:' and check_isbn(t[5:])]
# for x in identifier(entry): # for x in identifier(entry):
# t = str(x.text).strip() # t = str(x.text).strip()
@ -309,8 +292,7 @@ class ResultList(list):
entry = None entry = None
finally: finally:
qbr.put(browser, True) qbr.put(browser, True)
qsync.put(nb, True) qsync.put((nb, entry), True)
return entry
def producer(self, sync, entries, br, verbose=False): def producer(self, sync, entries, br, verbose=False):
for i in xrange(len(entries)): for i in xrange(len(entries)):
@ -319,21 +301,18 @@ class ResultList(list):
except: except:
id_url = None id_url = None
report(verbose) report(verbose)
thread = ThreadwithResults(self.fetchdatathread, br, sync, thread = Thread(target=self.fetchdatathread,
i, id_url, verbose) args=(br, sync, i, id_url, verbose))
thread.start() thread.start()
self.thread.append(thread)
def consumer(self, entries, sync, total_entries, verbose=False): def consumer(self, entries, sync, total_entries, verbose=False):
res=[None]*total_entries #remove? self.extend([None]*total_entries)
i=0 i=0
while i < total_entries: while i < total_entries:
nb = int(sync.get(True)) rq = sync.get(True)
self.thread[nb].join() nb = int(rq[0])
data = self.thread[nb].get_result() self[nb] = self.fill_MI(entries[nb], rq[1], verbose)
res[nb] = self.fill_MI(entries[nb], data, verbose)
i+=1 i+=1
return res
def populate(self, entries, br, verbose=False, brcall=3): def populate(self, entries, br, verbose=False, brcall=3):
pbr = Queue(brcall) pbr = Queue(brcall)
@ -343,12 +322,11 @@ class ResultList(list):
pbr.put(br, True) pbr.put(br, True)
prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose))
cons_thread = ThreadwithResults(self.consumer, entries, sync, len(entries), verbose) cons_thread = Thread(target=self.consumer, args=(entries, sync, len(entries), verbose))
prod_thread.start() prod_thread.start()
cons_thread.start() cons_thread.start()
prod_thread.join() prod_thread.join()
cons_thread.join() cons_thread.join()
self.extend(cons_thread.get_result())
def search(title=None, author=None, publisher=None, isbn=None, def search(title=None, author=None, publisher=None, isbn=None,

View File

@ -82,20 +82,6 @@ class NiceBooksError(Exception):
class ISBNNotFound(NiceBooksError): class ISBNNotFound(NiceBooksError):
pass pass
class ThreadwithResults(Thread):
def __init__(self, func, *args, **kargs):
self.func = func
self.args = args
self.kargs = kargs
self.result = None
Thread.__init__(self)
def get_result(self):
return self.result
def run(self):
self.result = self.func(*self.args, **self.kargs)
def report(verbose): def report(verbose):
if verbose: if verbose:
import traceback import traceback
@ -191,7 +177,6 @@ class ResultList(list):
def __init__(self, islink): def __init__(self, islink):
self.islink = islink self.islink = islink
self.thread = []
self.repub = re.compile(u'\s*.diteur\s*', re.I) self.repub = re.compile(u'\s*.diteur\s*', re.I)
self.reauteur = re.compile(u'\s*auteur.*', re.I) self.reauteur = re.compile(u'\s*auteur.*', re.I)
self.reautclean = re.compile(u'\s*\(.*\)\s*') self.reautclean = re.compile(u'\s*\(.*\)\s*')
@ -302,27 +287,21 @@ class ResultList(list):
entry = None entry = None
finally: finally:
qbr.put(browser, True) qbr.put(browser, True)
qsync.put(nb, True) qsync.put((nb, entry), True)
return entry
def producer(self, sync, urls, br, verbose=False): def producer(self, sync, urls, br, verbose=False):
for i in xrange(len(urls)): for i in xrange(len(urls)):
thread = ThreadwithResults(self.fetchdatathread, br, sync, thread = Thread(target=self.fetchdatathread,
i, self.BASE_URL+urls[i], verbose) args=(br, sync, i, self.BASE_URL+urls[i], verbose))
thread.start() thread.start()
self.thread.append(thread)
def consumer(self, sync, total_entries, verbose=False): def consumer(self, sync, total_entries, verbose=False):
res=[None]*total_entries self.extend([None]*total_entries)
i=0 i=0
while i < total_entries: while i < total_entries:
nb = int(sync.get(True)) rq = sync.get(True)
self.thread[nb].join() self[int(rq[0])] = self.fill_MI(rq[1], verbose)
entry = self.thread[nb].get_result()
i+=1 i+=1
if entry is not None:
res[nb] = self.fill_MI(entry, verbose)
return res
def populate(self, entries, br, verbose=False, brcall=3): def populate(self, entries, br, verbose=False, brcall=3):
if not self.islink: if not self.islink:
@ -337,12 +316,11 @@ class ResultList(list):
pbr.put(br, True) pbr.put(br, True)
prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose)) prod_thread = Thread(target=self.producer, args=(sync, entries, pbr, verbose))
cons_thread = ThreadwithResults(self.consumer, sync, len(entries), verbose) cons_thread = Thread(target=self.consumer, args=(sync, len(entries), verbose))
prod_thread.start() prod_thread.start()
cons_thread.start() cons_thread.start()
prod_thread.join() prod_thread.join()
cons_thread.join() cons_thread.join()
self.extend(cons_thread.get_result())
class Covers(object): class Covers(object):