Add threading to nicebooks.py

This commit is contained in:
Sengian 2010-12-08 06:47:25 +01:00
parent da4cdeb1d1
commit 4d20351e8b

View File

@ -3,7 +3,8 @@ __license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>' __copyright__ = '2010, sengian <sengian1@gmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import sys, textwrap, re, traceback, socket import sys, textwrap, re, traceback, socket, threading
from Queue import Queue
from urllib import urlencode from urllib import urlencode
from math import ceil from math import ceil
from copy import deepcopy from copy import deepcopy
@ -23,7 +24,7 @@ from calibre.utils.config import OptionParser
class NiceBooks(MetadataSource): class NiceBooks(MetadataSource):
name = 'Nicebooks' name = 'Nicebooks'
description = _('Downloads metadata from french Nicebooks') description = _('Downloads metadata from French Nicebooks')
supported_platforms = ['windows', 'osx', 'linux'] supported_platforms = ['windows', 'osx', 'linux']
author = 'Sengian' author = 'Sengian'
version = (1, 0, 0) version = (1, 0, 0)
@ -78,10 +79,50 @@ class NiceBooksError(Exception):
class ISBNNotFound(NiceBooksError): class ISBNNotFound(NiceBooksError):
pass pass
class BrowserThread(threading.Thread):
    '''Thread that downloads a single URL and parses the page.

    After the thread has finished, ``get_result()`` returns the tree built
    by ``soupparser.fromstring`` or None when the page is missing (404) or
    cannot be parsed.

    url     -- address to fetch
    verbose -- when true, tracebacks are printed through ``report``
    timeout -- timeout passed to ``open_novisit``
    ex      -- exception class raised for timeouts and other download errors
    name    -- label of the source, used as thread name and in error messages
    '''

    def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'):
        # Thread.__init__ has to run first: ``name`` is a Thread property that
        # may only be assigned on an initialised thread, and __init__ would
        # otherwise replace the label with an automatically generated name.
        threading.Thread.__init__(self)
        self.url = url
        self.ex = ex
        self.name = name
        self.verbose = verbose
        self.timeout = timeout
        self.result = None

    def get_result(self):
        '''Return the parsed page, or None if nothing usable was fetched.'''
        return self.result

    def run(self):
        '''Fetch ``self.url`` and store the parsed tree in ``self.result``.

        Raises ``self.ex`` on a timeout or any download error other than a
        404. NOTE(review): the exception is raised inside this worker thread,
        so a caller that only join()s will just see a None result.
        '''
        try:
            raw = browser().open_novisit(self.url, timeout=self.timeout).read()
        except Exception as e:
            report(self.verbose)
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                # A missing page is not an error: leave an empty result.
                self.result = None
                return
            # ``args`` may be missing or empty; never index an empty sequence.
            args = getattr(e, 'args', None) or [None]
            if isinstance(args[0], socket.timeout):
                raise self.ex(_('%s timed out. Try again later.') % self.name)
            raise self.ex(_('%s encountered an error.') % self.name)
        if '<title>404 - ' in raw:
            # The site answered with its own "not found" page.
            report(self.verbose)
            self.result = None
            return
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            self.result = soupparser.fromstring(raw)
        except Exception:
            try:
                #remove ASCII invalid chars
                self.result = soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                self.result = None
def report(verbose): def report(verbose):
if verbose: if verbose:
traceback.print_exc() traceback.print_exc()
class Query(object): class Query(object):
BASE_URL = 'http://fr.nicebooks.com/' BASE_URL = 'http://fr.nicebooks.com/'
@ -224,68 +265,53 @@ class ResultList(list):
report(verbose) report(verbose)
return mi return mi
def fill_MI(self, entry, title, authors, verbose): def fill_MI(self, data, verbose):
'''create and return an mi if possible, None otherwise'''
try:
entry = data.xpath("//div[@id='container']/div[@id='book-info']")[0]
title = self.get_title(entry)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print 'Failed to get all details for an entry'
print e
return None
mi = MetaInformation(title, authors) mi = MetaInformation(title, authors)
mi.author_sort = authors_to_sort_string(authors) mi.author_sort = authors_to_sort_string(authors)
mi.comments = self.get_description(entry, verbose) mi.comments = self.get_description(entry, verbose)
return self.get_book_info(entry, mi, verbose) return self.get_book_info(entry, mi, verbose)
def get_individual_metadata(self, browser, linkdata, verbose): def producer(self, q, data, verbose=False):
try: for x in data:
raw = browser.open_novisit(self.BASE_URL + linkdata).read() thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=NiceBooksError,
except Exception, e: name='Nicebooks')
report(verbose) thread.start()
if callable(getattr(e, 'getcode', None)) and \ q.put(thread, True)
e.getcode() == 404:
return
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
raise NiceBooksError(_('Nicebooks encountered an error.'))
if '<title>404 - ' in raw:
report(verbose)
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
feed = soupparser.fromstring(clean_ascii_chars(raw))
except:
return None
# get results def consumer(self, q, total_entries, verbose=False):
return feed.xpath("//div[@id='container']")[0] while len(self) < total_entries:
thread = q.get(True)
thread.join()
mi, order = thread.get_result()
if mi is None:
self.append(None)
self.append(self.fill_MI(mi, verbose))
def populate(self, entries, browser, verbose=False): def populate(self, entries, verbose=False, brcall=3):
#single entry
if len(entries) == 1 and not isinstance(entries[0], str): if len(entries) == 1 and not isinstance(entries[0], str):
try: #single entry
entry = entries[0].xpath("//div[@id='container']")[0] mi = self.fill_MI(entries[0], verbose)
entry = entry.find("div[@id='book-info']") if mi:
title = self.get_title(entry) self.append(mi)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print 'Failed to get all details for an entry'
print e
return
self.append(self.fill_MI(entry, title, authors, verbose))
else: else:
#multiple entries #multiple entries
for x in entries: q = Queue(brcall)
try: prod_thread = threading.Thread(target=self.producer, args=(q, entries, verbose))
entry = self.get_individual_metadata(browser, x, verbose) cons_thread = threading.Thread(target=self.consumer, args=(q, len(entries), verbose))
entry = entry.find("div[@id='book-info']") prod_thread.start()
title = self.get_title(entry) cons_thread.start()
authors = self.get_authors(entry) prod_thread.join()
except Exception, e: cons_thread.join()
if verbose:
print 'Failed to get all details for an entry'
print e
continue
self.append(self.fill_MI(entry, title, authors, verbose))
class Covers(object): class Covers(object):
@ -328,14 +354,14 @@ def search(title=None, author=None, publisher=None, isbn=None,
max_results=5, verbose=False, keywords=None): max_results=5, verbose=False, keywords=None):
br = browser() br = browser()
entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
keywords=keywords, max_results=max_results)(br, verbose,timeout = 10.) keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.)
if entries is None or len(entries) == 0: if entries is None or len(entries) == 0:
return None return None
#List of entry #List of entry
ans = ResultList() ans = ResultList()
ans.populate(entries, br, verbose) ans.populate(entries, verbose)
return ans return ans
def check_for_cover(isbn): def check_for_cover(isbn):
@ -409,3 +435,5 @@ def main(args=sys.argv):
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())
# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\nicebooks.py" -m 5 -a mankel >data.html