diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 1d6aceecdd..a06516c7dc 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -4,6 +4,7 @@ __copyright__ = '2010, sengian ' __docformat__ = 'restructuredtext en' import sys, textwrap, re, traceback, socket +from threading import Thread from Queue import Queue from urllib import urlencode @@ -17,7 +18,7 @@ from calibre.library.comments import sanitize_comments_html from calibre.ebooks.metadata.fetch import MetadataSource from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow -from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.cleantext import clean_ascii_chars, unescape class Fictionwise(MetadataSource): # {{{ @@ -40,7 +41,45 @@ class Fictionwise(MetadataSource): # {{{ class FictionwiseError(Exception): pass - +class BrowserThread(Thread): + + def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'): + self.url = url + self.ex = ex + self.plugname = name + self.verbose = verbose + self.timeout = timeout + self.result = None + Thread.__init__(self) + + def get_result(self): + return self.result + + def run(self): + try: + raw = browser().open_novisit(self.url, timeout=self.timeout).read() + except Exception, e: + report(self.verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + self.result = None + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise self.ex(_('%s timed out. Try again later.') % self.plugname) + raise self.ex(_('%s encountered an error.') % self.plugname) + if '404 - ' in raw: + report(self.verbose) + self.result = None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + self.result = soupparser.fromstring(raw) + except: + try: + #remove ASCII invalid chars + self.result = soupparser.fromstring(clean_ascii_chars(raw)) + except: + self.result = None + def report(verbose): if verbose: @@ -180,10 +219,13 @@ class ResultList(list): for elt in elts: elt.drop_tree() - def output_entry(self, entry, prettyout = True, htmlrm="\d+"): + def output_entry(self, entry, prettyout = True, rmhtmlchar=True): out = tostring(entry, pretty_print=prettyout) - #try to work around tostring to remove this encoding for exemle - reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)') + #remove html chars + if rmhtmlchar: + out = unescape(out, rm=True) + # Remove \n\t\r. + reclean = re.compile('(\n+|\t+|\r+)') return reclean.sub('', out) def get_title(self, entry): @@ -211,7 +253,7 @@ class ResultList(list): return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues())) def get_description(self, entry): - description = self.output_entry(entry.xpath('./p')[1],htmlrm="") + description = self.output_entry(entry.xpath('./p')[1],rmhtmlchar=False) description = self.redesc.search(description) if not description or not description.group("desc"): return None @@ -265,9 +307,24 @@ class ResultList(list): isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))] return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] - def fill_MI(self, entry, title, authors, ratings, verbose): + def fill_MI(self, data, verbose): + inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False, + 'ul': False, 'span': False} + inv_xpath =('./table',) + try: + entry = data.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0] + self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + return None mi = MetaInformation(title, authors) - mi.rating = ratings + ratings = entry.xpath("./p/table") + if len(ratings) >= 2: + mi.rating = self.get_rating(ratings[1], verbose) mi.comments = self.get_description(entry) mi.publisher = self.get_publisher(entry) mi.tags = self.get_tags(entry) @@ -276,67 +333,36 @@ class ResultList(list): mi.author_sort = authors_to_sort_string(authors) return mi - def get_individual_metadata(self, browser, linkdata, verbose): - try: - raw = browser.open_novisit(self.BASE_URL + linkdata).read() - except Exception, e: - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return - if isinstance(getattr(e, 'args', [None])[0], socket.timeout): - raise FictionwiseError(_('Fictionwise timed out. Try again later.')) - raise FictionwiseError(_('Fictionwise encountered an error.')) - if '<title>404 - ' in raw: - report(verbose) - return - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - return soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - return None + def producer(self, q, data, verbose=False): + for x in data: + thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=FictionwiseError, + name='Fictionwise') + thread.start() + q.put(thread, True) - def populate(self, entries, browser, verbose=False): - inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False, - 'ul': False, 'span': False} - inv_xpath =('./table',) - #single entry + def consumer(self, q, total_entries, verbose=False): + while len(self) < total_entries: + thread = q.get(True) + thread.join() + mi = thread.get_result() + if mi is None: + self.append(None) + else: + self.append(self.fill_MI(mi, verbose)) + + def populate(self, entries, verbose=False, brcall=3): if len(entries) == 1 and not isinstance(entries[0], str): - try: - entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") - self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) - title = self.get_title(entry) - #maybe strenghten the search - ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print _('Failed to get all details for an entry') - print e - return - self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + #single entry + self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries - for x in entries: - try: - entry = self.get_individual_metadata(browser, x, verbose) - entry = entry.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0] - self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) - title = self.get_title(entry) - #maybe strenghten the search - ratings = self.get_rating(entry.xpath("./p/table")[1], verbose) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print _('Failed to get all details for an entry') - print e - continue - self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + q = Queue(brcall) + prod_thread = Thread(target=self.producer, args=(q, entries, verbose)) + cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose)) + prod_thread.start() + cons_thread.start() + prod_thread.join() + cons_thread.join() def search(title=None, author=None, publisher=None, isbn=None, @@ -349,7 +375,7 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList() ans.populate(entries, br, verbose) - return ans + return [x for x in ans if x is not None] def option_parser(): @@ -391,3 +417,5 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) + +# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\fictionwise.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py index 580e645320..5bd360ed6c 100644 --- a/src/calibre/ebooks/metadata/nicebooks.py +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -302,9 +302,7 @@ class ResultList(list): def populate(self, entries, verbose=False, brcall=3): if len(entries) == 1 and not isinstance(entries[0], str): #single entry - mi = self.fill_MI(entries[0], verbose) - if mi: - self.append(mi) + self.append(self.fill_MI(entries[0], verbose)) else: #multiple entries q = Queue(brcall) @@ -364,7 +362,7 @@ def search(title=None, author=None, publisher=None, isbn=None, #List of entry ans = ResultList() ans.populate(entries, verbose) - return [x for x in ans if x] + return [x for x in ans if x is not None] def check_for_cover(isbn): br = browser() diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py index b4afe7576d..a27f74529e 100644 --- a/src/calibre/utils/cleantext.py +++ b/src/calibre/utils/cleantext.py @@ -3,7 +3,8 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian <sengian1@gmail.com>' __docformat__ = 'restructuredtext en' -import re +import re, htmlentitydefs +from functools import partial _ascii_pat = None @@ -21,3 +22,32 @@ def clean_ascii_chars(txt, charlist=None): pat = re.compile(u'|'.join(map(unichr, charlist))) return pat.sub('', txt) +## +# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html +# Removes HTML or XML character references and entities from a text string. +# +# @param text The HTML (or XML) source text. +# @return The plain text, as a Unicode string, if necessary. + +def unescape(text, rm=False, rchar=u''): + def fixup(m, rm=rm, rchar=rchar): + text = m.group(0) + if text[:2] == "&#": + # character reference + try: + if text[:3] == "&#x": + return unichr(int(text[3:-1], 16)) + else: + return unichr(int(text[2:-1])) + except ValueError: + pass + else: + # named entity + try: + text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) + except KeyError: + pass + if rm: + return rchar #replace by char + return text # leave as is + return re.sub("&#?\w+;", fixup, text) \ No newline at end of file