From e610f16ca073fc0a4960143484c56031e8ac9069 Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 5 Dec 2010 20:09:17 +0100 Subject: [PATCH] Update fictionwise.py (broken) --- src/calibre/ebooks/metadata/fictionwise.py | 146 +++++++++++++-------- 1 file changed, 93 insertions(+), 53 deletions(-) diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py index 828ea31c3a..e56c697e3c 100644 --- a/src/calibre/ebooks/metadata/fictionwise.py +++ b/src/calibre/ebooks/metadata/fictionwise.py @@ -3,12 +3,11 @@ __license__ = 'GPL 3' __copyright__ = '2010, sengian ' __docformat__ = 'restructuredtext en' -import sys, textwrap, re +import sys, textwrap, re, traceback, socket from urllib import urlencode -from lxml import html, etree -from lxml.html import soupparser -from lxml.etree import tostring +from lxml import html +from lxml.html import soupparser, tostring from calibre import browser, preferred_encoding from calibre.ebooks.chardet import xml_to_unicode @@ -18,6 +17,7 @@ from calibre.library.comments import sanitize_comments_html from calibre.ebooks.metadata.fetch import MetadataSource from calibre.utils.config import OptionParser from calibre.utils.date import parse_date, utcnow +from calibre.utils.cleantext import clean_ascii_char class Fictionwise(MetadataSource): # {{{ @@ -37,10 +37,11 @@ class Fictionwise(MetadataSource): # {{{ # }}} +class FictionwiseError(Exception): + pass def report(verbose): if verbose: - import traceback traceback.print_exc() class Query(object): @@ -86,18 +87,20 @@ class Query(object): q = q.encode('utf-8') self.urldata = urlencode(q) - def __call__(self, browser, verbose): + def __call__(self, browser, verbose, timeout = 5.): if verbose: - print 'Query:', self.BASE_URL+self.urldata + print _('Query: %s') % self.BASE_URL+self.urldata try: - raw = browser.open_novisit(self.BASE_URL, self.urldata).read() + raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read() except Exception, e: report(verbose) if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return - raise + if isinstance(getattr(e, 'args', [None])[0], socket.timeout): + raise FictionwiseError(_('Fictionwise timed out. Try again later.')) + raise FictionwiseError(_('Fictionwise encountered an error.')) if '404 - ' in raw: return raw = xml_to_unicode(raw, strip_encoding_pats=True, @@ -105,7 +108,11 @@ class Query(object): try: feed = soupparser.fromstring(raw) except: - return + try: + #remove ASCII invalid chars + feed = soupparser.fromstring(clean_ascii_char(raw)) + except: + return None # get list of results as links results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]") @@ -139,12 +146,41 @@ class ResultList(list): self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I) def strip_tags_etree(self, etreeobj, invalid_tags): - for itag in invalid_tags: - for elt in etreeobj.getiterator(itag): - elt.drop_tag() - return etreeobj + for (itag, rmv) in invalid_tags.iteritems(): + if rmv: + for elts in etreeobj.getiterator(itag): + elts.drop_tree() + else: + for elts in etreeobj.getiterator(itag): + elts.drop_tag() - def clean_entry(self, entry, + def clean_entry(self, entry, invalid_tags = {'script': True}, + invalid_id = (), invalid_class=(), invalid_xpath = ()): + #invalid_tags: remove tag and keep content if False else remove + #remove tags + if invalid_tags: + self.strip_tags_etree(entry, invalid_tags) + #remove xpath + if invalid_xpath: + for eltid in invalid_xpath: + elt = entry.xpath(eltid) + for el in elt: + el.drop_tree() + #remove id + if invalid_id: + for eltid in invalid_id: + elt = entry.get_element_by_id(eltid) + if elt is not None: + elt.drop_tree() + #remove class + if invalid_class: + for eltclass in invalid_class: + elts = entry.find_class(eltclass) + if elts is not None: + for elt in elts: + elt.drop_tree() + + def clean_entry_dffdfbdjbf(self, entry, invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'), remove_tags_trees = ('script',)): for it in entry[0].iterchildren(tag='table'): @@ -170,7 +206,6 @@ class ResultList(list): authortext = entry.find('./br').tail if not self.rechkauth.search(authortext): return [] - #TODO: parse all tag if necessary authortext = self.rechkauth.sub('', authortext) return [a.strip() for a in authortext.split('&')] @@ -185,7 +220,7 @@ class ResultList(list): float(image.get('height', default=0))) \ for image in entrytable.getiterator('img')) #ratings as x/5 - return 1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()) + return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues())) def get_description(self, entry): description = self.output_entry(entry.find('./p'),htmlrm="") @@ -221,7 +256,6 @@ class ResultList(list): self.resplitbr.split(date)) if not len(date): return None - #TODO: parse all tag if necessary try: d = self.redate.sub('', date[0]) if d: @@ -279,9 +313,14 @@ class ResultList(list): return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") def populate(self, entries, browser, verbose=False): - for x in entries: + inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False, + 'ul': False, 'span': False, 'table': True} + inv_xpath =('descendant-or-self::p[1]',) + #single entry + if len(entries) == 1 and not isinstance(entries[0], str): try: - entry = self.get_individual_metadata(browser, x, verbose) + entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") + self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) entry = self.clean_entry(entry) title = self.get_title(entry) #ratings: get table for rating then drop @@ -292,28 +331,29 @@ class ResultList(list): authors = self.get_authors(entry) except Exception, e: if verbose: - print 'Failed to get all details for an entry' + print _('Failed to get all details for an entry') print e - continue + return self.append(self.fill_MI(entry, title, authors, ratings, verbose)) - - def populate_single(self, feed, verbose=False): - try: - entry = feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td") - entry = self.clean_entry(entry) - title = self.get_title(entry) - #ratings: get table for rating then drop - for elt in entry.getiterator('table'): - ratings = self.get_rating(elt, verbose) - elt.getprevious().drop_tree() - elt.drop_tree() - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print 'Failed to get all details for an entry' - print e - return - self.append(self.fill_MI(entry, title, authors, ratings, verbose)) + else: + #multiple entries + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath) + title = self.get_title(entry) + #ratings: get table for rating then drop + for elt in entry.getiterator('table'): + ratings = self.get_rating(elt, verbose) + elt.getprevious().drop_tree() + elt.drop_tree() + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print _('Failed to get all details for an entry') + print e + continue + self.append(self.fill_MI(entry, title, authors, ratings, verbose)) def search(title=None, author=None, publisher=None, isbn=None, @@ -321,35 +361,32 @@ def search(title=None, author=None, publisher=None, isbn=None, keywords=None): br = browser() entries = Query(title=title, author=author, publisher=publisher, - keywords=keywords, max_results=max_results)(br, verbose) + keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.) #List of entry ans = ResultList() - if len(entries) > 1: - ans.populate(entries, br, verbose) - else: - ans.populate_single(entries[0], verbose) + ans.populate(entries, br, verbose) return ans def option_parser(): parser = OptionParser(textwrap.dedent(\ - '''\ + _('''\ %prog [options] Fetch book metadata from Fictionwise. You must specify one of title, author, or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches, so you should make your query as specific as possible. - ''' + ''') )) - parser.add_option('-t', '--title', help='Book title') - parser.add_option('-a', '--author', help='Book author(s)') - parser.add_option('-p', '--publisher', help='Book publisher') - parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-t', '--title', help=_('Book title')) + parser.add_option('-a', '--author', help=_('Book author(s)')) + parser.add_option('-p', '--publisher', help=_('Book publisher')) + parser.add_option('-k', '--keywords', help=_('Keywords')) parser.add_option('-m', '--max-results', default=20, - help='Maximum number of results to fetch') + help=_('Maximum number of results to fetch')) parser.add_option('-v', '--verbose', default=0, action='count', - help='Be more verbose about errors') + help=_('Be more verbose about errors')) return parser def main(args=sys.argv): @@ -362,6 +399,9 @@ def main(args=sys.argv): report(True) parser.print_help() return 1 + if results is None or len(results) == 0: + print _('No result found for this search!') + return 0 for result in results: print unicode(result).encode(preferred_encoding, 'replace') print