diff --git a/src/calibre/devices/usbms/books.py b/src/calibre/devices/usbms/books.py
index 23ce1716af..7a5e8c49b3 100644
--- a/src/calibre/devices/usbms/books.py
+++ b/src/calibre/devices/usbms/books.py
@@ -11,9 +11,9 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.devices.mime import mime_type_ext
 from calibre.devices.interface import BookList as _BookList
 from calibre.constants import preferred_encoding
-from calibre import isbytestring
+from calibre import isbytestring, force_unicode
 from calibre.utils.config import prefs, tweaks
-from calibre.utils.icu import sort_key, strcmp as icu_strcmp
+from calibre.utils.icu import strcmp
 
 class Book(Metadata):
     def __init__(self, prefix, lpath, size=None, other=None):
@@ -241,7 +241,7 @@ class CollectionsBookList(BookList):
             if y is None:
                 return -1
             if isinstance(x, (unicode, str)):
-                c = strcmp(x, y)
+                c = strcmp(force_unicode(x), force_unicode(y))
             else:
                 c = cmp(x, y)
             if c != 0:
diff --git a/src/calibre/ebooks/metadata/amazonfr.py b/src/calibre/ebooks/metadata/amazonfr.py
new file mode 100644
index 0000000000..156fff3d75
--- /dev/null
+++ b/src/calibre/ebooks/metadata/amazonfr.py
@@ -0,0 +1,516 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+
+import sys, textwrap, re, traceback
+from urllib import urlencode
+from math import ceil
+
+from lxml import html
+from lxml.html import soupparser
+
+from calibre.utils.date import parse_date, utcnow, replace_months
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre import browser, preferred_encoding
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn, \
+    authors_to_sort_string
+from calibre.ebooks.metadata.fetch import MetadataSource
+from calibre.utils.config import OptionParser
+from calibre.library.comments import sanitize_comments_html
+
+
+class AmazonFr(MetadataSource):
+
+    name = 'Amazon French'
+    description = _('Downloads metadata from amazon.fr')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose, lang='fr')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class AmazonEs(MetadataSource):
+
+    name = 'Amazon Spanish'
+    description = _('Downloads metadata from amazon.com in Spanish')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose, lang='es')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class AmazonEn(MetadataSource):
+
+    name = 'Amazon English'
+    description = _('Downloads metadata from amazon.com in English')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose, lang='en')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class AmazonDe(MetadataSource):
+
+    name = 'Amazon German'
+    description = _('Downloads metadata from amazon.de')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose, lang='de')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class Amazon(MetadataSource):
+
+    name = 'Amazon'
+    description = _('Downloads metadata from amazon.com')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Kovid Goyal & Sengian'
+    version = (1, 1, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        # if not self.site_customization:
+        #     return
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose, lang='all')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+    # @property
+    # def string_customization_help(self):
+    #     return _('You can select here the language for metadata search with amazon.com')
+
+
+def report(verbose):
+    if verbose:
+        traceback.print_exc()
+
+
+class Query(object):
+
+    BASE_URL_ALL = 'http://www.amazon.com'
+    BASE_URL_FR = 'http://www.amazon.fr'
+    BASE_URL_DE = 'http://www.amazon.de'
+
+    def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None,
+            max_results=20, rlang='all'):
+        assert not(title is None and author is None and publisher is None \
+            and isbn is None and keywords is None)
+        assert (max_results < 21)
+
+        self.max_results = int(max_results)
+        self.renbres = re.compile(u'\s*(\d+)\s*')
+
+        q = {   'search-alias' : 'stripbooks',
+                'unfiltered' : '1',
+                'field-keywords' : '',
+                'field-author' : '',
+                'field-title' : '',
+                'field-isbn' : '',
+                'field-publisher' : ''
+                #get to the amazon detailed search page to see all options
+                # 'node' : '',
+                # 'field-binding' : '',
+                #before, during, after
+                # 'field-dateop' : '',
+                #month as number
+                # 'field-datemod' : '',
+                # 'field-dateyear' : '',
+                #french only
+                # 'field-collection' : '',
+                #many options available
+            }
+
+        if rlang =='all':
+            q['sort'] = 'relevanceexprank'
+            self.urldata = self.BASE_URL_ALL
+        elif rlang =='es':
+            q['sort'] = 'relevanceexprank'
+            q['field-language'] = 'Spanish'
+            self.urldata = self.BASE_URL_ALL
+        elif rlang =='en':
+            q['sort'] = 'relevanceexprank'
+            q['field-language'] = 'English'
+            self.urldata = self.BASE_URL_ALL
+        elif rlang =='fr':
+            q['sort'] = 'relevancerank'
+            self.urldata = self.BASE_URL_FR
+        elif rlang =='de':
+            q['sort'] = 'relevancerank'
+            self.urldata = self.BASE_URL_DE
+        self.baseurl = self.urldata
+
+        if isbn is not None:
+            q['field-isbn'] = isbn.replace('-', '')
+        else:
+            if title is not None:
+                q['field-title'] = title
+            if author is not None:
+                q['field-author'] = author
+            if publisher is not None:
+                q['field-publisher'] = publisher
+            if keywords is not None:
+                q['field-keywords'] = keywords
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q)
+
+    def __call__(self, browser, verbose, timeout = 5.):
+        if verbose:
+            print 'Query:', self.urldata
+
+        try:
+            raw = browser.open_novisit(self.urldata, timeout=timeout).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            raise
+        if '<title>404 - ' in raw:
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+
+        try:
+            feed = soupparser.fromstring(raw)
+        except:
+            try:
+                #remove invalid ASCII chars and retry
+                feed = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None, self.urldata
+
+        #number of result pages
+        try:
+            nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text)
+        except:
+            return None, self.urldata
+
+        pages = [feed]
+        if len(nbresults) > 1:
+            nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1])))
+            for i in xrange(2, nbpagetoquery + 1):
+                try:
+                    urldata = self.urldata + '&page=' + str(i)
+                    raw = browser.open_novisit(urldata, timeout=timeout).read()
+                except Exception, e:
+                    continue
+                if '<title>404 - ' in raw:
+                    continue
+                raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                        resolve_entities=True)[0]
+                try:
+                    feed = soupparser.fromstring(raw)
+                except:
+                    try:
+                        #remove invalid ASCII chars and retry
+                        feed = soupparser.fromstring(clean_ascii_chars(raw))
+                    except:
+                        continue
+                pages.append(feed)
+
+        results = []
+        for x in pages:
+            results.extend([i.getparent().get('href') \
+                for i in x.xpath("//a/span[@class='srTitle']")])
+        return results[:self.max_results], self.baseurl
+
+class ResultList(list):
+
+    def __init__(self, baseurl, lang = 'all'):
+        self.baseurl = baseurl
+        self.lang = lang
+        self.repub = re.compile(u'\((.*)\)')
+        self.rerat = re.compile(u'([0-9.]+)')
+        self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>')
+        self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>')
+        self.recom = re.compile(r'(?s)<!--.*?-->')
+        self.republi = re.compile(u'(Editeur|Publisher|Verlag)', re.I)
+        self.reisbn = re.compile(u'(ISBN-10|ISBN-13|ASIN)', re.I)
+        self.relang = re.compile(u'(Language|Langue|Sprache)', re.I)
+        self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client|Durchschnittliche\s*Kundenbewertung)', re.I)
+        self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit|Produktinformation)', re.I)
+
+    def strip_tags_etree(self, etreeobj, invalid_tags):
+        for (itag, rmv) in invalid_tags.iteritems():
+            if rmv:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tree()
+            else:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tag()
+
+    def clean_entry(self, entry, invalid_tags = {'script': True},
+                invalid_id = (), invalid_class=()):
+        #invalid_tags: drop the tag and its content if True, keep the content if False
+        #remove tags
+        if invalid_tags:
+            self.strip_tags_etree(entry, invalid_tags)
+        #remove by id
+        if invalid_id:
+            for eltid in invalid_id:
+                elt = entry.get_element_by_id(eltid)
+                if elt is not None:
+                    elt.drop_tree()
+        #remove by class
+        if invalid_class:
+            for eltclass in invalid_class:
+                elts = entry.find_class(eltclass)
+                if elts is not None:
+                    for elt in elts:
+                        elt.drop_tree()
+
+    def get_title(self, entry):
+        title = entry.get_element_by_id('btAsinTitle')
+        if title is not None:
+            title = title.text
+        return unicode(title.replace('\n', '').strip())
+
+    def get_authors(self, entry):
+        author = entry.get_element_by_id('btAsinTitle')
+        while author.getparent().tag != 'div':
+            author = author.getparent()
+        author = author.getparent()
+        authortext = []
+        for x in author.getiterator('a'):
+            authortext.append(unicode(x.text_content().strip()))
+        return authortext
+
+    def get_description(self, entry, verbose):
+        try:
+            description = entry.get_element_by_id("productDescription").find("div[@class='content']")
+            inv_class = ('seeAll', 'emptyClear')
+            inv_tags = {'img': True, 'a': False}
+            self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
+            description = html.tostring(description, method='html', encoding=unicode).strip()
+            # remove all attributes from tags
+            description = self.reattr.sub(r'<\1>', description)
+            # remove the notice about text referring to out of print editions
+            description = self.reoutp.sub('', description)
+            # remove comments
+            description = self.recom.sub('', description)
+            return unicode(sanitize_comments_html(description))
+        except:
+            report(verbose)
+            return None
+
+    def get_tags(self, entry, browser, verbose):
+        try:
+            tags = entry.get_element_by_id('tagContentHolder')
+            testptag = tags.find_class('see-all')
+            if testptag:
+                for x in testptag:
+                    alink = x.xpath('descendant-or-self::a')
+                    if alink:
+                        if alink[0].get('class') == 'tgJsActive':
+                            continue
+                        link = self.baseurl + alink[0].get('href')
+                        entry = self.get_individual_metadata(browser, link, verbose)
+                        tags = entry.get_element_by_id('tagContentHolder')
+                        break
+            tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
+        except:
+            report(verbose)
+            tags = []
+        return tags
+
+    def get_book_info(self, entry, mi, verbose):
+        try:
+            entry = entry.get_element_by_id('SalesRank').getparent()
+        except:
+            try:
+                for z in entry.getiterator('h2'):
+                    if self.reprod.search(z.text_content()):
+                        entry = z.getparent().find("div[@class='content']/ul")
+                        break
+            except:
+                report(verbose)
+                return mi
+        elts = entry.findall('li')
+        #publisher & date
+        elt = filter(lambda x: self.republi.search(x.find('b').text), elts)
+        if elt:
+            pub = elt[0].find('b').tail
+            mi.publisher = unicode(self.repub.sub('', pub).strip())
+            d = self.repub.search(pub)
+            if d is not None:
+                d = d.group(1)
+                try:
+                    default = utcnow().replace(day=15)
+                    if self.lang != 'all':
+                        d = replace_months(d, self.lang)
+                    d = parse_date(d, assume_utc=True, default=default)
+                    mi.pubdate = d
+                except:
+                    report(verbose)
+        #ISBN
+        elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts)
+        if elt:
+            isbn = elt[0].find('b').tail.replace('-', '').strip()
+            if check_isbn(isbn):
+                mi.isbn = unicode(isbn)
+            elif len(elt) > 1:
+                isbn = elt[1].find('b').tail.replace('-', '').strip()
+                if check_isbn(isbn):
+                    mi.isbn = unicode(isbn)
+        #language
+        elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
+        if elt:
+            langue = elt[0].find('b').tail.strip()
+            if langue:
+                mi.language = unicode(langue)
+        #ratings
+        elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts)
+        if elt:
+            ratings = elt[0].find_class('swSprite')
+            if ratings:
+                ratings = self.rerat.findall(ratings[0].get('title'))
+                if len(ratings) == 2:
+                    mi.rating = float(ratings[0])/float(ratings[1]) * 5
+        return mi
+
+    def fill_MI(self, entry, title, authors, browser, verbose):
+        mi = MetaInformation(title, authors)
+        mi.author_sort = authors_to_sort_string(authors)
+        mi.comments = self.get_description(entry, verbose)
+        mi = self.get_book_info(entry, mi, verbose)
+        mi.tags = self.get_tags(entry, browser, verbose)
+        return mi
+
+    def get_individual_metadata(self, browser, linkdata, verbose):
+        try:
+            raw = browser.open_novisit(linkdata).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            raise
+        if '<title>404 - ' in raw:
+            report(verbose)
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            return soupparser.fromstring(raw)
+        except:
+            try:
+                #remove invalid ASCII chars and retry
+                return soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                report(verbose)
+                return
+
+    def populate(self, entries, browser, verbose=False):
+        for x in entries:
+            try:
+                entry = self.get_individual_metadata(browser, x, verbose)
+                # clean results
+                # inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop')
+                # inv_class = ('buyingDetailsGrid', 'productImageGrid')
+                # inv_tags = {'script': True, 'style': True, 'form': False}
+                # self.clean_entry(entry, invalid_id=inv_ids)
+                title = self.get_title(entry)
+                authors = self.get_authors(entry)
+            except Exception, e:
+                if verbose:
+                    print 'Failed to get all details for an entry'
+                    print e
+                    print 'URL that failed:', x
+                report(verbose)
+                continue
+            self.append(self.fill_MI(entry, title, authors, browser, verbose))
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+           max_results=5, verbose=False, keywords=None, lang='all'):
+    br = browser()
+    entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher,
+        keywords=keywords, max_results=max_results, rlang=lang)(br, verbose)
+
+    if entries is None or len(entries) == 0:
+        return
+
+    #List of entries
+    ans = ResultList(baseurl, lang)
+    ans.populate(entries, br, verbose)
+    return ans
+
+def option_parser():
+    parser = OptionParser(textwrap.dedent(\
+        _('''\
+        %prog [options]
+
+        Fetch book metadata from Amazon. You must specify one of title, author,
+        ISBN, publisher or keywords. Will fetch a maximum of 10 matches,
+        so you should make your query as specific as possible.
+        You can choose the language for the metadata search:
+        all, English, French, German or Spanish.
+        '''
+        )))
+    parser.add_option('-t', '--title', help='Book title')
+    parser.add_option('-a', '--author', help='Book author(s)')
+    parser.add_option('-p', '--publisher', help='Book publisher')
+    parser.add_option('-i', '--isbn', help='Book ISBN')
+    parser.add_option('-k', '--keywords', help='Keywords')
+    parser.add_option('-m', '--max-results', default=10,
+        help='Maximum number of results to fetch')
+    parser.add_option('-l', '--lang', default='all',
+        help='Chosen language for metadata search (all, en, fr, es, de)')
+    parser.add_option('-v', '--verbose', default=0, action='count',
+        help='Be more verbose about errors')
+    return parser
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    try:
+        results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
+            keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results,
+            lang=opts.lang)
+    except AssertionError:
+        report(True)
+        parser.print_help()
+        return 1
+    if results is None or len(results) == 0:
+        print 'No results found for this search!'
+        return 0
+    for result in results:
+        print unicode(result).encode(preferred_encoding, 'replace')
+        print
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py
new file mode 100644
index 0000000000..b780f2b39d
--- /dev/null
+++ b/src/calibre/ebooks/metadata/fictionwise.py
@@ -0,0 +1,390 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import sys, textwrap, re, traceback, socket
+from urllib import urlencode
+
+from lxml.html import soupparser, tostring
+
+from calibre import browser, preferred_encoding
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn, \
+    authors_to_sort_string
+from calibre.library.comments import sanitize_comments_html
+from calibre.ebooks.metadata.fetch import MetadataSource
+from calibre.utils.config import OptionParser
+from calibre.utils.date import parse_date, utcnow
+from calibre.utils.cleantext import clean_ascii_chars
+
+class Fictionwise(MetadataSource): # {{{
+
+    author = 'Sengian'
+    name = 'Fictionwise'
+    description = _('Downloads metadata from Fictionwise')
+
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose)
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+    # }}}
+
+class FictionwiseError(Exception):
+    pass
+
+def report(verbose):
+    if verbose:
+        traceback.print_exc()
+
+class Query(object):
+
+    BASE_URL = 'http://www.fictionwise.com/servlet/mw'
+
+    def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20):
+        assert not(title is None and author is None and publisher is None and keywords is None)
+        assert (max_results < 21)
+
+        self.max_results = int(max_results)
+        q = {   'template' : 'searchresults_adv.htm',
+                'searchtitle' : '',
+                'searchauthor' : '',
+                'searchpublisher' : '',
+                'searchkeyword' : '',
+                #possibilities: startoflast, fullname, lastfirst
+                'searchauthortype' : 'startoflast',
+                'searchcategory' : '',
+                'searchcategory2' : '',
+                'searchprice_s' : '0',
+                'searchprice_e' : 'ANY',
+                'searchformat' : '',
+                'searchgeo' : 'US',
+                'searchfwdatetype' : '',
+                #maybe use date fields if needed?
+                #'sortorder' : 'DESC',
+                #many options available: b.SortTitle, a.SortName,
+                #b.DateFirstPublished, b.FWPublishDate
+                'sortby' : 'b.SortTitle'
+            }
+        if title is not None:
+            q['searchtitle'] = title
+        if author is not None:
+            q['searchauthor'] = author
+        if publisher is not None:
+            q['searchpublisher'] = publisher
+        if keywords is not None:
+            q['searchkeyword'] = keywords
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        self.urldata = urlencode(q)
+
+    def __call__(self, browser, verbose, timeout = 5.):
+        if verbose:
+            print _('Query: %s') % (self.BASE_URL+self.urldata)
+
+        try:
+            raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
+            raise FictionwiseError(_('Fictionwise encountered an error.'))
+        if '<title>404 - ' in raw:
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            feed = soupparser.fromstring(raw)
+        except:
+            try:
+                #remove invalid ASCII chars and retry
+                feed = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None
+
+        # get the list of results as links
+        results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]")
+        results = results[:self.max_results]
+        results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results]
+        #return the feed itself if there are no links, i.e. normally a single book or nothing
+        if not results:
+            results = [feed]
+        return results
+
+class ResultList(list):
+
+    BASE_URL = 'http://www.fictionwise.com'
+    COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0}
+
+    def __init__(self):
+        self.retitle = re.compile(r'\[[^\[\]]+\]')
+        self.rechkauth = re.compile(r'.*book\s*by', re.I)
+        self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I)
+        self.repub = re.compile(r'.*publisher\s*:\s*', re.I)
+        self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I)
+        self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I)
+        self.resplitbr = re.compile(r'<br[^>]*>', re.I)
+        self.recomment = re.compile(r'(?s)<!--.*?-->')
+        self.reimg = re.compile(r'<img[^>]*>', re.I)
+        self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I)
+        self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:')
+        self.recolor = re.compile('(?P<ncolor>[^/]+).gif')
+        self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I)
+        self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I)
+
+    def strip_tags_etree(self, etreeobj, invalid_tags):
+        for (itag, rmv) in invalid_tags.iteritems():
+            if rmv:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tree()
+            else:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tag()
+
+    def clean_entry(self, entry, invalid_tags = {'script': True},
+                invalid_id = (), invalid_class=(), invalid_xpath = ()):
+        #invalid_tags: drop the tag and its content if True, keep the content if False
+        #remove tags
+        if invalid_tags:
+            self.strip_tags_etree(entry, invalid_tags)
+        #remove by xpath
+        if invalid_xpath:
+            for eltid in invalid_xpath:
+                elt = entry.xpath(eltid)
+                for el in elt:
+                    el.drop_tree()
+        #remove by id
+        if invalid_id:
+            for eltid in invalid_id:
+                elt = entry.get_element_by_id(eltid)
+                if elt is not None:
+                    elt.drop_tree()
+        #remove by class
+        if invalid_class:
+            for eltclass in invalid_class:
+                elts = entry.find_class(eltclass)
+                if elts is not None:
+                    for elt in elts:
+                        elt.drop_tree()
+
+    def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
+        out = tostring(entry, pretty_print=prettyout)
+        #work around tostring() by stripping numeric entities, for example &#160;
+        reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)')
+        return reclean.sub('', out)
+
+    def get_title(self, entry):
+        title = entry.findtext('./')
+        return self.retitle.sub('', title).strip()
+
+    def get_authors(self, entry):
+        authortext = entry.find('./br').tail
+        if not self.rechkauth.search(authortext):
+            return []
+        authortext = self.rechkauth.sub('', authortext)
+        return [a.strip() for a in authortext.split('&')]
+
+    def get_rating(self, entrytable, verbose):
+        nbcomment = tostring(entrytable.getprevious())
+        try:
+            nbcomment = self.renbcom.search(nbcomment).group("nbcom")
+        except:
+            report(verbose)
+            return None
+        hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")],
+                    float(image.get('height', default=0))) \
+                        for image in entrytable.getiterator('img'))
+        #ratings as x/5
+        return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()))
+
+    def get_description(self, entry):
+        description = self.output_entry(entry.xpath('./p')[1],htmlrm="")
+        description = self.redesc.search(description)
+        if not description or not description.group("desc"):
+            return None
+        #remove invalid tags
+        description = self.reimg.sub('', description.group("desc"))
+        description = self.recomment.sub('', description)
+        description = self.resanitize.sub('', sanitize_comments_html(description))
+        return _('SUMMARY:\n %s') % re.sub(r'\n\s+</p>','\n</p>', description)
+
+    def get_publisher(self, entry):
+        publisher = self.output_entry(entry.xpath('./p')[1])
+        publisher = filter(lambda x: self.repub.search(x) is not None,
+            self.resplitbr.split(publisher))
+        if not len(publisher):
+            return None
+        publisher = self.repub.sub('', publisher[0])
+        return publisher.split(',')[0].strip()
+
+    def get_tags(self, entry):
+        tag = self.output_entry(entry.xpath('./p')[1])
+        tag = filter(lambda x: self.retag.search(x) is not None,
+            self.resplitbr.split(tag))
+        if not len(tag):
+            return []
+        return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/'))
+
+    def get_date(self, entry, verbose):
+        date = self.output_entry(entry.xpath('./p')[1])
+        date = filter(lambda x: self.redate.search(x) is not None,
+            self.resplitbr.split(date))
+        if not len(date):
+            return None
+        try:
+            d = self.redate.sub('', date[0])
+            if d:
+                default = utcnow().replace(day=15)
+                d = parse_date(d, assume_utc=True, default=default)
+            else:
+                d = None
+        except:
+            report(verbose)
+            d = None
+        return d
+
+    def get_ISBN(self, entry):
+        isbns = self.output_entry(entry.xpath('./p')[2])
+        isbns = filter(lambda x: self.reisbn.search(x) is not None,
+            self.resplitbrdiv.split(isbns))
+        if not len(isbns):
+            return None
+        isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
+        return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1] if isbns else None
+
+    def fill_MI(self, entry, title, authors, ratings, verbose):
+        mi = MetaInformation(title, authors)
+        mi.rating = ratings
+        mi.comments = self.get_description(entry)
+        mi.publisher = self.get_publisher(entry)
+        mi.tags = self.get_tags(entry)
+        mi.pubdate = self.get_date(entry, verbose)
+        mi.isbn = self.get_ISBN(entry)
+        mi.author_sort = authors_to_sort_string(authors)
+        return mi
+
+    def get_individual_metadata(self, browser, linkdata, verbose):
+        try:
+            raw = browser.open_novisit(self.BASE_URL + linkdata).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
+            raise FictionwiseError(_('Fictionwise encountered an error.'))
+        if '<title>404 - ' in raw:
+            report(verbose)
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            return soupparser.fromstring(raw)
+        except:
+            try:
+                #remove invalid ASCII chars and retry
+                return soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None
+
+    def populate(self, entries, browser, verbose=False):
+        inv_tags = {'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
+            'ul': False, 'span': False}
+        inv_xpath = ('./table',)
+        #single entry
+        if len(entries) == 1 and not isinstance(entries[0], str):
+            try:
+                entry = entries[0].xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0]
+                self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
+                title = self.get_title(entry)
+                #maybe strengthen the search
+                ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
+                authors = self.get_authors(entry)
+            except Exception, e:
+                if verbose:
+                    print _('Failed to get all details for an entry')
+                    print e
+                return
+            self.append(self.fill_MI(entry, title, authors, ratings, verbose))
+        else:
+            #multiple entries
+            for x in entries:
+                try:
+                    entry = self.get_individual_metadata(browser, x, verbose)
+                    entry = entry.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0]
+                    self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
+                    title = self.get_title(entry)
+                    #maybe strengthen the search
+                    ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
+                    authors = self.get_authors(entry)
+                except Exception, e:
+                    if verbose:
+                        print _('Failed to get all details for an entry')
+                        print e
+                    continue
+                self.append(self.fill_MI(entry, title, authors, ratings, verbose))
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+           min_viewability='none', verbose=False, max_results=5,
+           keywords=None):
+    br = browser()
+    entries = Query(title=title, author=author, publisher=publisher,
+        keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.)
+
+    #List of entries
+    ans = ResultList()
+    ans.populate(entries, br, verbose)
+    return ans
+
+
+def option_parser():
+    parser = OptionParser(textwrap.dedent(\
+        _('''\
+        %prog [options]
+
+        Fetch book metadata from Fictionwise. You must specify one of title, author,
+        or keywords; searching by ISBN is not possible. Will fetch a maximum of 20 matches,
+        so you should make your query as specific as possible.
+        ''')
+    ))
+    parser.add_option('-t', '--title', help=_('Book title'))
+    parser.add_option('-a', '--author', help=_('Book author(s)'))
+    parser.add_option('-p', '--publisher', help=_('Book publisher'))
+    parser.add_option('-k', '--keywords', help=_('Keywords'))
+    parser.add_option('-m', '--max-results', default=20,
+        help=_('Maximum number of results to fetch'))
+    parser.add_option('-v', '--verbose', default=0, action='count',
+        help=_('Be more verbose about errors'))
+    return parser
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    try:
+        results = search(opts.title, opts.author, publisher=opts.publisher,
+            keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results)
+    except AssertionError:
+        report(True)
+        parser.print_help()
+        return 1
+    if results is None or len(results) == 0:
+        print _('No results found for this search!')
+        return 0
+    for result in results:
+        print unicode(result).encode(preferred_encoding, 'replace')
+        print
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py
index 4d19e9611b..8914e2d985 100644
--- a/src/calibre/ebooks/metadata/nicebooks.py
+++ b/src/calibre/ebooks/metadata/nicebooks.py
@@ -10,7 +10,8 @@ from copy import deepcopy
 
 from lxml.html import soupparser
 
-from calibre.utils.date import parse_date, utcnow
+from calibre.utils.date import parse_date, utcnow, replace_months
+from calibre.utils.cleantext import clean_ascii_chars
 from calibre import browser, preferred_encoding
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata import MetaInformation, check_isbn, \
@@ -71,31 +72,16 @@ class NiceBooksCovers(CoverDownload):
                     traceback.format_exc(), self.name))
 
 
+class NiceBooksError(Exception):
+    pass
+
+class ISBNNotFound(NiceBooksError):
+    pass
+
 def report(verbose):
     if verbose:
-        import traceback
         traceback.print_exc()
 
-def replace_monthsfr(datefr):
-    # Replace french months by english equivalent for parse_date
-    frtoen = {
-        u'[jJ]anvier': u'jan',
-        u'[fF].vrier': u'feb',
-        u'[mM]ars': u'mar',
-        u'[aA]vril': u'apr',
-        u'[mM]ai': u'may',
-        u'[jJ]uin': u'jun',
-        u'[jJ]uillet': u'jul',
-        u'[aA]o.t': u'aug',
-        u'[sS]eptembre': u'sep',
-        u'[Oo]ctobre': u'oct',
-        u'[nN]ovembre': u'nov',
-        u'[dD].cembre': u'dec' }
-    for k in frtoen.iterkeys():
-        tmp = re.sub(k, frtoen[k], datefr)
-        if tmp <> datefr: break
-    return tmp
-
 class Query(object):
 
     BASE_URL = 'http://fr.nicebooks.com/'
@@ -119,7 +105,7 @@
 
     def __call__(self, browser, verbose, timeout = 5.):
         if verbose:
-            print 'Query:', self.BASE_URL+self.urldata
+            print _('Query: %s') % (self.BASE_URL+self.urldata)
 
         try:
             raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
@@ -128,7 +114,9 @@
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
                 return
-            raise
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
+            raise NiceBooksError(_('Nicebooks encountered an error.'))
         if '<title>404 - ' in raw:
             return
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
@@ -136,7 +124,11 @@
         try:
             feed = soupparser.fromstring(raw)
         except:
-            return
+            try:
+                #remove invalid ASCII chars and retry
+                feed = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None
 
         #nb of page to call
         try:
@@ -161,7 +153,11 @@
             try:
                 feed = soupparser.fromstring(raw)
             except:
-                continue
+                try:
+                    #remove invalid ASCII chars and retry
+                    feed = soupparser.fromstring(clean_ascii_chars(raw))
+                except:
+                    continue
             pages.append(feed)
 
         results = []
@@ -180,14 +176,12 @@ class ResultList(list):
         self.reautclean = re.compile(u'\s*\(.*\)\s*')
 
     def get_title(self, entry):
-        # title = deepcopy(entry.find("div[@id='book-info']"))
         title = deepcopy(entry)
         title.remove(title.find("dl[@title='Informations sur le livre']"))
         title = ' '.join([i.text_content() for i in title.iterchildren()])
         return unicode(title.replace('\n', ''))
 
     def get_authors(self, entry):
-        # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
         author = entry.find("dl[@title='Informations sur le livre']")
         authortext = []
         for x in author.getiterator('dt'):
@@ -223,7 +217,7 @@
                 d = x.getnext().text_content()
                 try:
                     default = utcnow().replace(day=15)
-                    d = replace_monthsfr(d)
+                    d = replace_months(d, 'fr')
                     d = parse_date(d, assume_utc=True, default=default)
                     mi.pubdate = d
                 except:
@@ -234,11 +228,6 @@
         mi = MetaInformation(title, authors)
         mi.author_sort = authors_to_sort_string(authors)
         mi.comments = self.get_description(entry, verbose)
-        # entry = entry.find("dl[@title='Informations sur le livre']")
-        # mi.publisher = self.get_publisher(entry)
-        # mi.pubdate = self.get_date(entry, verbose)
-        # mi.isbn = self.get_ISBN(entry)
-        # mi.language = self.get_language(entry)
         return self.get_book_info(entry, mi, verbose)
 
     def get_individual_metadata(self, browser, linkdata, verbose):
@@ -249,7 +238,9 @@
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
                 return
-            raise
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
+            raise NiceBooksError(_('Nicebooks encountered an error.'))
         if '<title>404 - ' in raw:
             report(verbose)
             return
@@ -258,7 +249,11 @@
         try:
             feed = soupparser.fromstring(raw)
         except:
-            return
+            try:
+                #remove invalid ASCII chars and retry
+                feed = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None
 
         # get results
         return feed.xpath("//div[@id='container']")[0]
@@ -292,13 +287,6 @@
                 continue
             self.append(self.fill_MI(entry, title, authors, verbose))
 
-
-class NiceBooksError(Exception):
-    pass
-
-class ISBNNotFound(NiceBooksError):
-    pass
-
 class Covers(object):
 
     def __init__(self, isbn = None):
@@ -329,11 +317,10 @@
                 return cover, ext if ext else 'jpg'
         except Exception, err:
             if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
-                err = NiceBooksError(_('Nicebooks timed out. Try again later.'))
-                raise err
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
         if not len(self.urlimg):
             if not self.isbnf:
-                raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
+                raise ISBNNotFound(_('ISBN: %s not found.') % self.isbn)
             raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher'))
 
@@ -341,10 +328,10 @@ def search(title=None, author=None, publisher=None, isbn=None,
            max_results=5, verbose=False, keywords=None):
     br = browser()
     entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
-        keywords=keywords, max_results=max_results)(br, verbose)
+        keywords=keywords, max_results=max_results)(br, verbose, timeout=10.)
 
     if entries is None or len(entries) == 0:
-        return
+        return None
 
     #List of entry
     ans = ResultList()
@@ -364,28 +351,28 @@ def cover_from_isbn(isbn, timeout = 5.):
 
 def option_parser():
     parser = OptionParser(textwrap.dedent(\
-        '''\
+        _('''\
        %prog [options]
 
        Fetch book metadata from Nicebooks. You must specify one of title, author,
        ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
        so you should make your query as specific as possible.
        It can also get covers if the option is activated.
-        '''
+        ''')
    ))
-    parser.add_option('-t', '--title', help='Book title')
-    parser.add_option('-a', '--author', help='Book author(s)')
-    parser.add_option('-p', '--publisher', help='Book publisher')
-    parser.add_option('-i', '--isbn', help='Book ISBN')
-    parser.add_option('-k', '--keywords', help='Keywords')
+    parser.add_option('-t', '--title', help=_('Book title'))
+    parser.add_option('-a', '--author', help=_('Book author(s)'))
+    parser.add_option('-p', '--publisher', help=_('Book publisher'))
+    parser.add_option('-i', '--isbn', help=_('Book ISBN'))
+    parser.add_option('-k', '--keywords', help=_('Keywords'))
     parser.add_option('-c', '--covers', default=0,
-        help='Covers: 1-Check/ 2-Download')
+        help=_('Covers: 1-Check/ 2-Download'))
     parser.add_option('-p', '--coverspath', default='',
-        help='Covers files path')
+        help=_('Covers files path'))
     parser.add_option('-m', '--max-results', default=20,
-        help='Maximum number of results to fetch')
+        help=_('Maximum number of results to fetch'))
     parser.add_option('-v', '--verbose', default=0, action='count',
-        help='Be more verbose about errors')
+        help=_('Be more verbose about errors'))
     return parser
 
@@ -400,15 +387,15 @@ def main(args=sys.argv):
         parser.print_help()
         return 1
     if results is None or len(results) == 0:
-        print 'No result found for this search!'
+        print _('No results found for this search!')
         return 0
     for result in results:
         print unicode(result).encode(preferred_encoding, 'replace')
         covact = int(opts.covers)
         if covact == 1:
-            textcover = 'No cover found!'
+            textcover = _('No cover found!')
             if check_for_cover(result.isbn):
-                textcover = 'A cover was found for this book'
+                textcover = _('A cover was found for this book')
             print textcover
         elif covact == 2:
             cover_data, ext = cover_from_isbn(result.isbn)
@@ -417,7 +404,7 @@ def main(args=sys.argv):
             cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)
             oname = os.path.abspath(cpath+'.'+ext)
             open(oname, 'wb').write(cover_data)
-            print 'Cover saved to file ', oname
+            print _('Cover saved to file %s') % oname
     print
 
 if __name__ == '__main__':
diff --git a/src/calibre/utils/cleantext.py b/src/calibre/utils/cleantext.py
new file mode 100644
index 0000000000..b4afe7576d
--- /dev/null
+++ b/src/calibre/utils/cleantext.py
@@ -0,0 +1,23 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+_ascii_pat = None
+
+def clean_ascii_chars(txt, charlist=None):
+    'Remove invalid ASCII control chars: 0-7, 11, 14-24, 26 and 27 by default'
+    global _ascii_pat
+    if _ascii_pat is None:
+        chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \
+                + [0x1A, 0x1B]
+        _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
+
+    if charlist is None:
+        pat = _ascii_pat
+    else:
+        pat = re.compile(u'|'.join(map(unichr, charlist)))
+    return pat.sub('', txt)
+
diff --git a/src/calibre/utils/date.py b/src/calibre/utils/date.py
index ec58c49628..f025a0c9bf 100644
--- a/src/calibre/utils/date.py
+++ b/src/calibre/utils/date.py
@@ -151,3 +151,45 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
     format = re.sub('d{1,4}', format_day, format)
     format = re.sub('M{1,4}', format_month, format)
     return re.sub('yyyy|yy', format_year, format)
+
+def replace_months(datestr, clang):
+    # Replace month names with their English equivalents for parse_date
+    frtoen = {
+        u'[jJ]anvier': u'jan',
+        u'[fF].vrier': u'feb',
+        u'[mM]ars': u'mar',
+        u'[aA]vril': u'apr',
+        u'[mM]ai': u'may',
+        u'[jJ]uin': u'jun',
+        u'[jJ]uillet': u'jul',
+        u'[aA]o.t': u'aug',
+        u'[sS]eptembre': u'sep',
+        u'[Oo]ctobre': u'oct',
+        u'[nN]ovembre': u'nov',
+        u'[dD].cembre': u'dec' }
+    detoen = {
+        u'[jJ]anuar': u'jan',
+        u'[fF]ebruar': u'feb',
+        u'[mM].rz': u'mar',
+        u'[aA]pril': u'apr',
+        u'[mM]ai': u'may',
+        u'[jJ]uni': u'jun',
+        u'[jJ]uli': u'jul',
+        u'[aA]ugust': u'aug',
+        u'[sS]eptember': u'sep',
+        u'[Oo]ktober': u'oct',
+        u'[nN]ovember': u'nov',
+        u'[dD]ezember': u'dec' }
+
+    if clang == 'fr':
+        dictoen = frtoen
+    elif clang == 'de':
+        dictoen = detoen
+    else:
+        return datestr
+
+    for k in dictoen.iterkeys():
+        tmp = re.sub(k, dictoen[k], datestr)
+        if tmp != datestr: break
+    return tmp
+
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index 38542a44c6..7ec94c32ff 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -284,7 +284,7 @@ icu_upper(PyObject *self, PyObject *args) {
 
     PyMem_Free(input);
     return ret;
-}
+} // }}}
 
 // lower {{{
 static PyObject *
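
A minimal usage sketch of the two new helpers this patch introduces (not part of the patch itself; it assumes a calibre source tree on sys.path and Python 2, matching the code above, and the helper name parse_feed is illustrative only). Every fetcher repeats the same two idioms: parse scraped HTML with lxml's soupparser, retrying once with clean_ascii_chars() because lxml can refuse strings containing ASCII control characters, and normalise localised month names with replace_months() before handing the date to parse_date().

    from lxml.html import soupparser

    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.utils.date import parse_date, utcnow, replace_months

    def parse_feed(raw):
        # First try the markup as-is.
        try:
            return soupparser.fromstring(raw)
        except Exception:
            # Retry once with invalid ASCII control chars stripped;
            # give up and return None otherwise.
            try:
                return soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                return None

    # replace_months() maps French ('fr') or German ('de') month names to
    # English abbreviations; any other language code returns the string unchanged.
    d = replace_months(u'10 septembre 2010', 'fr')   # -> u'10 sep 2010'
    pubdate = parse_date(d, assume_utc=True, default=utcnow().replace(day=15))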