Initial implementation of basic metadata Amazon plugin

2026-05-20 22:12:37 -04:00 · 2010-12-06 11:51:41 -07:00
parent 51f48f0cb2 57e0e1820a
commit d43af28fcf
7 changed files with 1025 additions and 67 deletions
@@ -11,9 +11,9 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.devices.mime import mime_type_ext
 from calibre.devices.interface import BookList as _BookList
 from calibre.constants import preferred_encoding
-from calibre import isbytestring
+from calibre import isbytestring, force_unicode
 from calibre.utils.config import prefs, tweaks
-from calibre.utils.icu import sort_key, strcmp as icu_strcmp
+from calibre.utils.icu import strcmp

 class Book(Metadata):
    def __init__(self, prefix, lpath, size=None, other=None):
@@ -241,7 +241,7 @@ class CollectionsBookList(BookList):
            if y is None:
                return -1
            if isinstance(x, (unicode, str)):
-                c = strcmp(x, y)
+                c = strcmp(force_unicode(x), force_unicode(y))
            else:
                c = cmp(x, y)
            if c != 0:
@@ -0,0 +1,516 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+
+import sys, textwrap, re, traceback
+from urllib import urlencode
+from math import ceil
+
+from lxml import html
+from lxml.html import soupparser
+
+from calibre.utils.date import parse_date, utcnow, replace_months
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre import browser, preferred_encoding
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn, \
+    authors_to_sort_string
+from calibre.ebooks.metadata.fetch import MetadataSource
+from calibre.utils.config import OptionParser
+from calibre.library.comments import sanitize_comments_html
+
+
+class AmazonFr(MetadataSource):
+
+    name = 'Amazon French'
+    description = _('Downloads metadata from amazon.fr')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                                  self.isbn, max_results=10, verbose=self.verbose, lang='fr')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class AmazonEs(MetadataSource):
+
+    name = 'Amazon Spanish'
+    description = _('Downloads metadata from amazon.com in spanish')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                                  self.isbn, max_results=10, verbose=self.verbose, lang='es')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class AmazonEn(MetadataSource):
+
+    name = 'Amazon English'
+    description = _('Downloads metadata from amazon.com in english')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                                  self.isbn, max_results=10, verbose=self.verbose, lang='en')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class AmazonDe(MetadataSource):
+
+    name = 'Amazon German'
+    description = _('Downloads metadata from amazon.de')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                                  self.isbn, max_results=10, verbose=self.verbose, lang='de')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class Amazon(MetadataSource):
+
+    name = 'Amazon'
+    description = _('Downloads metadata from amazon.com')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Kovid Goyal & Sengian'
+    version = (1, 1, 0)
+    has_html_comments = True
+
+    def fetch(self):
+        # if not self.site_customization:
+            # return
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                                  self.isbn, max_results=10, verbose=self.verbose, lang='all')
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+    # @property
+    # def string_customization_help(self):
+        # return _('You can select here the language for metadata search with amazon.com')
+
+
+def report(verbose):
+    if verbose:
+        traceback.print_exc()
+
+
+class Query(object):
+
+    BASE_URL_ALL = 'http://www.amazon.com'
+    BASE_URL_FR = 'http://www.amazon.fr'
+    BASE_URL_DE = 'http://www.amazon.de'
+
+    def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None,
+        max_results=20, rlang='all'):
+        assert not(title is None and author is None and publisher is None \
+            and isbn is None and keywords is None)
+        assert (max_results < 21)
+
+        self.max_results = int(max_results)
+        self.renbres = re.compile(u'\s*(\d+)\s*')
+
+        q = {   'search-alias' : 'stripbooks' ,
+                'unfiltered' : '1',
+                'field-keywords' : '',
+                'field-author' : '',
+                'field-title' : '',
+                'field-isbn' : '',
+                'field-publisher' : ''
+                #get to amazon detailed search page to get all options
+                # 'node' : '',
+                # 'field-binding' : '',
+                #before, during, after
+                # 'field-dateop' : '',
+                #month as number
+                # 'field-datemod' : '',
+                # 'field-dateyear' : '',
+                #french only
+                # 'field-collection' : '',
+                #many options available
+            }
+
+        if rlang =='all':
+            q['sort'] = 'relevanceexprank'
+            self.urldata = self.BASE_URL_ALL
+        elif rlang =='es':
+            q['sort'] = 'relevanceexprank'
+            q['field-language'] = 'Spanish'
+            self.urldata = self.BASE_URL_ALL
+        elif rlang =='en':
+            q['sort'] = 'relevanceexprank'
+            q['field-language'] = 'English'
+            self.urldata = self.BASE_URL_ALL
+        elif rlang =='fr':
+            q['sort'] = 'relevancerank'
+            self.urldata = self.BASE_URL_FR
+        elif rlang =='de':
+            q['sort'] = 'relevancerank'
+            self.urldata = self.BASE_URL_DE
+        self.baseurl = self.urldata
+
+        if isbn is not None:
+            q['field-isbn'] = isbn.replace('-', '')
+        else:
+            if title is not None:
+                q['field-title'] = title
+            if author is not None:
+                q['field-author'] = author
+            if publisher is not None:
+                q['field-publisher'] = publisher
+            if keywords is not None:
+                q['field-keywords'] = keywords
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q)
+
+    def __call__(self, browser, verbose, timeout = 5.):
+        if verbose:
+            print 'Query:', self.urldata
+
+        try:
+            raw = browser.open_novisit(self.urldata, timeout=timeout).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            raise
+        if '<title>404 - ' in raw:
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+
+        try:
+            feed = soupparser.fromstring(raw)
+        except:
+            try:
+                #remove ASCII invalid chars
+                return soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None, self.urldata
+
+        #nb of page
+        try:
+            nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text)
+        except:
+            return None, self.urldata
+
+        pages =[feed]
+        if len(nbresults) > 1:
+            nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1])))
+            for i in xrange(2, nbpagetoquery + 1):
+                try:
+                    urldata = self.urldata + '&page=' + str(i)
+                    raw = browser.open_novisit(urldata, timeout=timeout).read()
+                except Exception, e:
+                    continue
+                if '<title>404 - ' in raw:
+                    continue
+                raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                        resolve_entities=True)[0]
+                try:
+                    feed = soupparser.fromstring(raw)
+                except:
+                    try:
+                        #remove ASCII invalid chars
+                        return soupparser.fromstring(clean_ascii_chars(raw))
+                    except:
+                        continue
+                pages.append(feed)
+
+        results = []
+        for x in pages:
+            results.extend([i.getparent().get('href') \
+                for i in x.xpath("//a/span[@class='srTitle']")])
+        return results[:self.max_results], self.baseurl
+
+class ResultList(list):
+
+    def __init__(self, baseurl, lang = 'all'):
+        self.baseurl = baseurl
+        self.lang = lang
+        self.repub = re.compile(u'\((.*)\)')
+        self.rerat = re.compile(u'([0-9.]+)')
+        self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>')
+        self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>')
+        self.recom = re.compile(r'(?s)<!--.*?-->')
+        self.republi = re.compile(u'(Editeur|Publisher|Verlag)', re.I)
+        self.reisbn = re.compile(u'(ISBN-10|ISBN-10|ASIN)', re.I)
+        self.relang = re.compile(u'(Language|Langue|Sprache)', re.I)
+        self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client|Durchschnittliche\s*Kundenbewertung)', re.I)
+        self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit|Produktinformation)', re.I)
+
+    def strip_tags_etree(self, etreeobj, invalid_tags):
+        for (itag, rmv) in invalid_tags.iteritems():
+            if rmv:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tree()
+            else:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tag()
+
+    def clean_entry(self, entry, invalid_tags = {'script': True},
+                invalid_id = (), invalid_class=()):
+        #invalid_tags: remove tag and keep content if False else remove
+        #remove tags
+        if invalid_tags:
+            self.strip_tags_etree(entry, invalid_tags)
+        #remove id
+        if invalid_id:
+            for eltid in invalid_id:
+                elt = entry.get_element_by_id(eltid)
+                if elt is not None:
+                    elt.drop_tree()
+        #remove class
+        if invalid_class:
+            for eltclass in invalid_class:
+                elts = entry.find_class(eltclass)
+                if elts is not None:
+                    for elt in elts:
+                        elt.drop_tree()
+
+    def get_title(self, entry):
+        title = entry.get_element_by_id('btAsinTitle')
+        if title is not None:
+            title = title.text
+        return unicode(title.replace('\n', '').strip())
+
+    def get_authors(self, entry):
+        author = entry.get_element_by_id('btAsinTitle')
+        while author.getparent().tag != 'div':
+            author = author.getparent()
+        author = author.getparent()
+        authortext = []
+        for x in author.getiterator('a'):
+            authortext.append(unicode(x.text_content().strip()))
+        return authortext
+
+    def get_description(self, entry, verbose):
+        try:
+            description = entry.get_element_by_id("productDescription").find("div[@class='content']")
+            inv_class = ('seeAll', 'emptyClear')
+            inv_tags ={'img': True, 'a': False}
+            self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
+            description = html.tostring(description, method='html', encoding=unicode).strip()
+            # remove all attributes from tags
+            description = self.reattr.sub(r'<\1>', description)
+            # Remove the notice about text referring to out of print editions
+            description = self.reoutp.sub('', description)
+            # Remove comments
+            description = self.recom.sub('', description)
+            return unicode(sanitize_comments_html(description))
+        except:
+            report(verbose)
+            return None
+
+    def get_tags(self, entry, browser, verbose):
+        try:
+            tags = entry.get_element_by_id('tagContentHolder')
+            testptag = tags.find_class('see-all')
+            if testptag:
+                for x in testptag:
+                    alink = x.xpath('descendant-or-self::a')
+                    if alink:
+                        if alink[0].get('class') == 'tgJsActive':
+                            continue
+                        link = self.baseurl + alink[0].get('href')
+                        entry = self.get_individual_metadata(browser, link, verbose)
+                        tags = entry.get_element_by_id('tagContentHolder')
+                        break
+            tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
+        except:
+            report(verbose)
+            tags = []
+        return tags
+
+    def get_book_info(self, entry, mi, verbose):
+        try:
+            entry = entry.get_element_by_id('SalesRank').getparent()
+        except:
+            try:
+                for z in entry.getiterator('h2'):
+                    if self.reprod.search(z.text_content()):
+                        entry = z.getparent().find("div[@class='content']/ul")
+                        break
+            except:
+                report(verbose)
+                return mi
+        elts = entry.findall('li')
+        #pub & date
+        elt = filter(lambda x: self.republi.search(x.find('b').text), elts)
+        if elt:
+            pub = elt[0].find('b').tail
+            mi.publisher = unicode(self.repub.sub('', pub).strip())
+            d = self.repub.search(pub)
+            if d is not None:
+                d = d.group(1)
+                try:
+                    default = utcnow().replace(day=15)
+                    if self.lang != 'all':
+                        d = replace_months(d, self.lang)
+                    d = parse_date(d, assume_utc=True, default=default)
+                    mi.pubdate = d
+                except:
+                    report(verbose)
+        #ISBN
+        elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts)
+        if elt:
+            isbn = elt[0].find('b').tail.replace('-', '').strip()
+            if check_isbn(isbn):
+                    mi.isbn = unicode(isbn)
+            elif len(elt) > 1:
+                isbn = elt[1].find('b').tail.replace('-', '').strip()
+                if check_isbn(isbn):
+                    mi.isbn = unicode(isbn)
+        #Langue
+        elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
+        if elt:
+            langue = elt[0].find('b').tail.strip()
+            if langue:
+                mi.language = unicode(langue)
+        #ratings
+        elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts)
+        if elt:
+            ratings = elt[0].find_class('swSprite')
+            if ratings:
+                ratings = self.rerat.findall(ratings[0].get('title'))
+                if len(ratings) == 2:
+                    mi.rating = float(ratings[0])/float(ratings[1]) * 5
+        return mi
+
+    def fill_MI(self, entry, title, authors, browser, verbose):
+        mi = MetaInformation(title, authors)
+        mi.author_sort = authors_to_sort_string(authors)
+        mi.comments = self.get_description(entry, verbose)
+        mi = self.get_book_info(entry, mi, verbose)
+        mi.tags = self.get_tags(entry, browser, verbose)
+        return mi
+
+    def get_individual_metadata(self, browser, linkdata, verbose):
+        try:
+            raw = browser.open_novisit(linkdata).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            raise
+        if '<title>404 - ' in raw:
+            report(verbose)
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            return soupparser.fromstring(raw)
+        except:
+            try:
+                #remove ASCII invalid chars
+                return soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                report(verbose)
+                return
+
+    def populate(self, entries, browser, verbose=False):
+        for x in entries:
+            try:
+                entry = self.get_individual_metadata(browser, x, verbose)
+                # clean results
+                # inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop')
+                # inv_class = ('buyingDetailsGrid', 'productImageGrid')
+                # inv_tags ={'script': True, 'style': True, 'form': False}
+                # self.clean_entry(entry, invalid_id=inv_ids)
+                title = self.get_title(entry)
+                authors = self.get_authors(entry)
+            except Exception, e:
+                if verbose:
+                    print 'Failed to get all details for an entry'
+                    print e
+                    print 'URL who failed:', x
+                    report(verbose)
+                continue
+            self.append(self.fill_MI(entry, title, authors, browser, verbose))
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+           max_results=5, verbose=False, keywords=None, lang='all'):
+    br = browser()
+    entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher,
+        keywords=keywords, max_results=max_results,rlang=lang)(br, verbose)
+
+    if entries is None or len(entries) == 0:
+        return
+
+    #List of entry
+    ans = ResultList(baseurl, lang)
+    ans.populate(entries, br, verbose)
+    return ans
+
+def option_parser():
+    parser = OptionParser(textwrap.dedent(\
+    _('''\
+        %prog [options]
+
+        Fetch book metadata from Amazon. You must specify one of title, author,
+        ISBN, publisher or keywords. Will fetch a maximum of 10 matches,
+        so you should make your query as specific as possible.
+        You can chose the language for metadata retrieval:
+        All & english & french & german & spanish
+    '''
+    )))
+    parser.add_option('-t', '--title', help='Book title')
+    parser.add_option('-a', '--author', help='Book author(s)')
+    parser.add_option('-p', '--publisher', help='Book publisher')
+    parser.add_option('-i', '--isbn', help='Book ISBN')
+    parser.add_option('-k', '--keywords', help='Keywords')
+    parser.add_option('-m', '--max-results', default=10,
+                      help='Maximum number of results to fetch')
+    parser.add_option('-l', '--lang', default='all',
+                      help='Chosen language for metadata search (all, en, fr, es, de)')
+    parser.add_option('-v', '--verbose', default=0, action='count',
+                      help='Be more verbose about errors')
+    return parser
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    try:
+        results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
+            keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results,
+                lang=opts.lang)
+    except AssertionError:
+        report(True)
+        parser.print_help()
+        return 1
+    if results is None or len(results) == 0:
+        print 'No result found for this search!'
+        return 0
+    for result in results:
+        print unicode(result).encode(preferred_encoding, 'replace')
+        print
+
+# Run the command line tool when executed as a script; the value returned
+# by main() becomes the process exit status
+if __name__ == '__main__':
+    sys.exit(main())
@@ -0,0 +1,390 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import sys, textwrap, re, traceback, socket
+from urllib import urlencode
+
+from lxml.html import soupparser, tostring
+
+from calibre import browser, preferred_encoding
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn, \
+    authors_to_sort_string
+from calibre.library.comments import sanitize_comments_html
+from calibre.ebooks.metadata.fetch import MetadataSource
+from calibre.utils.config import OptionParser
+from calibre.utils.date import parse_date, utcnow
+from calibre.utils.cleantext import clean_ascii_chars
+
+class Fictionwise(MetadataSource): # {{{
+
+    author = 'Sengian'
+    name = 'Fictionwise'
+    description = _('Downloads metadata from Fictionwise')
+
+    has_html_comments = True
+
+    def fetch(self):
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose)
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+    # }}}
+
+class FictionwiseError(Exception):
+    pass
+
+def report(verbose):
+    if verbose:
+        traceback.print_exc()
+
+class Query(object):
+
+    BASE_URL = 'http://www.fictionwise.com/servlet/mw'
+
+    def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20):
+        assert not(title is None and author is None and publisher is None and keywords is None)
+        assert (max_results < 21)
+
+        self.max_results = int(max_results)
+        q = {   'template' : 'searchresults_adv.htm' ,
+                'searchtitle' : '',
+                'searchauthor' : '',
+                'searchpublisher' : '',
+                'searchkeyword' : '',
+                #possibilities startoflast, fullname, lastfirst
+                'searchauthortype' : 'startoflast',
+                'searchcategory' : '',
+                'searchcategory2' : '',
+                'searchprice_s' : '0',
+                'searchprice_e' : 'ANY',
+                'searchformat' : '',
+                'searchgeo' : 'US',
+                'searchfwdatetype' : '',
+                #maybe use dates fields if needed?
+                #'sortorder' : 'DESC',
+                #many options available: b.SortTitle, a.SortName,
+                #b.DateFirstPublished, b.FWPublishDate
+                'sortby' : 'b.SortTitle'
+            }
+        if title is not None:
+            q['searchtitle'] = title
+        if author is not None:
+            q['searchauthor'] = author
+        if publisher is not None:
+            q['searchpublisher'] = publisher
+        if keywords is not None:
+            q['searchkeyword'] = keywords
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        self.urldata = urlencode(q)
+
+    def __call__(self, browser, verbose, timeout = 5.):
+        if verbose:
+            print _('Query: %s') % self.BASE_URL+self.urldata
+
+        try:
+            raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
+            raise FictionwiseError(_('Fictionwise encountered an error.'))
+        if '<title>404 - ' in raw:
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            feed = soupparser.fromstring(raw)
+        except:
+            try:
+                #remove ASCII invalid chars
+                feed = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None
+
+        # get list of results as links
+        results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]")
+        results = results[:self.max_results]
+        results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results]
+        #return feed if no links ie normally a single book or nothing
+        if not results:
+            results = [feed]
+        return results
+
+class ResultList(list):
+
+    BASE_URL = 'http://www.fictionwise.com'
+    COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0}
+
+    def __init__(self):
+        self.retitle = re.compile(r'\[[^\[\]]+\]')
+        self.rechkauth = re.compile(r'.*book\s*by', re.I)
+        self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I)
+        self.repub = re.compile(r'.*publisher\s*:\s*', re.I)
+        self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I)
+        self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I)
+        self.resplitbr = re.compile(r'<br[^>]*>', re.I)
+        self.recomment = re.compile(r'(?s)<!--.*?-->')
+        self.reimg = re.compile(r'<img[^>]*>', re.I)
+        self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I)
+        self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:')
+        self.recolor = re.compile('(?P<ncolor>[^/]+).gif')
+        self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I)
+        self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I)
+
+    def strip_tags_etree(self, etreeobj, invalid_tags):
+        for (itag, rmv) in invalid_tags.iteritems():
+            if rmv:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tree()
+            else:
+                for elts in etreeobj.getiterator(itag):
+                    elts.drop_tag()
+
+    def clean_entry(self, entry, invalid_tags = {'script': True},
+                invalid_id = (), invalid_class=(), invalid_xpath = ()):
+        #invalid_tags: remove tag and keep content if False else remove
+        #remove tags
+        if invalid_tags:
+            self.strip_tags_etree(entry, invalid_tags)
+        #remove xpath
+        if invalid_xpath:
+            for eltid in invalid_xpath:
+                elt = entry.xpath(eltid)
+                for el in elt:
+                    el.drop_tree()
+        #remove id
+        if invalid_id:
+            for eltid in invalid_id:
+                elt = entry.get_element_by_id(eltid)
+                if elt is not None:
+                    elt.drop_tree()
+        #remove class
+        if invalid_class:
+            for eltclass in invalid_class:
+                elts = entry.find_class(eltclass)
+                if elts is not None:
+                    for elt in elts:
+                        elt.drop_tree()
+
+    def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
+        out = tostring(entry, pretty_print=prettyout)
+        #try to work around tostring to remove this encoding for exemle
+        reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)')
+        return reclean.sub('', out)
+
+    def get_title(self, entry):
+        title = entry.findtext('./')
+        return self.retitle.sub('', title).strip()
+
+    def get_authors(self, entry):
+        authortext = entry.find('./br').tail
+        if not self.rechkauth.search(authortext):
+            return []
+        authortext = self.rechkauth.sub('', authortext)
+        return [a.strip() for a in authortext.split('&')]
+
+    def get_rating(self, entrytable, verbose):
+        nbcomment = tostring(entrytable.getprevious())
+        try:
+            nbcomment = self.renbcom.search(nbcomment).group("nbcom")
+        except:
+            report(verbose)
+            return None
+        hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")],
+                    float(image.get('height', default=0))) \
+                        for image in entrytable.getiterator('img'))
+        #ratings as x/5
+        return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()))
+
+    def get_description(self, entry):
+        description = self.output_entry(entry.xpath('./p')[1],htmlrm="")
+        description = self.redesc.search(description)
+        if not description or not description.group("desc"):
+            return None
+        #remove invalid tags
+        description = self.reimg.sub('', description.group("desc"))
+        description = self.recomment.sub('', description)
+        description = self.resanitize.sub('', sanitize_comments_html(description))
+        return _('SUMMARY:\n %s') % re.sub(r'\n\s+</p>','\n</p>', description)
+
+    def get_publisher(self, entry):
+        publisher = self.output_entry(entry.xpath('./p')[1])
+        publisher = filter(lambda x: self.repub.search(x) is not None,
+            self.resplitbr.split(publisher))
+        if not len(publisher):
+            return None
+        publisher = self.repub.sub('', publisher[0])
+        return publisher.split(',')[0].strip()
+
+    def get_tags(self, entry):
+        tag = self.output_entry(entry.xpath('./p')[1])
+        tag = filter(lambda x: self.retag.search(x) is not None,
+            self.resplitbr.split(tag))
+        if not len(tag):
+            return []
+        return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/'))
+
+    def get_date(self, entry, verbose):
+        date = self.output_entry(entry.xpath('./p')[1])
+        date = filter(lambda x: self.redate.search(x) is not None,
+            self.resplitbr.split(date))
+        if not len(date):
+            return None
+        try:
+            d = self.redate.sub('', date[0])
+            if d:
+                default = utcnow().replace(day=15)
+                d = parse_date(d, assume_utc=True, default=default)
+            else:
+                d = None
+        except:
+            report(verbose)
+            d = None
+        return d
+
+    def get_ISBN(self, entry):
+        isbns = self.output_entry(entry.xpath('./p')[2])
+        isbns = filter(lambda x: self.reisbn.search(x) is not None,
+            self.resplitbrdiv.split(isbns))
+        if not len(isbns):
+            return None
+        isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
+        return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1]
+
+    def fill_MI(self, entry, title, authors, ratings, verbose):
+        '''
+        Build and return a MetaInformation for entry.
+
+        title, authors and ratings are stored as given; comments,
+        publisher, tags, publication date and ISBN are read from entry
+        with the matching get_* methods.
+        '''
+        record = MetaInformation(title, authors)
+        record.rating = ratings
+        record.comments = self.get_description(entry)
+        record.publisher = self.get_publisher(entry)
+        record.tags = self.get_tags(entry)
+        record.pubdate = self.get_date(entry, verbose)
+        record.isbn = self.get_ISBN(entry)
+        record.author_sort = authors_to_sort_string(authors)
+        return record
+
+    def get_individual_metadata(self, browser, linkdata, verbose):
+        '''
+        Download self.BASE_URL + linkdata and return it parsed by soupparser.
+
+        Returns None when the server answers 404 (status code or a 404
+        title in the page) and when the page cannot be parsed even after
+        clean_ascii_chars(). Any other download failure raises
+        FictionwiseError.
+        '''
+        try:
+            raw = browser.open_novisit(self.BASE_URL + linkdata).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            # args may be missing or empty; fall back to (None,) so that
+            # indexing cannot raise IndexError and hide the real failure.
+            args = getattr(e, 'args', None) or (None,)
+            if isinstance(args[0], socket.timeout):
+                raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
+            raise FictionwiseError(_('Fictionwise encountered an error.'))
+        if '<title>404 - ' in raw:
+            report(verbose)
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            return soupparser.fromstring(raw)
+        except Exception:
+            try:
+                #remove ASCII invalid chars
+                return soupparser.fromstring(clean_ascii_chars(raw))
+            except Exception:
+                return None
+
+    def populate(self, entries, browser, verbose=False):
+        '''
+        Append one MetaInformation to this list per usable entry.
+
+        entries is either a one-element list holding an object that can
+        be queried with xpath() directly, or a list of link strings that
+        are each fetched with get_individual_metadata(). An entry whose
+        details cannot be read is skipped, with a message when verbose
+        is set. Nothing is done when entries is None or empty.
+        '''
+        inv_tags = {'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
+            'ul': False, 'span': False}
+        inv_xpath = ('./table',)
+        details = "//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td"
+        if entries is None or len(entries) == 0:
+            return
+        # A lone non-string item is used as the page itself rather than as
+        # a link. basestring also covers unicode links, which still have
+        # to be fetched.
+        single = len(entries) == 1 and not isinstance(entries[0], basestring)
+        for x in entries:
+            try:
+                if single:
+                    page = x
+                else:
+                    page = self.get_individual_metadata(browser, x, verbose)
+                # xpath() returns a list; the details cell is its first item.
+                entry = page.xpath(details)[0]
+                self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
+                title = self.get_title(entry)
+                #maybe strengthen the search
+                ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
+                authors = self.get_authors(entry)
+            except Exception, e:
+                if verbose:
+                    print _('Failed to get all details for an entry')
+                    print e
+                continue
+            self.append(self.fill_MI(entry, title, authors, ratings, verbose))
+
+def search(title=None, author=None, publisher=None, isbn=None,
+           min_viewability='none', verbose=False, max_results=5,
+            keywords=None):
+    '''
+    Run a query and return a ResultList with the matches.
+
+    title, author, publisher, keywords and max_results are passed to
+    Query; isbn and min_viewability are accepted but not used here.
+    The returned ResultList is empty when the query yields nothing.
+    '''
+    br = browser()
+    entries = Query(title=title, author=author, publisher=publisher,
+        keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.)
+
+    #List of entry
+    ans = ResultList()
+    # populate() calls len(entries), so a None result would raise
+    # TypeError there; return the empty list instead.
+    if entries is not None and len(entries) > 0:
+        ans.populate(entries, br, verbose)
+    return ans
+
+
+def option_parser():
+    '''Return the OptionParser describing the command line options.'''
+    parser = OptionParser(textwrap.dedent(\
+    _('''\
+        %prog [options]
+
+        Fetch book metadata from Fictionwise. You must specify one of title, author,
+        or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches,
+        so you should make your query as specific as possible.
+    ''')
+    ))
+    parser.add_option('-t', '--title', help=_('Book title'))
+    parser.add_option('-a', '--author', help=_('Book author(s)'))
+    parser.add_option('-p', '--publisher', help=_('Book publisher'))
+    parser.add_option('-k', '--keywords', help=_('Keywords'))
+    # type='int' so that a value given on the command line is converted
+    # to an integer like the default, instead of being kept as a string.
+    parser.add_option('-m', '--max-results', default=20, type='int',
+                      help=_('Maximum number of results to fetch'))
+    parser.add_option('-v', '--verbose', default=0, action='count',
+                      help=_('Be more verbose about errors'))
+    return parser
+
+def main(args=sys.argv):
+    '''
+    Command line entry point: run a search and print every result.
+
+    Returns 1 (after printing the help) when search() raises
+    AssertionError, 0 when nothing was found, and None once the results
+    have been printed.
+    '''
+    parser = option_parser()
+    options = parser.parse_args(args)[0]
+    try:
+        found = search(options.title, options.author,
+            publisher=options.publisher, keywords=options.keywords,
+            verbose=options.verbose, max_results=options.max_results)
+    except AssertionError:
+        report(True)
+        parser.print_help()
+        return 1
+    if found is None or len(found) == 0:
+        print _('No result found for this search!')
+        return 0
+    for item in found:
+        print unicode(item).encode(preferred_encoding, 'replace')
+        print
+
+if __name__ == '__main__':
+    sys.exit(main())
@@ -10,7 +10,8 @@ from copy import deepcopy

 from lxml.html import soupparser

-from calibre.utils.date import parse_date, utcnow
+from calibre.utils.date import parse_date, utcnow, replace_months
+from calibre.utils.cleantext import clean_ascii_chars
 from calibre import browser, preferred_encoding
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.metadata import MetaInformation, check_isbn, \
@@ -71,31 +72,16 @@ class NiceBooksCovers(CoverDownload):
                traceback.format_exc(), self.name))


+class NiceBooksError(Exception):
+    # Error raised by this module when a Nicebooks request times out or
+    # fails; ISBNNotFound derives from it.
+    pass
+
+class ISBNNotFound(NiceBooksError):
+    # Raised by the cover code with an 'ISBN: ... not found.' message.
+    pass
+
 def report(verbose):
    if verbose:
-        import traceback
        traceback.print_exc()

-def replace_monthsfr(datefr):
-    # Replace french months by english equivalent for parse_date
-    frtoen = {
-        u'[jJ]anvier': u'jan',
-        u'[fF].vrier': u'feb',
-        u'[mM]ars': u'mar',
-        u'[aA]vril': u'apr',
-        u'[mM]ai': u'may',
-        u'[jJ]uin': u'jun',
-        u'[jJ]uillet': u'jul',
-        u'[aA]o.t': u'aug',
-        u'[sS]eptembre': u'sep',
-        u'[Oo]ctobre': u'oct',
-        u'[nN]ovembre': u'nov',
-        u'[dD].cembre': u'dec' }
-    for k in frtoen.iterkeys():
-        tmp = re.sub(k, frtoen[k], datefr)
-        if tmp <> datefr: break
-    return tmp
-
 class Query(object):

    BASE_URL = 'http://fr.nicebooks.com/'
@@ -119,7 +105,7 @@ class Query(object):

    def __call__(self, browser, verbose, timeout = 5.):
        if verbose:
-            print 'Query:', self.BASE_URL+self.urldata
+            print _('Query: %s') % self.BASE_URL+self.urldata

        try:
            raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
@@ -128,7 +114,9 @@ class Query(object):
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return
-            raise
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
+            raise NiceBooksError(_('Nicebooks encountered an error.'))
        if '<title>404 - ' in raw:
            return
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
@@ -136,7 +124,11 @@ class Query(object):
        try:
            feed = soupparser.fromstring(raw)
        except:
-            return
+            try:
+                #remove ASCII invalid chars
+                feed = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None

        #nb of page to call
        try:
@@ -161,7 +153,11 @@ class Query(object):
                try:
                    feed = soupparser.fromstring(raw)
                except:
-                    continue
+                    try:
+                        #remove ASCII invalid chars
+                        feed = soupparser.fromstring(clean_ascii_chars(raw))
+                    except:
+                        continue
                pages.append(feed)

        results = []
@@ -180,14 +176,12 @@ class ResultList(list):
        self.reautclean = re.compile(u'\s*\(.*\)\s*')

    def get_title(self, entry):
-        # title = deepcopy(entry.find("div[@id='book-info']"))
        title = deepcopy(entry)
        title.remove(title.find("dl[@title='Informations sur le livre']"))
        title = ' '.join([i.text_content() for i in title.iterchildren()])
        return unicode(title.replace('\n', ''))

    def get_authors(self, entry):
-        # author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
        author = entry.find("dl[@title='Informations sur le livre']")
        authortext = []
        for x in author.getiterator('dt'):
@@ -223,7 +217,7 @@ class ResultList(list):
                d = x.getnext().text_content()
                try:
                    default = utcnow().replace(day=15)
-                    d = replace_monthsfr(d)
+                    d = replace_months(d, 'fr')
                    d = parse_date(d, assume_utc=True, default=default)
                    mi.pubdate = d
                except:
@@ -234,11 +228,6 @@ class ResultList(list):
        mi = MetaInformation(title, authors)
        mi.author_sort = authors_to_sort_string(authors)
        mi.comments = self.get_description(entry, verbose)
-        # entry = entry.find("dl[@title='Informations sur le livre']")
-        # mi.publisher = self.get_publisher(entry)
-        # mi.pubdate = self.get_date(entry, verbose)
-        # mi.isbn = self.get_ISBN(entry)
-        # mi.language = self.get_language(entry)
        return self.get_book_info(entry, mi, verbose)

    def get_individual_metadata(self, browser, linkdata, verbose):
@@ -249,7 +238,9 @@ class ResultList(list):
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return
-            raise
+            if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
+            raise NiceBooksError(_('Nicebooks encountered an error.'))
        if '<title>404 - ' in raw:
            report(verbose)
            return
@@ -258,7 +249,11 @@ class ResultList(list):
        try:
            feed = soupparser.fromstring(raw)
        except:
-            return
+            try:
+                #remove ASCII invalid chars
+                feed = soupparser.fromstring(clean_ascii_chars(raw))
+            except:
+                return None

        # get results
        return feed.xpath("//div[@id='container']")[0]
@@ -292,13 +287,6 @@ class ResultList(list):
                    continue
                self.append(self.fill_MI(entry, title, authors, verbose))

-
-class NiceBooksError(Exception):
-    pass
-
-class ISBNNotFound(NiceBooksError):
-    pass
-
 class Covers(object):

    def __init__(self, isbn = None):
@@ -329,11 +317,10 @@ class Covers(object):
            return cover, ext if ext else 'jpg'
        except Exception, err:
            if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
-                err = NiceBooksError(_('Nicebooks timed out. Try again later.'))
-                raise err
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
            if not len(self.urlimg):
                if not self.isbnf:
-                    raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
+                    raise ISBNNotFound(_('ISBN: %s not found.') % self.isbn)
                raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher'))


@@ -341,10 +328,10 @@ def search(title=None, author=None, publisher=None, isbn=None,
           max_results=5, verbose=False, keywords=None):
    br = browser()
    entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
-        keywords=keywords, max_results=max_results)(br, verbose)
+        keywords=keywords, max_results=max_results)(br, verbose,timeout = 10.)

    if entries is None or len(entries) == 0:
-        return
+        return None

    #List of entry
    ans = ResultList()
@@ -364,28 +351,28 @@ def cover_from_isbn(isbn, timeout = 5.):

 def option_parser():
    parser = OptionParser(textwrap.dedent(\
-    '''\
+    _('''\
        %prog [options]

        Fetch book metadata from Nicebooks. You must specify one of title, author,
        ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
        so you should make your query as specific as possible.
        It can also get covers if the option is activated.
-    '''
+    ''')
    ))
-    parser.add_option('-t', '--title', help='Book title')
-    parser.add_option('-a', '--author', help='Book author(s)')
-    parser.add_option('-p', '--publisher', help='Book publisher')
-    parser.add_option('-i', '--isbn', help='Book ISBN')
-    parser.add_option('-k', '--keywords', help='Keywords')
+    parser.add_option('-t', '--title', help=_('Book title'))
+    parser.add_option('-a', '--author', help=_('Book author(s)'))
+    parser.add_option('-p', '--publisher', help=_('Book publisher'))
+    parser.add_option('-i', '--isbn', help=_('Book ISBN'))
+    parser.add_option('-k', '--keywords', help=_('Keywords'))
    parser.add_option('-c', '--covers', default=0,
-                      help='Covers: 1-Check/ 2-Download')
+                      help=_('Covers: 1-Check/ 2-Download'))
    parser.add_option('-p', '--coverspath', default='',
-                      help='Covers files path')
+                      help=_('Covers files path'))
    parser.add_option('-m', '--max-results', default=20,
-                      help='Maximum number of results to fetch')
+                      help=_('Maximum number of results to fetch'))
    parser.add_option('-v', '--verbose', default=0, action='count',
-                      help='Be more verbose about errors')
+                      help=_('Be more verbose about errors'))
    return parser

 def main(args=sys.argv):
@@ -400,15 +387,15 @@ def main(args=sys.argv):
        parser.print_help()
        return 1
    if results is None or len(results) == 0:
-        print 'No result found for this search!'
+        print _('No result found for this search!')
        return 0
    for result in results:
        print unicode(result).encode(preferred_encoding, 'replace')
        covact = int(opts.covers)
        if  covact == 1:
-            textcover = 'No cover found!'
+            textcover = _('No cover found!')
            if check_for_cover(result.isbn):
-                textcover = 'A cover was found for this book'
+                textcover = _('A cover was found for this book')
            print textcover
        elif covact == 2:
            cover_data, ext = cover_from_isbn(result.isbn)
@@ -417,7 +404,7 @@ def main(args=sys.argv):
                cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)
            oname = os.path.abspath(cpath+'.'+ext)
            open(oname, 'wb').write(cover_data)
-            print 'Cover saved to file ', oname
+            print _('Cover saved to file '), oname
        print

 if __name__ == '__main__':
@@ -0,0 +1,23 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+# Compiled pattern for the default character set, built on first use.
+_ascii_pat = None
+
+def clean_ascii_chars(txt, charlist=None):
+    '''
+    Remove control characters from txt and return the result.
+
+    By default the code points 0-8, 11, 14-24, 26 and 27 are removed.
+    charlist, if given, is a sequence of code points to remove instead
+    of the default set.
+    '''
+    global _ascii_pat
+    if _ascii_pat is None:
+        # range(9) so that 8 (backspace) is part of the set, as documented.
+        chars = list(range(9)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \
+            + [0x1A, 0x1B]
+        _ascii_pat = re.compile(u'|'.join(map(unichr, chars)))
+
+    if charlist is None:
+        pat = _ascii_pat
+    else:
+        # Escape every character: a code point such as ord('.') or
+        # ord('|') would otherwise be read as regular expression syntax.
+        pat = re.compile(u'|'.join([re.escape(unichr(c)) for c in charlist]))
+    return pat.sub('', txt)
+
@@ -151,3 +151,45 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
    format = re.sub('d{1,4}', format_day, format)
    format = re.sub('M{1,4}', format_month, format)
    return re.sub('yyyy|yy', format_year, format)
+
+def replace_months(datestr, clang):
+    '''
+    Return datestr with a French ('fr') or German ('de') month name
+    replaced by its three letter English abbreviation, for parse_date.
+
+    Only the first pattern that changes the string is applied. For any
+    other value of clang, datestr is returned unchanged.
+    '''
+    frtoen = {
+        u'[jJ]anvier': u'jan',
+        u'[fF].vrier': u'feb',
+        u'[mM]ars': u'mar',
+        u'[aA]vril': u'apr',
+        u'[mM]ai': u'may',
+        u'[jJ]uin': u'jun',
+        u'[jJ]uillet': u'jul',
+        u'[aA]o.t': u'aug',
+        u'[sS]eptembre': u'sep',
+        u'[Oo]ctobre': u'oct',
+        u'[nN]ovembre': u'nov',
+        u'[dD].cembre': u'dec' }
+    detoen = {
+        u'[jJ]anuar': u'jan',
+        u'[fF]ebruar': u'feb',
+        u'[mM].rz': u'mar',
+        u'[aA]pril': u'apr',
+        u'[mM]ai': u'may',
+        u'[jJ]uni': u'jun',
+        u'[jJ]uli': u'jul',
+        u'[aA]ugust': u'aug',
+        u'[sS]eptember': u'sep',
+        u'[Oo]ktober': u'oct',
+        u'[nN]ovember': u'nov',
+        u'[dD]ezember': u'dec' }
+
+    # Unsupported language: nothing to translate.
+    if clang not in ('fr', 'de'):
+        return datestr
+    table = frtoen if clang == 'fr' else detoen
+
+    for pattern, english in table.iteritems():
+        replaced = re.sub(pattern, english, datestr)
+        if replaced != datestr:
+            # Stop at the first month name that was found.
+            return replaced
+    return datestr
+
@@ -284,7 +284,7 @@ icu_upper(PyObject *self, PyObject *args) {
    PyMem_Free(input);

    return ret;
-}
+} // }}}

 // lower {{{
 static PyObject *