From fc524ee7d4eccbe6f8a0ae63f56a06caaf248fa0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 21 Nov 2010 09:52:40 -0700
Subject: [PATCH] Metadata and cover download plugins from Nicebooks

---
 src/calibre/customize/builtins.py        |   6 +-
 src/calibre/customize/ui.py              |   2 +-
 src/calibre/ebooks/metadata/nicebooks.py | 424 +++++++++++++++++++++++
 src/calibre/gui2/wizard/__init__.py      |  12 +-
 4 files changed, 437 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/ebooks/metadata/nicebooks.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 681d953c9b..87946706cf 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -483,6 +483,7 @@ from calibre.devices.kobo.driver import KOBO
 from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing
 from calibre.ebooks.metadata.douban import DoubanBooks
+from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
         LibraryThingCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
@@ -490,8 +491,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
-        LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
+        LibraryThing, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+        NiceBooksCovers]
 plugins += [
     ComicInput,
     EPUBInput,
diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py
index 844269e453..c360122842 100644
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep
 
 default_disabled_plugins = set([
-    'Douban Books', 'Douban.com covers',
+    'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
 ])
 
 def is_disabled(plugin):
diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py
new file mode 100644
index 0000000000..4d19e9611b
--- /dev/null
+++ b/src/calibre/ebooks/metadata/nicebooks.py
@@ -0,0 +1,424 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import sys, textwrap, re, traceback, socket
+from urllib import urlencode
+from math import ceil
+from copy import deepcopy
+
+from lxml.html import soupparser
+
+from calibre.utils.date import parse_date, utcnow
+from calibre import browser, preferred_encoding
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn, \
+    authors_to_sort_string
+from calibre.ebooks.metadata.fetch import MetadataSource
+from calibre.ebooks.metadata.covers import CoverDownload
+from calibre.utils.config import OptionParser
+
+class NiceBooks(MetadataSource):
+    '''Metadata source plugin; fetch() stores errors instead of raising.'''
+    name                = 'Nicebooks'
+    author              = 'Sengian'
+    version             = (1, 0, 0)
+    description         = _('Downloads metadata from french Nicebooks')
+    supported_platforms = ['windows', 'osx', 'linux']
+
+    def fetch(self):
+        try:
+            self.results = search(title=self.title, author=self.book_author,
+                publisher=self.publisher, isbn=self.isbn, max_results=10,
+                verbose=self.verbose)
+        except Exception, err:
+            self.exception, self.tb = err, traceback.format_exc()
+
+class NiceBooksCovers(CoverDownload):
+    '''Cover download plugin that looks covers up on Nicebooks by ISBN.'''
+    name                = 'Nicebooks covers'
+    author              = 'Sengian'
+    version             = (1, 0, 0)
+    description         = _('Downloads covers from french Nicebooks')
+    type                = _('Cover download')
+    supported_platforms = ['windows', 'osx', 'linux']
+
+    def has_cover(self, mi, ans, timeout=5.):
+        if not mi.isbn:
+            return False
+        br = browser()
+        try:
+            # Only the first hit is inspected; any failure is just logged
+            hits = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)
+            if Covers(mi.isbn)(hits[0]).check_cover():
+                self.debug('cover for', mi.isbn, 'found')
+                ans.set()
+        except Exception, err:
+            self.debug(err)
+
+    def get_covers(self, mi, result_queue, abort, timeout=5.):
+        if not mi.isbn:
+            return
+        br = browser()
+        try:
+            hits = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)
+            data, ext = Covers(mi.isbn)(hits[0]).get_cover(br, timeout)
+            result_queue.put((True, data, ext or 'jpg', self.name))
+        except Exception, err:
+            # Failures are reported through the queue as well
+            failure = self.exception_to_string(err), traceback.format_exc()
+            result_queue.put((False,) + failure + (self.name,))
+
+
+def report(verbose):
+    '''Print the traceback of the current exception when verbose is set.'''
+    if verbose:
+        traceback.print_exc()
+
+def replace_monthsfr(datefr):
+    '''
+    Return datefr with a french month name replaced by the english three
+    letter abbreviation, for parse_date. The first pattern that changes
+    the string wins; a string without a month name comes back unchanged.
+    '''
+    frtoen = {
+        u'[jJ]anvier': u'jan', u'[fF].vrier': u'feb',
+        u'[mM]ars': u'mar', u'[aA]vril': u'apr',
+        u'[mM]ai': u'may', u'[jJ]uin': u'jun',
+        u'[jJ]uillet': u'jul', u'[aA]o.t': u'aug',
+        u'[sS]eptembre': u'sep', u'[Oo]ctobre': u'oct',
+        u'[nN]ovembre': u'nov', u'[dD].cembre': u'dec',
+        }
+    for pattern, english in frtoen.iteritems():
+        translated = re.sub(pattern, english, datefr)
+        if translated != datefr:
+            break
+    return translated
+
+class Query(object):
+    '''Builds a Nicebooks search URL; calling the instance runs the search.'''
+
+    BASE_URL = 'http://fr.nicebooks.com/'
+
+    def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20):
+        assert not(title is None and author is None and publisher is None \
+            and isbn is None and keywords is None)
+        # Convert before checking: in Python 2 a str (e.g. a command line
+        # value) compares greater than any int, so the check always failed
+        self.max_results = int(max_results)
+        assert (self.max_results < 21)
+
+        if isbn is not None:
+            q = isbn
+        else:
+            q = ' '.join([i for i in (title, author, publisher, keywords) \
+                if i is not None])
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'})
+
+    def __call__(self, browser, verbose, timeout = 5.):
+        '''Return None for a missing/unparsable page, [page] for a direct hit,
+        else a list of at most max_results result links (href values).'''
+        if verbose:
+            print 'Query:', self.BASE_URL+self.urldata
+
+        try:
+            raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            raise
+        if '<title>404 - ' in raw:
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            feed = soupparser.fromstring(raw)
+        except Exception:
+            return
+
+        try:
+            nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
+        except Exception:
+            # no result count on the page: direct hit
+            return [feed]
+
+        nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
+        pages = [feed]
+        # Best effort: a page that cannot be fetched or parsed is skipped
+        for i in xrange(2, nbpagetoquery + 1):
+            try:
+                urldata = self.urldata + '&p=' + str(i)
+                raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read()
+                if '<title>404 - ' in raw:
+                    continue
+                raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                        resolve_entities=True)[0]
+                pages.append(soupparser.fromstring(raw))
+            except Exception:
+                continue
+
+        results = []
+        for x in pages:
+            # an item without a title link is skipped instead of raising
+            for item in x.xpath("//ul[@id='results']/li"):
+                results.extend(a.get('href') for a in item.find_class('title')[:1])
+        return results[:self.max_results]
+
+class ResultList(list):
+    '''List of MetaInformation objects built from Nicebooks book pages.'''
+
+    BASE_URL = 'http://fr.nicebooks.com'
+
+    def __init__(self):
+        self.repub = re.compile(u'\s*.diteur\s*', re.I)
+        self.reauteur = re.compile(u'\s*auteur.*', re.I)
+        self.reautclean = re.compile(u'\s*\(.*\)\s*')
+
+    def get_title(self, entry):
+        # The title is the text of entry without its details list; work on a
+        # copy so that entry keeps the list for the other getters
+        title = deepcopy(entry)
+        title.remove(title.find("dl[@title='Informations sur le livre']"))
+        title = ' '.join([i.text_content() for i in title.iterchildren()])
+        return unicode(title.replace('\n', ''))
+
+    def get_authors(self, entry):
+        '''Return the text of the <dd> elements following the author <dt>.'''
+        author = entry.find("dl[@title='Informations sur le livre']")
+        authortext = []
+        for x in author.getiterator('dt'):
+            # x.text is None for a <dt> that has no leading text
+            if self.reauteur.match(x.text or u''):
+                elt = x.getnext()
+                # getnext() returns None after the last sibling
+                while elt is not None and elt.tag == 'dd':
+                    authortext.append(unicode(elt.text_content()))
+                    elt = elt.getnext()
+                break
+        if len(authortext) == 1:
+            # a single author entry: drop any parenthesised text
+            authortext = [self.reautclean.sub('', authortext[0])]
+        return authortext
+
+    def get_description(self, entry, verbose):
+        '''Return the book description prefixed by RESUME, or None.'''
+        try:
+            text = entry.getparent().xpath("//p[@id='book-description']")[0].text
+            # unicode(None) would give the bogus description u'RESUME:\nNone'
+            return u'RESUME:\n' + unicode(text) if text is not None else None
+        except Exception:
+            report(verbose)
+            return None
+
+    def get_book_info(self, entry, mi, verbose):
+        '''Fill isbn, publisher, language and pubdate of mi from the details.'''
+        entry = entry.find("dl[@title='Informations sur le livre']")
+        for x in entry.getiterator('dt'):
+            label = x.text or u''  # x.text is None without leading text
+            if label == 'ISBN':
+                isbntext = x.getnext().text_content().replace('-', '')
+                if check_isbn(isbntext):
+                    mi.isbn = unicode(isbntext)
+            elif self.repub.match(label):
+                mi.publisher = unicode(x.getnext().text_content())
+            elif label == 'Langue':
+                mi.language = unicode(x.getnext().text_content())
+            elif label == 'Date de parution':
+                d = x.getnext().text_content()
+                try:
+                    default = utcnow().replace(day=15)
+                    d = replace_monthsfr(d)
+                    d = parse_date(d, assume_utc=True, default=default)
+                    mi.pubdate = d
+                except Exception:
+                    report(verbose)
+        return mi
+
+    def fill_MI(self, entry, title, authors, verbose):
+        '''Build the MetaInformation of the book described by entry.'''
+        mi = MetaInformation(title, authors)
+        mi.author_sort = authors_to_sort_string(authors)
+        mi.comments = self.get_description(entry, verbose)
+        return self.get_book_info(entry, mi, verbose)
+
+    def get_individual_metadata(self, browser, linkdata, verbose):
+        '''Return the container div of the book page at linkdata, or None.'''
+        try:
+            raw = browser.open_novisit(self.BASE_URL + linkdata).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                return
+            raise
+        if '<title>404 - ' in raw:
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                resolve_entities=True)[0]
+        try:
+            feed = soupparser.fromstring(raw)
+        except Exception:
+            return
+        return feed.xpath("//div[@id='container']")[0]
+
+    def populate(self, entries, browser, verbose=False):
+        '''
+        Append one MetaInformation per usable entry. entries is either a
+        single parsed book page (direct hit) or a list of book page links.
+        '''
+        # a link may be a str or a unicode object, so test for basestring
+        direct = len(entries) == 1 and not isinstance(entries[0], basestring)
+        for x in entries:
+            try:
+                if direct:
+                    entry = x.xpath("//div[@id='container']")[0]
+                else:
+                    entry = self.get_individual_metadata(browser, x, verbose)
+                entry = entry.find("div[@id='book-info']")
+                title = self.get_title(entry)
+                authors = self.get_authors(entry)
+                # built inside the try so that one bad page cannot abort the rest
+                mi = self.fill_MI(entry, title, authors, verbose)
+            except Exception, e:
+                if verbose:
+                    print 'Failed to get all details for an entry'
+                    print e
+                continue
+            self.append(mi)
+
+
+class NiceBooksError(Exception):
+    '''Base class of the errors raised by the Nicebooks cover fetcher.'''
+
+class ISBNNotFound(NiceBooksError):
+    '''Raised when the page gave neither a cover link nor a valid ISBN.'''
+
+class Covers(object):
+    '''Finds the cover link on a Nicebooks book page and downloads it.'''
+    def __init__(self, isbn = None):
+        assert isbn is not None
+        self.urlimg = ''
+        self.isbn = isbn
+        self.isbnf = False
+
+    def __call__(self, entry = None):
+        try:
+            self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href')
+        except Exception:
+            return self  # no cover link: urlimg stays empty
+        isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']")
+        for x in isbno.getiterator('dt'):
+            if x.text == 'ISBN' and check_isbn(x.getnext().text_content()):
+                self.isbnf = True
+                break
+        return self
+
+    def check_cover(self):
+        return bool(self.urlimg)
+
+    def get_cover(self, browser, timeout = 5.):
+        '''Return (data, extension); raise NiceBooksError if unavailable.'''
+        try:
+            cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \
+                self.urlimg.rpartition('.')[-1]
+            return cover, ext if ext else 'jpg'
+        except Exception, err:
+            if isinstance((getattr(err, 'args', None) or [None])[0], socket.timeout):
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
+            if not self.urlimg and not self.isbnf:
+                raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
+            # always raise: falling through used to return None to the caller
+            raise NiceBooksError(_('An error occurred with Nicebooks cover fetcher'))
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+           max_results=5, verbose=False, keywords=None):
+    '''Return a ResultList with the metadata of the hits, None without hits.'''
+    br = browser()
+    query = Query(title=title, author=author, isbn=isbn, publisher=publisher,
+        keywords=keywords, max_results=max_results)
+    entries = query(br, verbose)
+    if not entries:
+        return
+
+    metadata = ResultList()
+    metadata.populate(entries, br, verbose)
+    return metadata
+
+def check_for_cover(isbn):
+    '''Return True when the first Nicebooks hit for isbn has a cover link.'''
+    hits = Query(isbn=isbn, max_results=1)(browser(), False)
+    return Covers(isbn)(hits[0]).check_cover()
+
+def cover_from_isbn(isbn, timeout = 5.):
+    br = browser()  # shared by the search and the cover download
+    hits = Query(isbn=isbn, max_results=1)(br, False, timeout)
+    return Covers(isbn)(hits[0]).get_cover(br, timeout)
+
+
+def option_parser():
+    '''Return the option parser of the command line interface.'''
+    parser = OptionParser(textwrap.dedent('''\
+        %prog [options]
+
+        Fetch book metadata from Nicebooks. You must specify one of title, author,
+        ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
+        so you should make your query as specific as possible.
+        It can also get covers if the option is activated.
+    '''))
+    parser.add_option('-t', '--title', help='Book title')
+    parser.add_option('-a', '--author', help='Book author(s)')
+    parser.add_option('-p', '--publisher', help='Book publisher')
+    parser.add_option('-i', '--isbn', help='Book ISBN')
+    parser.add_option('-k', '--keywords', help='Keywords')
+    parser.add_option('-c', '--covers', default=0, type='int',
+                      help='Covers: 1-Check/ 2-Download')
+    # -p already belongs to --publisher, so the covers path uses -d
+    parser.add_option('-d', '--coverspath', default='', help='Covers files path')
+    # type='int': the search needs a number, not the raw command line string
+    parser.add_option('-m', '--max-results', default=20, type='int',
+                      help='Maximum number of results to fetch')
+    parser.add_option('-v', '--verbose', default=0, action='count',
+                      help='Be more verbose about errors')
+    return parser
+
+def main(args=sys.argv):
+    '''Command line interface: print the results and return the exit code.'''
+    import os
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    try:
+        # int(): an option given on the command line can arrive as a str
+        results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
+            keywords=opts.keywords, verbose=opts.verbose, max_results=int(opts.max_results))
+    except (AssertionError, ValueError):
+        report(True)
+        parser.print_help()
+        return 1
+    if results is None or len(results) == 0:
+        print 'No result found for this search!'
+        return 0
+    covact = int(opts.covers)
+    for result in results:
+        print unicode(result).encode(preferred_encoding, 'replace')
+        # Covers are looked up by ISBN: a result without one has no cover
+        if covact == 1:
+            found = result.isbn and check_for_cover(result.isbn)
+            print 'A cover was found for this book' if found else 'No cover found!'
+        elif covact == 2 and result.isbn:
+            cover_data, ext = cover_from_isbn(result.isbn)
+            cpath = (opts.coverspath + '/' if opts.coverspath else '') + result.isbn
+            oname = os.path.abspath(cpath + '.' + ext)
+            with open(oname, 'wb') as f:
+                f.write(cover_data)
+            print 'Cover saved to file ', oname
+        print
+
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index e2f463b80b..4f418d34d5 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI):
         self.emit(SIGNAL('retranslate()'))
         self.init_languages()
         try:
-            if prefs['language'].lower().startswith('zh'):
-                from calibre.customize.ui import enable_plugin
-                for name in ('Douban Books', 'Douban.com covers'):
-                    enable_plugin(name)
+            lang = prefs['language'].lower()[:2]
+            metadata_plugins = {
+                    'zh' : ('Douban Books', 'Douban.com covers'),
+                    'fr' : ('Nicebooks', 'Nicebooks covers'),
+            }.get(lang, [])
+            from calibre.customize.ui import enable_plugin
+            for name in metadata_plugins:
+                enable_plugin(name)
         except:
             pass