From fc524ee7d4eccbe6f8a0ae63f56a06caaf248fa0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 21 Nov 2010 09:52:40 -0700
Subject: [PATCH] Metadata and cover download plugins from Nicebooks

---
Review notes (text in this position is ignored by git am):
- Leading whitespace was lost in the copy this patch was restored from. The
  indentation of context and removed lines is reconstructed; apply with
  --ignore-whitespace if a hunk is rejected.
- nicebooks.py was revised during review, so the commit id and the blob id
  quoted in this mail no longer describe its content.

 src/calibre/customize/builtins.py        |   6 +-
 src/calibre/customize/ui.py              |   2 +-
 src/calibre/ebooks/metadata/nicebooks.py | 514 +++++++++++++++++++++++
 src/calibre/gui2/wizard/__init__.py      |  12 +-
 4 files changed, 527 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/ebooks/metadata/nicebooks.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 681d953c9b..87946706cf 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -483,6 +483,7 @@ from calibre.devices.kobo.driver import KOBO
 from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
     LibraryThing
 from calibre.ebooks.metadata.douban import DoubanBooks
+from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
 from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
         LibraryThingCovers, DoubanCovers
 from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
@@ -490,8 +491,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
 from calibre.ebooks.epub.fix.epubcheck import Epubcheck
 
 plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
-        LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
-        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
+        LibraryThing, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
+        Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
+        NiceBooksCovers]
 plugins += [
     ComicInput,
     EPUBInput,
diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py
index 844269e453..c360122842 100644
--- a/src/calibre/customize/ui.py
+++ b/src/calibre/customize/ui.py
@@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
     config['enabled_plugins'] = ep
 
 default_disabled_plugins = set([
-        'Douban Books', 'Douban.com covers',
+        'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
 ])
 
 def is_disabled(plugin):
diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py
new file mode 100644
index 0000000000..4d19e9611b
--- /dev/null
+++ b/src/calibre/ebooks/metadata/nicebooks.py
@@ -0,0 +1,514 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian '
+__docformat__ = 'restructuredtext en'
+
+import sys, textwrap, re, traceback, socket
+from urllib import urlencode
+from math import ceil
+from copy import deepcopy
+
+from lxml.html import soupparser
+
+from calibre.utils.date import parse_date, utcnow
+from calibre import browser, preferred_encoding
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn, \
+    authors_to_sort_string
+from calibre.ebooks.metadata.fetch import MetadataSource
+from calibre.ebooks.metadata.covers import CoverDownload
+from calibre.utils.config import OptionParser
+
+class NiceBooks(MetadataSource):
+    '''Metadata source plugin that delegates to search() in this module.'''
+
+    name = 'Nicebooks'
+    description = _('Downloads metadata from french Nicebooks')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    version = (1, 0, 0)
+
+    def fetch(self):
+        '''Store up to 10 search results in self.results.
+
+        Failures are recorded in self.exception and self.tb, not raised.
+        '''
+        try:
+            self.results = search(self.title, self.book_author, self.publisher,
+                self.isbn, max_results=10, verbose=self.verbose)
+        except Exception, e:
+            self.exception = e
+            self.tb = traceback.format_exc()
+
+class NiceBooksCovers(CoverDownload):
+    '''Cover download plugin that looks books up on Nicebooks by ISBN.'''
+
+    name = 'Nicebooks covers'
+    description = _('Downloads covers from french Nicebooks')
+    supported_platforms = ['windows', 'osx', 'linux']
+    author = 'Sengian'
+    type = _('Cover download')
+    version = (1, 0, 0)
+
+    def has_cover(self, mi, ans, timeout=5.):
+        '''Call ans.set() if the page found for mi.isbn links to a cover.
+
+        Errors are only reported through self.debug().
+        '''
+        if not mi.isbn:
+            return False
+        br = browser()
+        try:
+            entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0]
+            if Covers(mi.isbn)(entry).check_cover():
+                self.debug('cover for', mi.isbn, 'found')
+                ans.set()
+        except Exception, e:
+            self.debug(e)
+
+    def get_covers(self, mi, result_queue, abort, timeout=5.):
+        '''Put the outcome of the cover download for mi.isbn on result_queue.
+
+        Success is (True, cover_data, ext, self.name); failure is
+        (False, error string, traceback, self.name). abort is not used.
+        '''
+        if not mi.isbn:
+            return
+        br = browser()
+        try:
+            entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0]
+            cover_data, ext = Covers(mi.isbn)(entry).get_cover(br, timeout)
+            if not ext:
+                ext = 'jpg'
+            result_queue.put((True, cover_data, ext, self.name))
+        except Exception, e:
+            result_queue.put((False, self.exception_to_string(e),
+                traceback.format_exc(), self.name))
+
+
+def report(verbose):
+    '''Print the traceback of the exception being handled if verbose is set.'''
+    if verbose:
+        traceback.print_exc()
+
+def replace_monthsfr(datefr):
+    '''Return datefr with a french month name replaced by its english
+    abbreviation, so that parse_date can read it.
+
+    Only the first month pattern that changes the string is applied. The
+    "." in some patterns presumably stands for the accented letter.
+    '''
+    frtoen = {
+        u'[jJ]anvier': u'jan',
+        u'[fF].vrier': u'feb',
+        u'[mM]ars': u'mar',
+        u'[aA]vril': u'apr',
+        u'[mM]ai': u'may',
+        u'[jJ]uin': u'jun',
+        u'[jJ]uillet': u'jul',
+        u'[aA]o.t': u'aug',
+        u'[sS]eptembre': u'sep',
+        u'[Oo]ctobre': u'oct',
+        u'[nN]ovembre': u'nov',
+        u'[dD].cembre': u'dec' }
+    for pattern, month in frtoen.iteritems():
+        tmp = re.sub(pattern, month, datefr)
+        if tmp != datefr:
+            return tmp
+    return datefr
+
+class Query(object):
+    '''Search query against fr.nicebooks.com.
+
+    Build it from search terms, then call it with a browser to run it.
+    '''
+
+    BASE_URL = 'http://fr.nicebooks.com/'
+
+    def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20):
+        '''Prepare the search URL.
+
+        At least one search term is required and max_results may not exceed
+        20 (AssertionError otherwise). When isbn is given it is the only
+        term used.
+        '''
+        assert not(title is None and author is None and publisher is None \
+            and isbn is None and keywords is None)
+        # Convert before comparing: in Python 2 a str such as '5' coming from
+        # the command line always compares greater than an int.
+        self.max_results = int(max_results)
+        assert (self.max_results < 21)
+
+        if isbn is not None:
+            q = isbn
+        else:
+            q = ' '.join([i for i in (title, author, publisher, keywords) \
+                if i is not None])
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'})
+
+    def __call__(self, browser, verbose, timeout = 5.):
+        '''Run the query.
+
+        Returns None when the site answers 404 or the page cannot be parsed,
+        a one element list holding the parsed page when no result count is
+        found on it (direct hit on a book page), and otherwise a list of at
+        most max_results links to book pages.
+        '''
+        if verbose:
+            print 'Query:', self.BASE_URL+self.urldata
+
+        try:
+            raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                e.getcode() == 404:
+                return
+            raise
+        # Same marker as the other 404 checks in this module
+        if '<title>404 - ' in raw:
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            resolve_entities=True)[0]
+        try:
+            feed = soupparser.fromstring(raw)
+        except Exception:
+            return
+
+        #nb of page to call
+        try:
+            nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
+        except Exception:
+            #direct hit
+            return [feed]
+
+        nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
+        pages =[feed]
+        if nbpagetoquery > 1:
+            for i in xrange(2, nbpagetoquery + 1):
+                try:
+                    urldata = self.urldata + '&p=' + str(i)
+                    raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read()
+                except Exception, e:
+                    continue
+                if '<title>404 - ' in raw:
+                    continue
+                raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                    resolve_entities=True)[0]
+                try:
+                    feed = soupparser.fromstring(raw)
+                except Exception:
+                    continue
+                pages.append(feed)
+
+        results = []
+        for x in pages:
+            results.extend([i.find_class('title')[0].get('href') \
+                for i in x.xpath("//ul[@id='results']/li")])
+        return results[:self.max_results]
+
+class ResultList(list):
+    '''List of MetaInformation objects built from Nicebooks pages.'''
+
+    BASE_URL = 'http://fr.nicebooks.com'
+
+    def __init__(self):
+        self.repub = re.compile(u'\s*.diteur\s*', re.I)
+        self.reauteur = re.compile(u'\s*auteur.*', re.I)
+        self.reautclean = re.compile(u'\s*\(.*\)\s*')
+
+    def get_title(self, entry):
+        '''Return the text of entry without its book information list.'''
+        # Work on a copy as the information list is removed from the tree
+        title = deepcopy(entry)
+        title.remove(title.find("dl[@title='Informations sur le livre']"))
+        title = ' '.join([i.text_content() for i in title.iterchildren()])
+        return unicode(title.replace('\n', ''))
+
+    def get_authors(self, entry):
+        '''Return the list of author names found in entry.'''
+        author = entry.find("dl[@title='Informations sur le livre']")
+        authortext = []
+        for x in author.getiterator('dt'):
+            # x.text is None for a dt without leading text
+            if x.text and self.reauteur.match(x.text):
+                elt = x.getnext()
+                # getnext() returns None after the last sibling
+                while elt is not None and elt.tag == 'dd':
+                    authortext.append(unicode(elt.text_content()))
+                    elt = elt.getnext()
+                break
+        if len(authortext) == 1:
+            authortext = [self.reautclean.sub('', authortext[0])]
+        return authortext
+
+    def get_description(self, entry, verbose):
+        '''Return the book description prefixed by "RESUME:", or None.'''
+        try:
+            return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text)
+        except Exception:
+            report(verbose)
+            return None
+
+    def get_book_info(self, entry, mi, verbose):
+        '''Fill isbn, publisher, language and pubdate of mi from the book
+        information list of entry and return mi.'''
+        entry = entry.find("dl[@title='Informations sur le livre']")
+        for x in entry.getiterator('dt'):
+            value = x.getnext()
+            if not x.text or value is None:
+                # Nothing to read from an unlabelled or valueless item
+                continue
+            if x.text == 'ISBN':
+                isbntext = value.text_content().replace('-', '')
+                if check_isbn(isbntext):
+                    mi.isbn = unicode(isbntext)
+            elif self.repub.match(x.text):
+                mi.publisher = unicode(value.text_content())
+            elif x.text == 'Langue':
+                mi.language = unicode(value.text_content())
+            elif x.text == 'Date de parution':
+                d = value.text_content()
+                try:
+                    default = utcnow().replace(day=15)
+                    d = replace_monthsfr(d)
+                    d = parse_date(d, assume_utc=True, default=default)
+                    mi.pubdate = d
+                except Exception:
+                    report(verbose)
+        return mi
+
+    def fill_MI(self, entry, title, authors, verbose):
+        '''Return a MetaInformation object filled from entry.'''
+        mi = MetaInformation(title, authors)
+        mi.author_sort = authors_to_sort_string(authors)
+        mi.comments = self.get_description(entry, verbose)
+        return self.get_book_info(entry, mi, verbose)
+
+    def get_individual_metadata(self, browser, linkdata, verbose):
+        '''Fetch the book page at linkdata (relative to BASE_URL).
+
+        Returns its "container" div, or None on a 404 or unparsable page.
+        '''
+        try:
+            raw = browser.open_novisit(self.BASE_URL + linkdata).read()
+        except Exception, e:
+            report(verbose)
+            if callable(getattr(e, 'getcode', None)) and \
+                e.getcode() == 404:
+                return
+            raise
+        if '<title>404 - ' in raw:
+            report(verbose)
+            return
+        raw = xml_to_unicode(raw, strip_encoding_pats=True,
+            resolve_entities=True)[0]
+        try:
+            feed = soupparser.fromstring(raw)
+        except Exception:
+            return
+
+        # get results
+        return feed.xpath("//div[@id='container']")[0]
+
+    def populate(self, entries, browser, verbose=False):
+        '''Append one MetaInformation object per usable entry.
+
+        entries is the value returned by calling a Query: either one parsed
+        page or a list of links. Entries that cannot be fetched or
+        parsed are skipped.
+        '''
+        #single entry
+        # basestring: a link may be a str or a unicode object
+        if len(entries) == 1 and not isinstance(entries[0], basestring):
+            try:
+                entry = entries[0].xpath("//div[@id='container']")[0]
+                entry = entry.find("div[@id='book-info']")
+                title = self.get_title(entry)
+                authors = self.get_authors(entry)
+            except Exception, e:
+                if verbose:
+                    print 'Failed to get all details for an entry'
+                    print e
+                return
+            self.append(self.fill_MI(entry, title, authors, verbose))
+        else:
+            #multiple entries
+            for x in entries:
+                try:
+                    entry = self.get_individual_metadata(browser, x, verbose)
+                    entry = entry.find("div[@id='book-info']")
+                    title = self.get_title(entry)
+                    authors = self.get_authors(entry)
+                except Exception, e:
+                    if verbose:
+                        print 'Failed to get all details for an entry'
+                        print e
+                    continue
+                self.append(self.fill_MI(entry, title, authors, verbose))
+
+
+class NiceBooksError(Exception):
+    '''Error raised by the Nicebooks cover fetcher.'''
+
+class ISBNNotFound(NiceBooksError):
+    '''No ISBN was found on the page searched for a cover.'''
+
+class Covers(object):
+    '''Locate and download the cover shown on a Nicebooks book page.'''
+
+    def __init__(self, isbn = None):
+        assert isbn is not None
+        self.urlimg = ''
+        self.isbn = isbn
+        self.isbnf = False
+
+    def __call__(self, entry = None):
+        '''Read the cover URL and the ISBN presence from entry.
+
+        Returns self in all cases; urlimg stays empty when entry has no
+        cover link.
+        '''
+        try:
+            self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href')
+        except Exception:
+            return self
+        isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']")
+        for x in isbno.getiterator('dt'):
+            if x.text == 'ISBN' and check_isbn(x.getnext().text_content()):
+                self.isbnf = True
+                break
+        return self
+
+    def check_cover(self):
+        '''Return True if a cover URL was found.'''
+        return True if self.urlimg else False
+
+    def get_cover(self, browser, timeout = 5.):
+        '''Download the cover and return (data, extension).
+
+        Raises ISBNNotFound or NiceBooksError when no cover URL is known,
+        NiceBooksError on a timeout, and re-raises any other download error
+        instead of silently returning None.
+        '''
+        if not self.urlimg:
+            if not self.isbnf:
+                raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
+            raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher'))
+        try:
+            cover = browser.open_novisit(self.urlimg, timeout=timeout).read()
+        except Exception, err:
+            args = getattr(err, 'args', None) or [None]
+            if isinstance(args[0], socket.timeout):
+                raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
+            raise
+        ext = self.urlimg.rpartition('.')[-1]
+        return cover, ext if ext else 'jpg'
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+        max_results=5, verbose=False, keywords=None):
+    '''Search Nicebooks and return a ResultList, or None if nothing is found.'''
+    br = browser()
+    entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
+        keywords=keywords, max_results=max_results)(br, verbose)
+
+    if entries is None or len(entries) == 0:
+        return
+
+    #List of entry
+    ans = ResultList()
+    ans.populate(entries, br, verbose)
+    return ans
+
+def check_for_cover(isbn):
+    '''Return True if Nicebooks has a cover for isbn.'''
+    br = browser()
+    entries = Query(isbn=isbn, max_results=1)(br, False)
+    if not entries:
+        # The query returns None or an empty list when nothing is found
+        return False
+    return Covers(isbn)(entries[0]).check_cover()
+
+def cover_from_isbn(isbn, timeout = 5.):
+    '''Return (cover data, extension) for isbn.
+
+    Raises ISBNNotFound when the search returns nothing.
+    '''
+    br = browser()
+    entries = Query(isbn=isbn, max_results=1)(br, False, timeout)
+    if not entries:
+        raise ISBNNotFound('ISBN: '+isbn+_(' not found.'))
+    return Covers(isbn)(entries[0]).get_cover(br, timeout)
+
+
+def option_parser():
+    '''Return the option parser of the command line interface.'''
+    parser = OptionParser(textwrap.dedent(\
+    '''\
+    %prog [options]
+
+    Fetch book metadata from Nicebooks. You must specify one of title, author,
+    ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
+    so you should make your query as specific as possible.
+    It can also get covers if the option is activated.
+    '''
+    ))
+    parser.add_option('-t', '--title', help='Book title')
+    parser.add_option('-a', '--author', help='Book author(s)')
+    parser.add_option('-p', '--publisher', help='Book publisher')
+    parser.add_option('-i', '--isbn', help='Book ISBN')
+    parser.add_option('-k', '--keywords', help='Keywords')
+    parser.add_option('-c', '--covers', default=0, type='int',
+        help='Covers: 1-Check/ 2-Download')
+    # No short form: -p already belongs to --publisher
+    parser.add_option('--coverspath', default='',
+        help='Covers files path')
+    parser.add_option('-m', '--max-results', default=20, type='int',
+        help='Maximum number of results to fetch')
+    parser.add_option('-v', '--verbose', default=0, action='count',
+        help='Be more verbose about errors')
+    return parser
+
+def main(args=sys.argv):
+    '''Command line entry point. Returns the process exit code.'''
+    import os
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    try:
+        results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
+            keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results)
+    except AssertionError:
+        report(True)
+        parser.print_help()
+        return 1
+    if results is None or len(results) == 0:
+        print 'No result found for this search!'
+        return 0
+    for result in results:
+        print unicode(result).encode(preferred_encoding, 'replace')
+        covact = int(opts.covers)
+        if covact in (1, 2) and not result.isbn:
+            # Covers are looked up by ISBN only
+            print 'No cover found!'
+        elif covact == 1:
+            textcover = 'No cover found!'
+            if check_for_cover(result.isbn):
+                textcover = 'A cover was found for this book'
+            print textcover
+        elif covact == 2:
+            cover_data, ext = cover_from_isbn(result.isbn)
+            cpath = result.isbn
+            if len(opts.coverspath):
+                cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)
+            oname = os.path.abspath(cpath+'.'+ext)
+            with open(oname, 'wb') as f:
+                f.write(cover_data)
+            print 'Cover saved to file ', oname
+        print
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index e2f463b80b..4f418d34d5 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI):
         self.emit(SIGNAL('retranslate()'))
         self.init_languages()
         try:
-            if prefs['language'].lower().startswith('zh'):
-                from calibre.customize.ui import enable_plugin
-                for name in ('Douban Books', 'Douban.com covers'):
-                    enable_plugin(name)
+            lang = prefs['language'].lower()[:2]
+            metadata_plugins = {
+                    'zh' : ('Douban Books', 'Douban.com covers'),
+                    'fr' : ('Nicebooks', 'Nicebooks covers'),
+            }.get(lang, [])
+            from calibre.customize.ui import enable_plugin
+            for name in metadata_plugins:
+                enable_plugin(name)
         except:
             pass
 