From eb4e7154dbcb63863ee70bb8dcc14c508631272f Mon Sep 17 00:00:00 2001 From: Sengian Date: Sun, 21 Nov 2010 00:16:24 +0100 Subject: [PATCH] Plugin nicebooks for metadatas and cover. Should be disable by default. --- src/calibre/customize/builtins.py | 6 +- src/calibre/ebooks/metadata/nicebooks.py | 458 +++++++++++++++++++++++ 2 files changed, 462 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/nicebooks.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 5723da34a8..ce5275d35e 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -483,15 +483,17 @@ from calibre.devices.kobo.driver import KOBO from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ LibraryThing, Fictionwise from calibre.ebooks.metadata.douban import DoubanBooks +from calibre.ebooks.metadata.nicebooks import NiceBooks from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ LibraryThingCovers, DoubanCovers +from calibre.ebooks.metadata.nicebooks import NiceBooksCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, - LibraryThing, Fictionwise, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, - Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] + LibraryThing, Fictionwise, DoubanBooks, NiceBooks,CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, + Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers, NiceBooksCovers] plugins += [ ComicInput, EPUBInput, diff --git a/src/calibre/ebooks/metadata/nicebooks.py b/src/calibre/ebooks/metadata/nicebooks.py new file mode 100644 index 0000000000..28fb2de562 --- /dev/null +++ b/src/calibre/ebooks/metadata/nicebooks.py @@ -0,0 +1,458 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2010, sengian ' +__docformat__ = 'restructuredtext en' + +import sys, textwrap, re, traceback, socket +from urllib import urlencode +from functools import partial +from math import ceil +from copy import deepcopy + +from lxml import html +from lxml.html import soupparser + +from calibre.utils.date import parse_date, utcnow +from calibre import browser, preferred_encoding +from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata import MetaInformation, check_isbn, \ + authors_to_sort_string +from calibre.ebooks.metadata.fetch import MetadataSource +from calibre.ebooks.metadata.covers import CoverDownload +from calibre.utils.config import OptionParser + +class NiceBooks(MetadataSource): + + name = 'Nicebooks' + description = _('Downloads metadata from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + version = (1, 0, 0) + + def fetch(self): + try: + self.results = search(self.title, self.book_author, self.publisher, + self.isbn, max_results=10, verbose=self.verbose) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + +class NiceBooksCovers(CoverDownload): + + name = 'Nicebooks covers' + description = _('Downloads covers from french Nicebooks') + supported_platforms = ['windows', 'osx', 'linux'] + author = 'Sengian' + type = _('Cover download') + version = (1, 0, 0) + + def has_cover(self, mi, ans, timeout=5.): + if not mi.isbn: + return False + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + if Covers(isbn)(entry).check_cover(): + self.debug('cover for', mi.isbn, 'found') + ans.set() + except Exception, e: + self.debug(e) + + def get_covers(self, mi, result_queue, abort, timeout=5.): + if not mi.isbn: + return + br = browser() + try: + entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0] + cover_data, ext = Covers(isbn)(entry).get_cover(br, timeout) + if not ext: + ext = 'jpg' + result_queue.put((True, cover_data, ext, self.name)) + except Exception, e: + result_queue.put((False, self.exception_to_string(e), + traceback.format_exc(), self.name)) + + +def report(verbose): + if verbose: + import traceback + traceback.print_exc() + +def replace_monthsfr(datefr): + # Replace french months by english equivalent for parse_date + frtoen = { + u'[jJ]anvier': u'jan', + u'[fF].vrier': u'feb', + u'[mM]ars': u'mar', + u'[aA]vril': u'apr', + u'[mM]ai': u'may', + u'[jJ]uin': u'jun', + u'[jJ]uillet': u'jul', + u'[aA]o.t': u'aug', + u'[sS]eptembre': u'sep', + u'[Oo]ctobre': u'oct', + u'[nN]ovembre': u'nov', + u'[dD].cembre': u'dec' } + for k in frtoen.iterkeys(): + tmp = re.sub(k, frtoen[k], datefr) + if tmp <> datefr: break + return tmp + +class Query(object): + + BASE_URL = 'http://fr.nicebooks.com/' + + def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20): + assert not(title is None and author is None and publisher is None \ + and isbn is None and keywords is None) + assert (max_results < 21) + + self.max_results = int(max_results) + + q = '' + if isbn is not None: + q += isbn + else: + + if title is not None: + q += title + if author is not None: + q += author + if publisher is not None: + q += publisher + if keywords is not None: + q += keywords + + if isinstance(q, unicode): + q = q.encode('utf-8') + self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'}) + + def __call__(self, browser, verbose, timeout = 5.): + if verbose: + print 'Query:', self.BASE_URL+self.urldata + + try: + raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '404 - ' in raw: + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + #nb of page to call + try: + nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text) + except: + #direct hit + return [feed] + + nbpagetoquery = ceil(min(nbresults, self.max_results)/10) + pages =[feed] + if nbpagetoquery > 1: + for i in xrange(2, nbpagetoquery + 1): + try: + urldata = self.urldata + '&p=' + str(i) + raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read() + except Exception, e: + continue + if '<title>404 - ' in raw: + continue + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + continue + pages.append(feed) + + results = [] + for x in pages: + results.extend([i.find_class('title')[0].get('href') \ + for i in x.xpath("//ul[@id='results']/li")]) + return results[:self.max_results] + +class ResultList(list): + + BASE_URL = 'http://fr.nicebooks.com' + + def __init__(self): + self.repub = re.compile(r'\s*.diteur\s*', re.I) + self.reauteur = re.compile(r'\s*auteur.*', re.I) + self.reautclean = re.compile(r'\s*\(.*\)\s*') + + def get_title(self, entry): + title = deepcopy(entry.find("div[@id='book-info']")) + title.remove(title.find("dl[@title='Informations sur le livre']")) + title = ' '.join([i.text_content() for i in title.iterchildren()]) + return title.replace('\n', '') + + def get_authors(self, entry): + author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + authortext = [] + for x in author.getiterator('dt'): + if self.reauteur.match(x.text): + elt = x.getnext() + i = 0 + while elt.tag <> 'dt' and i < 20: + authortext.append(elt.text_content()) + elt = elt.getnext() + i += 1 + break + if len(authortext) == 1: + authortext = [self.reautclean.sub('', authortext[0])] + return authortext + + def get_description(self, entry, verbose): + try: + return 'RESUME:\n' + entry.xpath("//p[@id='book-description']")[0].text + except: + report(verbose) + return None + + def get_publisher(self, entry): + publisher = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + publitext = None + for x in publisher.getiterator('dt'): + if self.repub.match(x.text): + publitext = x.getnext().text_content() + break + return publitext + + def get_date(self, entry, verbose): + date = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + for x in date.getiterator('dt'): + if x.text == 'Date de parution': + d = x.getnext().text_content() + break + if not len(d): + return None + try: + default = utcnow().replace(day=15) + d = replace_monthsfr(d) + d = parse_date(d, assume_utc=True, default=default) + except: + report(verbose) + d = None + return d + + def get_ISBN(self, entry): + isbn = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + isbntext = None + for x in isbn.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content() + if not check_isbn(isbntext): + return None + break + return isbntext + + def get_language(self, entry): + language = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']") + langtext = None + for x in language.getiterator('dt'): + if x.text == 'Langue': + langtext = x.getnext().text_content() + break + return langtext + + def fill_MI(self, entry, title, authors, verbose): + mi = MetaInformation(title, authors) + mi.comments = self.get_description(entry, verbose) + mi.publisher = self.get_publisher(entry) + mi.pubdate = self.get_date(entry, verbose) + mi.isbn = self.get_ISBN(entry) + mi.author_sort = authors_to_sort_string(authors) + mi.language = self.get_language(entry) + return mi + + def get_individual_metadata(self, browser, linkdata, verbose): + try: + raw = browser.open_novisit(self.BASE_URL + linkdata).read() + except Exception, e: + report(verbose) + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return + raise + if '<title>404 - ' in raw: + report(verbose) + return + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + feed = soupparser.fromstring(raw) + except: + return + + # get results + return feed.xpath("//div[@id='container']")[0] + + def populate(self, entries, browser, verbose=False): + for x in entries: + try: + entry = self.get_individual_metadata(browser, x, verbose) + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + continue + self.append(self.fill_MI(entry, title, authors, verbose)) + + def populate_single(self, feed, verbose=False): + try: + entry = feed.xpath("//div[@id='container']")[0] + title = self.get_title(entry) + authors = self.get_authors(entry) + except Exception, e: + if verbose: + print 'Failed to get all details for an entry' + print e + return + self.append(self.fill_MI(entry, title, authors, verbose)) + +class NiceBooksError(Exception): + pass + +class ISBNNotFound(NiceBooksError): + pass + +class Covers(object): + + def __init__(self, isbn = None): + assert isbn is not None + self.urlimg = '' + self.isbn = isbn + self.isbnf = False + + def __call__(self, entry = None): + try: + self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href') + except: + return self + isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']") + isbntext = None + for x in isbno.getiterator('dt'): + if x.text == 'ISBN': + isbntext = x.getnext().text_content() + break + if isbntext is not None: + self.isbnf = True + return self + + def check_cover(self): + if self.urlimg: + return True + else: + return False + + def get_cover(self, browser, timeout = 5.): + try: + return browser.open_novisit(self.urlimg, timeout=timeout).read(), \ + self.urlimg.rpartition('.')[-1] + except Exception, err: + if isinstance(getattr(err, 'args', [None])[0], socket.timeout): + err = NiceBooksError(_('Nicebooks timed out. Try again later.')) + raise err + if not len(self.urlimg): + if not self.isbnf: + raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.')) + raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher')) + + +def search(title=None, author=None, publisher=None, isbn=None, + verbose=False, max_results=5, keywords=None): + br = browser() + entries = Query(title=title, author=author, isbn=isbn, publisher=publisher, + keywords=keywords, max_results=max_results)(br, verbose) + + if entries is None: + return + + #List of entry + ans = ResultList() + if len(entries) > 1: + ans.populate(entries, br, verbose) + else: + ans.populate_single(entries[0], verbose) + return ans + +def check_for_cover(isbn): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False)[0] + return Covers(isbn)(entry).check_cover() + +def cover_from_isbn(isbn, timeout = 5.): + br = browser() + entry = Query(isbn=isbn, max_results=1)(br, False, timeout)[0] + return Covers(isbn)(entry).get_cover(br, timeout) + + +def option_parser(): + parser = OptionParser(textwrap.dedent(\ + '''\ + %prog [options] + + Fetch book metadata from Nicebooks. You must specify one of title, author, + ISBN, publisher or keywords. Will fetch a maximum of 20 matches, + so you should make your query as specific as possible. + It can also get covers if the option is activated. + ''' + )) + parser.add_option('-t', '--title', help='Book title') + parser.add_option('-a', '--author', help='Book author(s)') + parser.add_option('-p', '--publisher', help='Book publisher') + parser.add_option('-i', '--isbn', help='Book ISBN') + parser.add_option('-k', '--keywords', help='Keywords') + parser.add_option('-c', '--covers', default=0, + help='Covers: 1-Check/ 2-Download') + parser.add_option('-p', '--coverspath', default='', + help='Covers files path') + parser.add_option('-m', '--max-results', default=20, + help='Maximum number of results to fetch') + parser.add_option('-v', '--verbose', default=0, action='count', + help='Be more verbose about errors') + return parser + +def main(args=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(args) + try: + results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher, + keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results) + except AssertionError: + report(True) + parser.print_help() + return 1 + for result in results: + print unicode(result).encode(preferred_encoding, 'replace') + covact = int(opts.covers) + if covact == 1: + textcover = 'No cover found!' + if check_for_cover(result.isbn): + textcover = 'A cover was found for this book' + print textcover + elif covact == 2: + cover_data, ext = cover_from_isbn(result.isbn) + if not ext: + ext = 'jpg' + cpath = result.isbn + if len(opts.coverspath): + cpath = os.path.normpath(opts.coverspath + '/' + result.isbn) + oname = os.path.abspath(cpath+'.'+ext) + open(oname, 'wb').write(cover_data) + print 'Cover saved to file ', oname + print + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file