Metadata and cover download plugins from Nicebooks

This commit is contained in:
Kovid Goyal 2010-11-21 09:52:40 -07:00
parent 2dccf2701d
commit fc524ee7d4
4 changed files with 437 additions and 7 deletions

View File

@ -483,6 +483,7 @@ from calibre.devices.kobo.driver import KOBO
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
LibraryThing LibraryThing
from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.douban import DoubanBooks
from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
LibraryThingCovers, DoubanCovers LibraryThingCovers, DoubanCovers
from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
@ -490,8 +491,9 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, LibraryThing, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers] Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers,
NiceBooksCovers]
plugins += [ plugins += [
ComicInput, ComicInput,
EPUBInput, EPUBInput,

View File

@ -120,7 +120,7 @@ def enable_plugin(plugin_or_name):
config['enabled_plugins'] = ep config['enabled_plugins'] = ep
default_disabled_plugins = set([ default_disabled_plugins = set([
'Douban Books', 'Douban.com covers', 'Douban Books', 'Douban.com covers', 'Nicebooks', 'Nicebooks covers'
]) ])
def is_disabled(plugin): def is_disabled(plugin):

View File

@ -0,0 +1,424 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>'
__docformat__ = 'restructuredtext en'
import sys, textwrap, re, traceback, socket
from urllib import urlencode
from math import ceil
from copy import deepcopy
from lxml.html import soupparser
from calibre.utils.date import parse_date, utcnow
from calibre import browser, preferred_encoding
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
authors_to_sort_string
from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.ebooks.metadata.covers import CoverDownload
from calibre.utils.config import OptionParser
class NiceBooks(MetadataSource):
name = 'Nicebooks'
description = _('Downloads metadata from french Nicebooks')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Sengian'
version = (1, 0, 0)
def fetch(self):
try:
self.results = search(self.title, self.book_author, self.publisher,
self.isbn, max_results=10, verbose=self.verbose)
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
class NiceBooksCovers(CoverDownload):
name = 'Nicebooks covers'
description = _('Downloads covers from french Nicebooks')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Sengian'
type = _('Cover download')
version = (1, 0, 0)
def has_cover(self, mi, ans, timeout=5.):
if not mi.isbn:
return False
br = browser()
try:
entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0]
if Covers(mi.isbn)(entry).check_cover():
self.debug('cover for', mi.isbn, 'found')
ans.set()
except Exception, e:
self.debug(e)
def get_covers(self, mi, result_queue, abort, timeout=5.):
if not mi.isbn:
return
br = browser()
try:
entry = Query(isbn=mi.isbn, max_results=1)(br, False, timeout)[0]
cover_data, ext = Covers(mi.isbn)(entry).get_cover(br, timeout)
if not ext:
ext = 'jpg'
result_queue.put((True, cover_data, ext, self.name))
except Exception, e:
result_queue.put((False, self.exception_to_string(e),
traceback.format_exc(), self.name))
def report(verbose):
if verbose:
import traceback
traceback.print_exc()
def replace_monthsfr(datefr):
# Replace french months by english equivalent for parse_date
frtoen = {
u'[jJ]anvier': u'jan',
u'[fF].vrier': u'feb',
u'[mM]ars': u'mar',
u'[aA]vril': u'apr',
u'[mM]ai': u'may',
u'[jJ]uin': u'jun',
u'[jJ]uillet': u'jul',
u'[aA]o.t': u'aug',
u'[sS]eptembre': u'sep',
u'[Oo]ctobre': u'oct',
u'[nN]ovembre': u'nov',
u'[dD].cembre': u'dec' }
for k in frtoen.iterkeys():
tmp = re.sub(k, frtoen[k], datefr)
if tmp <> datefr: break
return tmp
class Query(object):
BASE_URL = 'http://fr.nicebooks.com/'
def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None, max_results=20):
assert not(title is None and author is None and publisher is None \
and isbn is None and keywords is None)
assert (max_results < 21)
self.max_results = int(max_results)
if isbn is not None:
q = isbn
else:
q = ' '.join([i for i in (title, author, publisher, keywords) \
if i is not None])
if isinstance(q, unicode):
q = q.encode('utf-8')
self.urldata = 'search?' + urlencode({'q':q,'s':'Rechercher'})
def __call__(self, browser, verbose, timeout = 5.):
if verbose:
print 'Query:', self.BASE_URL+self.urldata
try:
raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
except Exception, e:
report(verbose)
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
raise
if '<title>404 - ' in raw:
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
return
#nb of page to call
try:
nbresults = int(feed.xpath("//div[@id='topbar']/b")[0].text)
except:
#direct hit
return [feed]
nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/10))
pages =[feed]
if nbpagetoquery > 1:
for i in xrange(2, nbpagetoquery + 1):
try:
urldata = self.urldata + '&p=' + str(i)
raw = browser.open_novisit(self.BASE_URL+urldata, timeout=timeout).read()
except Exception, e:
continue
if '<title>404 - ' in raw:
continue
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
continue
pages.append(feed)
results = []
for x in pages:
results.extend([i.find_class('title')[0].get('href') \
for i in x.xpath("//ul[@id='results']/li")])
return results[:self.max_results]
class ResultList(list):
BASE_URL = 'http://fr.nicebooks.com'
def __init__(self):
self.repub = re.compile(u'\s*.diteur\s*', re.I)
self.reauteur = re.compile(u'\s*auteur.*', re.I)
self.reautclean = re.compile(u'\s*\(.*\)\s*')
def get_title(self, entry):
# title = deepcopy(entry.find("div[@id='book-info']"))
title = deepcopy(entry)
title.remove(title.find("dl[@title='Informations sur le livre']"))
title = ' '.join([i.text_content() for i in title.iterchildren()])
return unicode(title.replace('\n', ''))
def get_authors(self, entry):
# author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
author = entry.find("dl[@title='Informations sur le livre']")
authortext = []
for x in author.getiterator('dt'):
if self.reauteur.match(x.text):
elt = x.getnext()
while elt.tag == 'dd':
authortext.append(unicode(elt.text_content()))
elt = elt.getnext()
break
if len(authortext) == 1:
authortext = [self.reautclean.sub('', authortext[0])]
return authortext
def get_description(self, entry, verbose):
try:
return u'RESUME:\n' + unicode(entry.getparent().xpath("//p[@id='book-description']")[0].text)
except:
report(verbose)
return None
def get_book_info(self, entry, mi, verbose):
entry = entry.find("dl[@title='Informations sur le livre']")
for x in entry.getiterator('dt'):
if x.text == 'ISBN':
isbntext = x.getnext().text_content().replace('-', '')
if check_isbn(isbntext):
mi.isbn = unicode(isbntext)
elif self.repub.match(x.text):
mi.publisher = unicode(x.getnext().text_content())
elif x.text == 'Langue':
mi.language = unicode(x.getnext().text_content())
elif x.text == 'Date de parution':
d = x.getnext().text_content()
try:
default = utcnow().replace(day=15)
d = replace_monthsfr(d)
d = parse_date(d, assume_utc=True, default=default)
mi.pubdate = d
except:
report(verbose)
return mi
def fill_MI(self, entry, title, authors, verbose):
mi = MetaInformation(title, authors)
mi.author_sort = authors_to_sort_string(authors)
mi.comments = self.get_description(entry, verbose)
# entry = entry.find("dl[@title='Informations sur le livre']")
# mi.publisher = self.get_publisher(entry)
# mi.pubdate = self.get_date(entry, verbose)
# mi.isbn = self.get_ISBN(entry)
# mi.language = self.get_language(entry)
return self.get_book_info(entry, mi, verbose)
def get_individual_metadata(self, browser, linkdata, verbose):
try:
raw = browser.open_novisit(self.BASE_URL + linkdata).read()
except Exception, e:
report(verbose)
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return
raise
if '<title>404 - ' in raw:
report(verbose)
return
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
return
# get results
return feed.xpath("//div[@id='container']")[0]
def populate(self, entries, browser, verbose=False):
#single entry
if len(entries) == 1 and not isinstance(entries[0], str):
try:
entry = entries[0].xpath("//div[@id='container']")[0]
entry = entry.find("div[@id='book-info']")
title = self.get_title(entry)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print 'Failed to get all details for an entry'
print e
return
self.append(self.fill_MI(entry, title, authors, verbose))
else:
#multiple entries
for x in entries:
try:
entry = self.get_individual_metadata(browser, x, verbose)
entry = entry.find("div[@id='book-info']")
title = self.get_title(entry)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print 'Failed to get all details for an entry'
print e
continue
self.append(self.fill_MI(entry, title, authors, verbose))
class NiceBooksError(Exception):
pass
class ISBNNotFound(NiceBooksError):
pass
class Covers(object):
def __init__(self, isbn = None):
assert isbn is not None
self.urlimg = ''
self.isbn = isbn
self.isbnf = False
def __call__(self, entry = None):
try:
self.urlimg = entry.xpath("//div[@id='book-picture']/a")[0].get('href')
except:
return self
isbno = entry.get_element_by_id('book-info').find("dl[@title='Informations sur le livre']")
for x in isbno.getiterator('dt'):
if x.text == 'ISBN' and check_isbn(x.getnext().text_content()):
self.isbnf = True
break
return self
def check_cover(self):
return True if self.urlimg else False
def get_cover(self, browser, timeout = 5.):
try:
cover, ext = browser.open_novisit(self.urlimg, timeout=timeout).read(), \
self.urlimg.rpartition('.')[-1]
return cover, ext if ext else 'jpg'
except Exception, err:
if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
err = NiceBooksError(_('Nicebooks timed out. Try again later.'))
raise err
if not len(self.urlimg):
if not self.isbnf:
raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher'))
def search(title=None, author=None, publisher=None, isbn=None,
max_results=5, verbose=False, keywords=None):
br = browser()
entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
keywords=keywords, max_results=max_results)(br, verbose)
if entries is None or len(entries) == 0:
return
#List of entry
ans = ResultList()
ans.populate(entries, br, verbose)
return ans
def check_for_cover(isbn):
br = browser()
entry = Query(isbn=isbn, max_results=1)(br, False)[0]
return Covers(isbn)(entry).check_cover()
def cover_from_isbn(isbn, timeout = 5.):
br = browser()
entry = Query(isbn=isbn, max_results=1)(br, False, timeout)[0]
return Covers(isbn)(entry).get_cover(br, timeout)
def option_parser():
parser = OptionParser(textwrap.dedent(\
'''\
%prog [options]
Fetch book metadata from Nicebooks. You must specify one of title, author,
ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
so you should make your query as specific as possible.
It can also get covers if the option is activated.
'''
))
parser.add_option('-t', '--title', help='Book title')
parser.add_option('-a', '--author', help='Book author(s)')
parser.add_option('-p', '--publisher', help='Book publisher')
parser.add_option('-i', '--isbn', help='Book ISBN')
parser.add_option('-k', '--keywords', help='Keywords')
parser.add_option('-c', '--covers', default=0,
help='Covers: 1-Check/ 2-Download')
parser.add_option('-p', '--coverspath', default='',
help='Covers files path')
parser.add_option('-m', '--max-results', default=20,
help='Maximum number of results to fetch')
parser.add_option('-v', '--verbose', default=0, action='count',
help='Be more verbose about errors')
return parser
def main(args=sys.argv):
import os
parser = option_parser()
opts, args = parser.parse_args(args)
try:
results = search(opts.title, opts.author, isbn=opts.isbn, publisher=opts.publisher,
keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results)
except AssertionError:
report(True)
parser.print_help()
return 1
if results is None or len(results) == 0:
print 'No result found for this search!'
return 0
for result in results:
print unicode(result).encode(preferred_encoding, 'replace')
covact = int(opts.covers)
if covact == 1:
textcover = 'No cover found!'
if check_for_cover(result.isbn):
textcover = 'A cover was found for this book'
print textcover
elif covact == 2:
cover_data, ext = cover_from_isbn(result.isbn)
cpath = result.isbn
if len(opts.coverspath):
cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)
oname = os.path.abspath(cpath+'.'+ext)
open(oname, 'wb').write(cover_data)
print 'Cover saved to file ', oname
print
if __name__ == '__main__':
sys.exit(main())

View File

@ -615,10 +615,14 @@ class LibraryPage(QWizardPage, LibraryUI):
self.emit(SIGNAL('retranslate()')) self.emit(SIGNAL('retranslate()'))
self.init_languages() self.init_languages()
try: try:
if prefs['language'].lower().startswith('zh'): lang = prefs['language'].lower()[:2]
from calibre.customize.ui import enable_plugin metadata_plugins = {
for name in ('Douban Books', 'Douban.com covers'): 'zh' : ('Douban Books', 'Douban.com covers'),
enable_plugin(name) 'fr' : ('Nicebooks', 'Nicebooks covers'),
}.get(lang, [])
from calibre.customize.ui import enable_plugin
for name in metadata_plugins:
enable_plugin(name)
except: except:
pass pass