From a0fc1086364cab8d744530274ac5149ecfdda2f1 Mon Sep 17 00:00:00 2001
From: Sengian
Date: Sat, 13 Nov 2010 15:22:18 +0100
Subject: [PATCH] Adding Fictionwise metadata source
---
src/calibre/customize/builtins.py | 4 +-
src/calibre/ebooks/metadata/fetch.py | 18 ++
src/calibre/ebooks/metadata/fictionwise.py | 351 +++++++++++++++++++++
3 files changed, 371 insertions(+), 2 deletions(-)
create mode 100644 src/calibre/ebooks/metadata/fictionwise.py
diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index bd766827a5..04364b6b28 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -481,7 +481,7 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
- LibraryThing
+ LibraryThing, Fictionwise
from calibre.ebooks.metadata.douban import DoubanBooks
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
LibraryThingCovers, DoubanCovers
@@ -490,7 +490,7 @@ from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins = [HTML2ZIP, PML2PMLZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon,
- LibraryThing, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
+ LibraryThing, Fictionwise, DoubanBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
Epubcheck, OpenLibraryCovers, LibraryThingCovers, DoubanCovers]
plugins += [
ComicInput,
diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index dedd251640..c9d6a74cb2 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -267,6 +267,24 @@ class LibraryThing(MetadataSource): # {{{
# }}}
+class Fictionwise(MetadataSource): # {{{
+
+ author = 'Sengian'
+ name = 'Fictionwise'
+ description = _('Downloads metadata from Fictionwise')
+
+ has_html_comments = True
+
+ def fetch(self):
+ from calibre.ebooks.metadata.fictionwise import search
+ try:
+ self.results = search(self.title, self.book_author, self.publisher,
+ self.isbn, max_results=10, verbose=self.verbose)
+ except Exception, e:
+ self.exception = e
+ self.tb = traceback.format_exc()
+
+ # }}}
def result_index(source, result):
if not result.isbn:
diff --git a/src/calibre/ebooks/metadata/fictionwise.py b/src/calibre/ebooks/metadata/fictionwise.py
new file mode 100644
index 0000000000..2fa9a1bcee
--- /dev/null
+++ b/src/calibre/ebooks/metadata/fictionwise.py
@@ -0,0 +1,351 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2010, sengian <sengian1@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import sys, textwrap, re
+from urllib import urlencode
+
+from lxml import html, etree
+from lxml.html import soupparser
+from lxml.etree import tostring
+
+from calibre import browser, preferred_encoding
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn, \
+ authors_to_sort_string
+from calibre.library.comments import sanitize_comments_html
+from calibre.utils.config import OptionParser
+from calibre.utils.date import parse_date, utcnow
+
+
+def report(verbose):
+ if verbose:
+ import traceback
+ traceback.print_exc()
+
+class Query(object):
+
+ BASE_URL = 'http://www.fictionwise.com/servlet/mw'
+
+ def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20):
+ assert not(title is None and author is None and keywords is None)
+ assert (max_results < 21)
+
+ self.max_results = max_results
+
+ q = { 'template' : 'searchresults_adv.htm' ,
+ 'searchtitle' : '',
+ 'searchauthor' : '',
+ 'searchpublisher' : '',
+ 'searchkeyword' : '',
+ #possibilities startoflast, fullname, lastfirst
+ 'searchauthortype' : 'startoflast',
+ 'searchcategory' : '',
+ 'searchcategory2' : '',
+ 'searchprice_s' : '0',
+ 'searchprice_e' : 'ANY',
+ 'searchformat' : '',
+ 'searchgeo' : 'US',
+ 'searchfwdatetype' : '',
+ #maybe use dates fields if needed?
+ #'sortorder' : 'DESC',
+ #many options available: b.SortTitle, a.SortName,
+ #b.DateFirstPublished, b.FWPublishDate
+ 'sortby' : 'b.SortTitle'
+ }
+ if title is not None:
+ q['searchtitle'] = title
+ if author is not None:
+ q['searchauthor'] = author
+ if publisher is not None:
+ q['searchpublisher'] = publisher
+ if keywords is not None:
+ q['searchkeyword'] = keywords
+
+ if isinstance(q, unicode):
+ q = q.encode('utf-8')
+ self.urldata = urlencode(q)
+
+ def __call__(self, browser, verbose):
+ if verbose:
+ print 'Query:', self.BASE_URL+self.urldata
+
+ try:
+ raw = browser.open_novisit(self.BASE_URL, self.urldata).read()
+ except Exception, e:
+ report(verbose)
+ if callable(getattr(e, 'getcode', None)) and \
+ e.getcode() == 404:
+ return
+ raise
+ if '404 - ' in raw:
+ return
+ raw = xml_to_unicode(raw, strip_encoding_pats=True,
+ resolve_entities=True)[0]
+ try:
+ feed = soupparser.fromstring(raw)
+ except:
+ return
+
+ # get list of results as links
+ results = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]")
+ results = results[:self.max_results]
+ results = [i.xpath('descendant-or-self::a')[0].get('href') for i in results]
+ #return feed if no links ie normally a single book or nothing
+ if not results:
+ results = [feed]
+ return results
+
+class ResultList(list):
+
+ BASE_URL = 'http://www.fictionwise.com'
+ COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0}
+
+ def __init__(self):
+ self.retitle = re.compile(r'\[[^\[\]]+\]')
+ self.rechkauth = re.compile(r'.*book\s*by', re.I)
+        self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)' \
+            + '<br[^>]+>.{,15}publisher\s*:', re.I)
+ self.repub = re.compile(r'.*publisher\s*:\s*', re.I)
+ self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I)
+ self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I)
+        self.resplitbr = re.compile(r'<br[^>]+>', re.I)
+        self.recomment = re.compile(r'(?s)<!--.*?-->')
+        self.reimg = re.compile(r'<img[^>]*>', re.I)
+ self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I)
+        self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:')
+        self.recolor = re.compile('(?P<ncolor>[^/]+).gif')
+        self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I)
+ self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I)
+
+ def strip_tags_etree(self, etreeobj, invalid_tags):
+ for itag in invalid_tags:
+ for elt in etreeobj.getiterator(itag):
+ elt.drop_tag()
+ return etreeobj
+
+ def clean_entry(self, entry,
+ invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'),
+ remove_tags_trees = ('script',)):
+ for it in entry[0].iterchildren(tag='table'):
+ entry[0].remove(it)
+ entry[0].remove(entry[0].xpath( 'descendant-or-self::p[1]')[0])
+ entry = entry[0]
+ cleantree = self.strip_tags_etree(entry, invalid_tags)
+ for itag in remove_tags_trees:
+ for elts in cleantree.getiterator(itag):
+ elts.drop_tree()
+ return cleantree
+
+ def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
+ out = tostring(entry, pretty_print=prettyout)
+ reclean = re.compile('(\n+|\t+|\r+|'+htmlrm+';)')
+ return reclean.sub('', out)
+
+ def get_title(self, entry):
+ title = entry.findtext('./')
+ return self.retitle.sub('', title).strip()
+
+ def get_authors(self, entry):
+ authortext = entry.find('./br').tail
+ if not self.rechkauth.search(authortext):
+ return []
+ #TODO: parse all tag if necessary
+ authortext = self.rechkauth.sub('', authortext)
+ return [a.strip() for a in authortext.split('&')]
+
+ def get_rating(self, entrytable, verbose):
+ nbcomment = tostring(entrytable.getprevious())
+ try:
+ nbcomment = self.renbcom.search(nbcomment).group("nbcom")
+ except:
+ report(verbose)
+ return None
+ hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")],
+ float(image.get('height', default=0))) \
+ for image in entrytable.getiterator('img'))
+ #ratings as x/20, not sure
+ return 5*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues())
+
+ def get_description(self, entry):
+ description = self.output_entry(entry.find('./p'),htmlrm="")
+ description = self.redesc.search(description)
+ if not description and not description.group("desc"):
+ return None
+ #remove invalid tags
+ description = self.reimg.sub('', description.group("desc"))
+ description = self.recomment.sub('', description)
+ description = self.resanitize.sub('', sanitize_comments_html(description))
+        return 'SUMMARY:\n' + re.sub(r'\n\s+<br>','\n', description)
+
+ def get_publisher(self, entry):
+ publisher = self.output_entry(entry.find('./p'))
+ publisher = filter(lambda x: self.repub.search(x) is not None,
+ self.resplitbr.split(publisher))
+ if not len(publisher):
+ return None
+ publisher = self.repub.sub('', publisher[0])
+ return publisher.split(',')[0].strip()
+
+ def get_tags(self, entry):
+ tag = self.output_entry(entry.find('./p'))
+ tag = filter(lambda x: self.retag.search(x) is not None,
+ self.resplitbr.split(tag))
+ if not len(tag):
+ return []
+ return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/'))
+
+ def get_date(self, entry, verbose):
+ date = self.output_entry(entry.find('./p'))
+ date = filter(lambda x: self.redate.search(x) is not None,
+ self.resplitbr.split(date))
+ if not len(date):
+ return None
+ #TODO: parse all tag if necessary
+ try:
+ d = self.redate.sub('', date[0])
+ if d:
+ default = utcnow().replace(day=15)
+ d = parse_date(d, assume_utc=True, default=default)
+ else:
+ d = None
+ except:
+ report(verbose)
+ d = None
+ return d
+
+ def get_ISBN(self, entry):
+ isbns = self.output_entry(entry.getchildren()[2])
+ isbns = filter(lambda x: self.reisbn.search(x) is not None,
+ self.resplitbrdiv.split(isbns))
+ if not len(isbns):
+ return None
+ #TODO: parse all tag if necessary
+ isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
+ return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1]
+
+ def fill_MI(self, entry, title, authors, ratings, verbose):
+ mi = MetaInformation(title, authors)
+ mi.rating = ratings
+ mi.comments = self.get_description(entry)
+ mi.publisher = self.get_publisher(entry)
+ mi.tags = self.get_tags(entry)
+ mi.pubdate = self.get_date(entry, verbose)
+ mi.isbn = self.get_ISBN(entry)
+ mi.author_sort = authors_to_sort_string(authors)
+ # mi.language = self.get_language(x, verbose)
+ return mi
+
+ def get_individual_metadata(self, browser, linkdata):
+ try:
+ raw = browser.open_novisit(self.BASE_URL + linkdata).read()
+ except Exception, e:
+ report(verbose)
+ if callable(getattr(e, 'getcode', None)) and \
+ e.getcode() == 404:
+ return
+ raise
+ if '404 - ' in raw:
+ report(verbose)
+ return
+ raw = xml_to_unicode(raw, strip_encoding_pats=True,
+ resolve_entities=True)[0]
+ try:
+ feed = soupparser.fromstring(raw)
+ except:
+ return
+
+ # get results
+ return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")
+
+ def populate(self, entries, browser, verbose=False):
+ for x in entries:
+ try:
+ entry = self.get_individual_metadata(browser, x)
+ entry = self.clean_entry(entry)
+ title = self.get_title(entry)
+ #ratings: get table for rating then drop
+ for elt in entry.getiterator('table'):
+ ratings = self.get_rating(elt, verbose)
+ elt.getprevious().drop_tree()
+ elt.drop_tree()
+ authors = self.get_authors(entry)
+ except Exception, e:
+ if verbose:
+ print 'Failed to get all details for an entry'
+ print e
+ continue
+ self.append(self.fill_MI(entry, title, authors, ratings, verbose))
+
+ def populate_single(self, feed, verbose=False):
+ try:
+ entry = feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")
+ entry = self.clean_entry(entry)
+ title = self.get_title(entry)
+ #ratings: get table for rating then drop
+ for elt in entry.getiterator('table'):
+ ratings = self.get_rating(elt, verbose)
+ elt.getprevious().drop_tree()
+ elt.drop_tree()
+ authors = self.get_authors(entry)
+ except Exception, e:
+ if verbose:
+ print 'Failed to get all details for an entry'
+ print e
+ return
+ self.append(self.fill_MI(entry, title, authors, ratings, verbose))
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+ min_viewability='none', verbose=False, max_results=5,
+ keywords=None):
+ br = browser()
+ entries = Query(title=title, author=author, publisher=publisher,
+ keywords=keywords, max_results=max_results)(br, verbose)
+
+ #List of entry
+ ans = ResultList()
+ if len(entries) > 1:
+ ans.populate(entries, br, verbose)
+ else:
+ ans.populate_single(entries[0], verbose)
+ return ans
+
+
+def option_parser():
+ parser = OptionParser(textwrap.dedent(\
+ '''\
+ %prog [options]
+
+ Fetch book metadata from Fictionwise. You must specify one of title, author,
+ or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches,
+ so you should make your query as specific as possible.
+ '''
+ ))
+ parser.add_option('-t', '--title', help='Book title')
+ parser.add_option('-a', '--author', help='Book author(s)')
+ parser.add_option('-p', '--publisher', help='Book publisher')
+ parser.add_option('-k', '--keywords', help='Keywords')
+ parser.add_option('-m', '--max-results', default=5,
+ help='Maximum number of results to fetch')
+ parser.add_option('-v', '--verbose', default=0, action='count',
+ help='Be more verbose about errors')
+ return parser
+
+def main(args=sys.argv):
+ parser = option_parser()
+ opts, args = parser.parse_args(args)
+ try:
+ results = search(opts.title, opts.author, publisher=opts.publisher,
+ keywords=opts.keywords, verbose=opts.verbose, max_results=opts.max_results)
+ except AssertionError:
+ report(True)
+ parser.print_help()
+ return 1
+ for result in results:
+ print unicode(result).encode(preferred_encoding, 'replace')
+ print
+
+if __name__ == '__main__':
+ sys.exit(main())