From 524013c86c13324f64d5bb19a2dae02094978df2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 19 Mar 2009 12:01:32 -0700
Subject: [PATCH] IGN:Working command line interface to the Google Books Data API to fetch book metadata

---
 src/calibre/ebooks/metadata/google_books.py | 291 ++++++++++++++++++++
 src/calibre/gui2/main.py                    |   1 +
 src/calibre/linux.py                        |   9 +-
 3 files changed, 298 insertions(+), 3 deletions(-)
 create mode 100644 src/calibre/ebooks/metadata/google_books.py

diff --git a/src/calibre/ebooks/metadata/google_books.py b/src/calibre/ebooks/metadata/google_books.py
new file mode 100644
index 0000000000..261d559127
--- /dev/null
+++ b/src/calibre/ebooks/metadata/google_books.py
@@ -0,0 +1,291 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import sys, textwrap
+from urllib import urlencode
+from functools import partial
+
+from lxml import etree
+from dateutil import parser
+
+from calibre import browser, preferred_encoding
+from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.config import OptionParser
+
+NAMESPACES = {
+              'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+              'atom' : 'http://www.w3.org/2005/Atom',
+              'dc': 'http://purl.org/dc/terms'
+            }
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
+
+total_results = XPath('//openSearch:totalResults')
+start_index = XPath('//openSearch:startIndex')
+items_per_page = XPath('//openSearch:itemsPerPage')
+entry = XPath('//atom:entry')
+entry_id = XPath('descendant::atom:id')
+creator = XPath('descendant::dc:creator')
+identifier = XPath('descendant::dc:identifier')
+title = XPath('descendant::dc:title')
+date = XPath('descendant::dc:date')
+publisher = XPath('descendant::dc:publisher')
+subject = XPath('descendant::dc:subject')
+description = XPath('descendant::dc:description')
+language = XPath('descendant::dc:language')
+
+def report(verbose):
+    '''
+    Print the traceback of the exception currently being handled, but only
+    when `verbose` is true. Meant to be called from an ``except`` block.
+    '''
+    if verbose:
+        import traceback
+        traceback.print_exc()
+
+
+class Query(object):
+    '''
+    One request against the Google Books volumes feed.
+
+    Calling an instance fetches a single page of results, see
+    :meth:`__call__`.
+    '''
+
+    BASE_URL = 'http://books.google.com/books/feeds/volumes?'
+
+    def __init__(self, title=None, author=None, publisher=None, isbn=None,
+            max_results=20, min_viewability='none', start_index=1):
+        '''
+        Build the query URL. At least one of `title`, `author`, `publisher`
+        and `isbn` must be given; when `isbn` is given the others are
+        ignored.
+
+        Invalid arguments raise AssertionError, which :func:`main` relies
+        on to print the usage message.
+        '''
+        assert not(title is None and author is None and publisher is None and \
+                   isbn is None)
+        assert (max_results < 21)
+        assert (min_viewability in ('none', 'partial', 'full'))
+        if isbn is not None:
+            q = 'isbn:'+isbn
+        else:
+            # One in<field>:<word> term per word. The terms of different
+            # fields must be separated by spaces as well, otherwise the last
+            # title word fuses with the first author term
+            # (intitle:fooinauthor:bar).
+            terms = []
+            for prefix, val in (('title', title), ('author', author),
+                                ('publisher', publisher)):
+                if val is not None:
+                    terms.extend('in'+prefix+':'+x for x in val.split())
+            q = ' '.join(terms)
+
+        self.url = self.BASE_URL+urlencode({
+            'q':q,
+            'max-results':max_results,
+            'start-index':start_index,
+            'min-viewability':min_viewability,
+            })
+
+    def __call__(self, browser, verbose):
+        '''
+        Fetch and parse the feed at `self.url` using `browser`.
+
+        :return: ``(entries, new_start)`` where `new_start` is the start
+                 index for the next page, or 0 when it would be past the
+                 total number of results reported by the feed.
+        '''
+        if verbose:
+            print 'Query:', self.url
+        feed = etree.fromstring(browser.open(self.url).read())
+        total = int(total_results(feed)[0].text)
+        start = int(start_index(feed)[0].text)
+        entries = entry(feed)
+        new_start = start + len(entries)
+        if new_start > total:
+            new_start = 0
+        return entries, new_start
+
+
+class ResultList(list):
+    '''
+    A list of `MetaInformation` objects, filled from feed entries by
+    :meth:`populate`.
+    '''
+
+    def get_description(self, entry, verbose):
+        try:
+            desc = description(entry)
+            if desc:
+                return desc[0].text
+        except Exception:
+            report(verbose)
+
+    def get_language(self, entry, verbose):
+        try:
+            l = language(entry)
+            if l:
+                return l[0].text
+        except Exception:
+            report(verbose)
+
+    def get_title(self, entry):
+        # Use the longest of the dc:title elements. Raises IndexError when
+        # the entry has no usable title.
+        candidates = [x.text for x in title(entry) if x.text]
+        candidates.sort(key=len, reverse=True)
+        return candidates[0]
+
+    def get_authors(self, entry):
+        return [x.text for x in (creator(entry) or [])]
+
+    def get_author_sort(self, entry, verbose):
+        # Return the value of the first attribute whose name ends in file-as
+        for x in creator(entry):
+            for key, val in x.attrib.items():
+                if key.endswith('file-as'):
+                    return val
+
+    def get_identifiers(self, entry, mi):
+        isbns = []
+        for x in identifier(entry):
+            # Do not use str() here: it turns a missing text into 'None'
+            # and fails on non-ASCII unicode text
+            t = (x.text or '').strip()
+            if t[:5].upper() == 'ISBN:':
+                isbns.append(t[5:])
+        if isbns:
+            # Keep the longest ISBN found
+            mi.isbn = sorted(isbns, key=len)[-1]
+
+    def get_tags(self, entry, verbose):
+        try:
+            tags = [x.text for x in subject(entry)]
+        except Exception:
+            report(verbose)
+            tags = []
+        return tags
+
+    def get_publisher(self, entry, verbose):
+        try:
+            pub = publisher(entry)
+            if pub:
+                return pub[0].text
+        except Exception:
+            report(verbose)
+
+    def get_date(self, entry, verbose):
+        try:
+            d = date(entry)
+            if d:
+                d = parser.parse(d[0].text)
+            else:
+                d = None
+        except Exception:
+            report(verbose)
+            d = None
+        return d
+
+    def populate(self, entries, browser, verbose=False):
+        '''
+        Append one `MetaInformation` object per usable entry in `entries`.
+
+        The full record of every entry is fetched from the URL in its
+        atom:id with `browser`; if that fails the data of the search result
+        entry itself is used.
+        '''
+        for x in entries:
+            try:
+                id_url = entry_id(x)[0].text
+                title = self.get_title(x)
+            except Exception:
+                # Skip the entry: carrying on would raise NameError on the
+                # first entry or silently reuse the previous entry's values.
+                report(verbose)
+                continue
+            mi = MetaInformation(title, self.get_authors(x))
+            try:
+                raw = browser.open(id_url).read()
+                feed = etree.fromstring(raw)
+                x = entry(feed)[0]
+            except Exception, e:
+                if verbose:
+                    print 'Failed to get all details for an entry'
+                    print e
+            mi.author_sort = self.get_author_sort(x, verbose)
+            mi.comments = self.get_description(x, verbose)
+            self.get_identifiers(x, mi)
+            mi.tags = self.get_tags(x, verbose)
+            mi.publisher = self.get_publisher(x, verbose)
+            mi.timestamp = self.get_date(x, verbose)
+            mi.language = self.get_language(x, verbose)
+            self.append(mi)
+
+
+def search(title=None, author=None, publisher=None, isbn=None,
+           min_viewability='none', verbose=False, max_results=40):
+    '''
+    Query Google Books page by page and return a :class:`ResultList` of at
+    most `max_results` matches, those with the longest comments first.
+    '''
+    br = browser()
+    start, entries = 1, []
+    while start > 0 and len(entries) < max_results:
+        # Pass the start index along, otherwise every iteration fetches the
+        # first page again.
+        new, start = Query(title=title, author=author, publisher=publisher,
+                       isbn=isbn, min_viewability=min_viewability,
+                       start_index=start)(br, verbose)
+        if not new:
+            break
+        entries.extend(new)
+
+    entries = entries[:max_results]
+
+    ans = ResultList()
+    ans.populate(entries, br, verbose)
+    ans.sort(key=lambda x: len(x.comments or ''), reverse=True)
+    return ans
+
+def option_parser():
+    parser = OptionParser(textwrap.dedent(
+        '''\
+        %prog [options]
+
+        Fetch book metadata from Google. You must specify one of title, author,
+        publisher or ISBN. If you specify ISBN the others are ignored. The
+        number of matches fetched is limited by --max-results, so you should
+        make your query as specific as possible.
+        '''
+    ))
+    parser.add_option('-t', '--title', help='Book title')
+    parser.add_option('-a', '--author', help='Book author(s)')
+    parser.add_option('-p', '--publisher', help='Book publisher')
+    parser.add_option('-i', '--isbn', help='Book ISBN')
+    # type='int' is required: a string value breaks the slicing in search()
+    parser.add_option('-m', '--max-results', default=10, type='int',
+                      help='Maximum number of results to fetch')
+    parser.add_option('-v', '--verbose', default=0, action='count',
+                      help='Be more verbose about errors')
+    return parser
+
+def main(args=sys.argv):
+    parser = option_parser()
+    opts, args = parser.parse_args(args)
+    try:
+        results = search(opts.title, opts.author, opts.publisher, opts.isbn,
+                         verbose=opts.verbose, max_results=opts.max_results)
+    except AssertionError:
+        report(True)
+        parser.print_help()
+        return 1
+    for result in results:
+        print unicode(result).encode(preferred_encoding)
+        print
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/src/calibre/gui2/main.py b/src/calibre/gui2/main.py
index 3ed4b59f75..efd4e528ba 100644
--- a/src/calibre/gui2/main.py
+++ b/src/calibre/gui2/main.py
@@ -93,6 +93,7 @@ class Main(MainWindow, Ui_MainWindow):
         self.viewers = collections.deque()
         self.content_server = None
         self.system_tray_icon = QSystemTrayIcon(QIcon(':/library'), self)
+        self.system_tray_icon.setObjectName('calibre')
         if not config['systray_icon']:
             self.system_tray_icon.hide()
         else:
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index d46041b828..e43336f238 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -62,13 +62,14 @@ entry_points = {
                              'comic2lrf = calibre.ebooks.lrf.comic.convert_from:main',
                              'comic2epub = calibre.ebooks.epub.from_comic:main',
                              'comic2mobi = calibre.ebooks.mobi.from_comic:main',
-                             'comic2pdf = calibre.ebooks.pdf.from_comic:main',
+                             'comic2pdf = calibre.ebooks.pdf.from_comic:main',
                              'calibre-debug = calibre.debug:main',
                              'calibredb = calibre.library.cli:main',
                              'calibre-fontconfig = calibre.utils.fontconfig:main',
                              'calibre-parallel = calibre.parallel:main',
                              'calibre-customize = calibre.customize.ui:main',
-                             'pdftrim = calibre.ebooks.pdf.pdftrim:main' ,
+                             'pdftrim = calibre.ebooks.pdf.pdftrim:main' ,
+                             'google-books = calibre.ebooks.metadata.google_books:main',
                              ],
         'gui_scripts' : [
                             __appname__+' = calibre.gui2.main:main',
@@ -196,7 +197,8 @@ def setup_completion(fatal_errors):
         from calibre.ebooks.epub.from_comic import option_parser as comic2epub
         from calibre.ebooks.mobi.from_any import option_parser as any2mobi
         from calibre.ebooks.mobi.writer import option_parser as oeb2mobi
-        from calibre.gui2.main import option_parser as guiop
+        from calibre.gui2.main import option_parser as guiop
+        from calibre.ebooks.metadata.google_books import option_parser as gbop
         any_formats = ['epub', 'htm', 'html', 'xhtml', 'xhtm', 'rar', 'zip',
              'txt', 'lit', 'rtf', 'pdf', 'prc', 'mobi', 'fb2', 'odt']
         f = open_file('/etc/bash_completion.d/libprs500')
@@ -244,6 +246,7 @@ def setup_completion(fatal_errors):
         f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
         f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles))
         f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles))
+        f.write(opts_and_words('google-books', gbop, []))
         f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml', 'opf']))
         f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
         f.write(opts_and_exts('odt2oeb', odt2oeb, ['odt']))