From fabef627e3dd85d06989551614db5277e72021c7 Mon Sep 17 00:00:00 2001
From: Byron Li
Date: Mon, 25 Apr 2011 21:11:24 +0800
Subject: [PATCH] Add a douban.com plugin stub. Not working yet.

---
 src/calibre/customize/builtins.py             |   5 +-
 src/calibre/ebooks/metadata/sources/douban.py | 345 ++++++++++++++++++
 2 files changed, 348 insertions(+), 2 deletions(-)
 create mode 100644 src/calibre/ebooks/metadata/sources/douban.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index c27fa2a57b..3c769f8dc7 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -628,8 +628,9 @@ if test_eight_code:
     from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
     from calibre.ebooks.metadata.sources.isbndb import ISBNDB
     from calibre.ebooks.metadata.sources.overdrive import OverDrive
-
-    plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
+    from calibre.ebooks.metadata.sources.douban import Douban
+
+    plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
 # }}}
 else:

diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py
new file mode 100644
index 0000000000..b50bb6ff85
--- /dev/null
+++ b/src/calibre/ebooks/metadata/sources/douban.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal; 2011, Li Fanxi'
+__docformat__ = 'restructuredtext en'
+
+import time
+from urllib import urlencode
+from functools import partial
+from Queue import Queue, Empty
+
+from lxml import etree
+
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.sources.base import Source
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.utils.date import parse_date, utcnow
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre import as_unicode
+
+NAMESPACES = {
+        'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+        'atom' : 'http://www.w3.org/2005/Atom',
+        'db': 'http://www.douban.com/xmlns/',
+        'gd': 'http://schemas.google.com/g/2005'
+        }
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
+total_results = XPath('//openSearch:totalResults')
+start_index = XPath('//openSearch:startIndex')
+items_per_page = XPath('//openSearch:itemsPerPage')
+entry = XPath('//atom:entry')
+entry_id = XPath('descendant::atom:id')
+title = XPath('descendant::atom:title')
+description = XPath('descendant::atom:summary')
+publisher = XPath("descendant::db:attribute[@name='publisher']")
+isbn = XPath("descendant::db:attribute[@name='isbn13']")
+date = XPath("descendant::db:attribute[@name='pubdate']")
+creator = XPath("descendant::db:attribute[@name='author']")
+booktag = XPath("descendant::db:tag")
+# TODO: gd:rating and the rel="image" cover link are assumed from Douban's
+# GData-style feeds; verify against live API responses
+rating = XPath('descendant::gd:rating')
+cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+
+def get_details(browser, url, timeout): # {{{
+    try:
+        raw = browser.open_novisit(url, timeout=timeout).read()
+    except Exception as e:
+        gc = getattr(e, 'getcode', lambda : -1)
+        if gc() != 403:
+            raise
+        # Douban is throttling us, wait a little
+        time.sleep(2)
+        raw = browser.open_novisit(url, timeout=timeout).read()
+
+    return raw
+# }}}
+
+def to_metadata(browser, log, entry_, timeout): # {{{
+
+    def get_text(extra, x):
+        try:
+            ans = x(extra)
+            if ans:
+                ans = ans[0].text
+                if ans and ans.strip():
+                    return ans.strip()
+        except:
+            log.exception('Programming error:')
+        return None
+
+    id_url = entry_id(entry_)[0].text
+    douban_id = id_url.split('/')[-1]
+    title_ = ': '.join([x.text for x in title(entry_)]).strip()
+    authors = [x.text.strip() for x in creator(entry_) if x.text]
+    if not authors:
+        authors = [_('Unknown')]
+    if not id_url or not title_:
+        # Silently discard this entry
+        return None
+
+    mi = Metadata(title_, authors)
+    mi.identifiers = {'douban':douban_id}
+    try:
+        raw = get_details(browser, id_url, timeout)
+        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
+            strip_encoding_pats=True)[0])
+        extra = entry(feed)[0]
+    except:
+        log.exception('Failed to get additional details for', mi.title)
+        return mi
+
+    mi.comments = get_text(extra, description)
+    #mi.language = get_text(extra, language) # language currently disabled
+    mi.publisher = get_text(extra, publisher)
+
+    # ISBN
+    isbns = []
+    for x in isbn(extra):
+        t = check_isbn(x.text)
+        if t:
+            isbns.append(t)
+    if isbns:
+        mi.isbn = sorted(isbns, key=len)[-1]
+    mi.all_isbns = isbns
+
+    # Tags: Douban carries the tag value in the name attribute
+    try:
+        btags = [x.attrib['name'] for x in booktag(extra)
+                if x.attrib.get('name')]
+        tags = []
+        for t in btags:
+            if t not in tags:
+                tags.append(t)
+    except:
+        log.exception('Failed to parse tags:')
+        tags = []
+    if tags:
+        mi.tags = [x.replace(',', ';') for x in tags]
+
+    # pubdate
+    pubdate = get_text(extra, date)
+    if pubdate:
+        try:
+            default = utcnow().replace(day=15)
+            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
+        except:
+            log.error('Failed to parse pubdate %r'%pubdate)
+
+    # Ratings: Douban rates out of 10, calibre out of 5
+    for x in rating(extra):
+        try:
+            mi.rating = float(x.get('average'))
+            if mi.rating > 5:
+                mi.rating /= 2
+        except:
+            log.exception('Failed to parse rating')
+
+    # Cover
+    mi.has_douban_cover = None
+    covers = cover_url(extra)
+    if covers:
+        mi.has_douban_cover = covers[0]
+
+    return mi
+# }}}
+
+class Douban(Source):
+
+    name = 'Douban Books'
+    author = 'Li Fanxi'
+
+    description = _('Downloads metadata from Douban.com')
+
+    capabilities = frozenset(['identify', 'cover'])
+    touched_fields = frozenset(['title', 'authors', 'tags',
+        'comments', 'publisher', 'identifier:isbn', 'rating',
+        'identifier:douban']) # language currently disabled
+    supports_gzip_transfer_encoding = True
+    cached_cover_url_is_reliable = True
+
+    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
+
+    def get_book_url(self, identifiers): # {{{
+        db = identifiers.get('douban', None)
+        if db is not None:
+            return self.DOUBAN_BOOK_URL % db
+        return None
+    # }}}
+
+    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
+        SEARCH_URL = 'http://api.douban.com/book/subjects?'
+        ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+
+        q = ''
+        t = None
+        isbn = check_isbn(identifiers.get('isbn', None))
+        if isbn is not None:
+            q = isbn
+            t = 'isbn'
+        elif title or authors:
+            def build_term(prefix, parts):
+                return ' '.join(x for x in parts)
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                q += build_term('title', title_tokens)
+            author_tokens = self.get_author_tokens(authors,
+                    only_first_author=True)
+            if author_tokens:
+                q += ((' ' if q != '' else '') +
+                    build_term('author', author_tokens))
+            t = 'search'
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        if not q:
+            return None
+        if t == 'isbn':
+            url = ISBN_URL + q
+        else:
+            url = SEARCH_URL + urlencode({
+                'q': q,
+                })
+        if self.DOUBAN_API_KEY:
+            # SEARCH_URL already carries a query string, so join with '&'
+            url += ('&' if '?' in url else '?') + 'apikey=' + self.DOUBAN_API_KEY
+        return url
+    # }}}
+
+    def download_cover(self, log, result_queue, abort, # {{{
+            title=None, authors=None, identifiers={}, timeout=30):
+        cached_url = self.get_cached_cover_url(identifiers)
+        if cached_url is None:
+            log.info('No cached cover found, running identify')
+            rq = Queue()
+            self.identify(log, rq, abort, title=title, authors=authors,
+                    identifiers=identifiers)
+            if abort.is_set():
+                return
+            results = []
+            while True:
+                try:
+                    results.append(rq.get_nowait())
+                except Empty:
+                    break
+            results.sort(key=self.identify_results_keygen(
+                title=title, authors=authors, identifiers=identifiers))
+            for mi in results:
+                cached_url = self.get_cached_cover_url(mi.identifiers)
+                if cached_url is not None:
+                    break
+        if cached_url is None:
+            log.info('No cover found')
+            return
+
+        if abort.is_set():
+            return
+        br = self.browser
+        log('Downloading cover from:', cached_url)
+        try:
+            cdata = br.open_novisit(cached_url, timeout=timeout).read()
+            if cdata:
+                result_queue.put((self, cdata))
+        except:
+            log.exception('Failed to download cover from:', cached_url)
+    # }}}
+
+    def get_cached_cover_url(self, identifiers): # {{{
+        url = None
+        db = identifiers.get('douban', None)
+        if db is None:
+            isbn = identifiers.get('isbn', None)
+            if isbn is not None:
+                db = self.cached_isbn_to_identifier(isbn)
+        if db is not None:
+            url = self.cached_identifier_to_cover_url(db)
+
+        return url
+    # }}}
+
+    def get_all_details(self, br, log, entries, abort, # {{{
+            result_queue, timeout):
+        for relevance, i in enumerate(entries):
+            try:
+                ans = to_metadata(br, log, i, timeout)
+                if isinstance(ans, Metadata):
+                    ans.source_relevance = relevance
+                    db = ans.identifiers['douban']
+                    for isbn in getattr(ans, 'all_isbns', []):
+                        self.cache_isbn_to_identifier(isbn, db)
+                    if ans.has_douban_cover:
+                        self.cache_identifier_to_cover_url(db,
+                                ans.has_douban_cover)
+                    self.clean_downloaded_metadata(ans)
+                    result_queue.put(ans)
+            except:
+                log.exception(
+                    'Failed to get metadata for identify entry:',
+                    etree.tostring(i))
+            if abort.is_set():
+                break
+    # }}}
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
+            identifiers={}, timeout=30):
+        query = self.create_query(log, title=title, authors=authors,
+                identifiers=identifiers)
+        if not query:
+            log.error('Insufficient metadata to construct query')
+            return
+        br = self.browser
+        try:
+            raw = br.open_novisit(query, timeout=timeout).read()
+        except Exception as e:
+            log.exception('Failed to make identify query: %r'%query)
+            return as_unicode(e)
+
+        try:
+            parser = etree.XMLParser(recover=True, no_network=True)
+            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
+                strip_encoding_pats=True)[0], parser=parser)
+            entries = entry(feed)
+        except Exception as e:
+            log.exception('Failed to parse identify results')
+            return as_unicode(e)
+
+        if not entries and identifiers and title and authors and \
+                not abort.is_set():
+            # Retry as a plain title/author search if the identifier
+            # lookup found nothing
+            return self.identify(log, result_queue, abort, title=title,
+                    authors=authors, timeout=timeout)
+
+        # There is no point running these queries in threads as the API
+        # throttles rapid requests, returning 403 Forbidden errors
+        self.get_all_details(br, log, entries, abort, result_queue, timeout)
+
+        return None
+    # }}}
+
+if __name__ == '__main__': # tests {{{
+    # To run these tests use:
+    # calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
+    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
+            title_test, authors_test)
+    test_identify_plugin(Douban.name,
+        [
+
+            (
+                {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby',
+                    'authors':['Fitzgerald']},
+                [title_test('The great gatsby', exact=True),
+                 authors_test(['Francis Scott Fitzgerald'])]
+            ),
+
+            (
+                {'title': 'Flatland', 'authors':['Abbott']},
+                [title_test('Flatland', exact=False)]
+            ),
+        ])
+# }}}
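
For reference, below is a minimal standalone sketch (not part of the patch) of the two URL forms that create_query builds against the Douban API, using the endpoints from the plugin above. build_query_url is a hypothetical helper for illustration only, not a calibre API, and the apikey value is a placeholder:

from urllib import urlencode

SEARCH_URL = 'http://api.douban.com/book/subjects?'
ISBN_URL = 'http://api.douban.com/book/subject/isbn/'

def build_query_url(isbn=None, terms=None, apikey=None):
    # hypothetical helper mirroring Douban.create_query above
    if isbn:
        # ISBN lookups use a path-style endpoint
        url = ISBN_URL + isbn
    else:
        # free-text search; SEARCH_URL already ends in '?'
        url = SEARCH_URL + urlencode({'q': ' '.join(terms)})
    if apikey:
        # join with '&' when a query string is already present
        url += ('&' if '?' in url else '?') + 'apikey=' + apikey
    return url

print(build_query_url(isbn='0743273567'))
# http://api.douban.com/book/subject/isbn/0743273567
print(build_query_url(terms=['great', 'gatsby'], apikey='0123456789abcdef'))
# http://api.douban.com/book/subjects?q=great+gatsby&apikey=0123456789abcdef

The ISBN lookup addresses a path-style endpoint, so the API key starts a new query string with '?'; the free-text search already carries q=... after SEARCH_URL's trailing '?', so the key must be appended with '&'.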