From fabef627e3dd85d06989551614db5277e72021c7 Mon Sep 17 00:00:00 2001 From: Byron Li Date: Mon, 25 Apr 2011 21:11:24 +0800 Subject: [PATCH 1/3] Add a douban.com plugin stub. Not working yet. --- src/calibre/customize/builtins.py | 5 +- src/calibre/ebooks/metadata/sources/douban.py | 361 ++++++++++++++++++ 2 files changed, 364 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/metadata/sources/douban.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index c27fa2a57b..3c769f8dc7 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -628,8 +628,9 @@ if test_eight_code: from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary from calibre.ebooks.metadata.sources.isbndb import ISBNDB from calibre.ebooks.metadata.sources.overdrive import OverDrive - - plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive] + from calibre.ebooks.metadata.sources.douban import Douban + + plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban] # }}} else: diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py new file mode 100644 index 0000000000..b50bb6ff85 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal ; 2011, Li Fanxi ' +__docformat__ = 'restructuredtext en' + +import time, hashlib +from urllib import urlencode +from functools import partial +from Queue import Queue, Empty + +from lxml import etree + +from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.chardet import xml_to_unicode +from calibre.utils.date import parse_date, utcnow +from calibre.utils.cleantext import clean_ascii_chars +from calibre import as_unicode + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'dc' : 'http://purl.org/dc/terms', + 'gd' : 'http://schemas.google.com/g/2005' + } + +NAMESPACES = { + 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', + 'atom' : 'http://www.w3.org/2005/Atom', + 'db': 'http://www.douban.com/xmlns/' + } +XPath = partial(etree.XPath, namespaces=NAMESPACES) +total_results = XPath('//openSearch:totalResults') +start_index = XPath('//openSearch:startIndex') +items_per_page = XPath('//openSearch:itemsPerPage') +entry = XPath('//atom:entry') +entry_id = XPath('descendant::atom:id') +title = XPath('descendant::atom:title') +description = XPath('descendant::atom:summary') +publisher = XPath("descendant::db:attribute[@name='publisher']") +isbn = XPath("descendant::db:attribute[@name='isbn13']") +date = XPath("descendant::db:attribute[@name='pubdate']") +creator = XPath("descendant::db:attribute[@name='author']") +tag = XPath("descendant::db:tag") + +def get_details(browser, url, timeout): # {{{ + try: + raw = browser.open_novisit(url, timeout=timeout).read() + except Exception as e: + gc = getattr(e, 'getcode', lambda : -1) + if gc() != 403: + raise + # Google is throttling us, wait a little + time.sleep(2) + raw = browser.open_novisit(url, timeout=timeout).read() + + return raw +# }}} + +def to_metadata(browser, log, entry_, timeout): # {{{ + + def get_text(extra, x): + try: + ans = 
x(extra)
+            if ans:
+                ans = ans[0].text
+                if ans and ans.strip():
+                    return ans.strip()
+        except:
+            log.exception('Programming error:')
+        return None
+
+
+    id_url = entry_id(entry_)[0].text
+    google_id = id_url.split('/')[-1]
+    title_ = ': '.join([x.text for x in title(entry_)]).strip()
+    authors = [x.text.strip() for x in creator(entry_) if x.text]
+    if not authors:
+        authors = [_('Unknown')]
+    if not id_url or not title_:
+        # Silently discard this entry
+        return None
+
+    mi = Metadata(title_, authors)
+    mi.identifiers = {'google':google_id}
+    try:
+        raw = get_details(browser, id_url, timeout)
+        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
+            strip_encoding_pats=True)[0])
+        extra = entry(feed)[0]
+    except:
+        log.exception('Failed to get additional details for', mi.title)
+        return mi
+
+    mi.comments = get_text(extra, description)
+    #mi.language = get_text(extra, language)
+    mi.publisher = get_text(extra, publisher)
+
+    # ISBN
+    isbns = []
+    for x in identifier(extra):
+        t = str(x.text).strip()
+        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
+            if t[:5].upper() == 'ISBN:':
+                t = check_isbn(t[5:])
+                if t:
+                    isbns.append(t)
+    if isbns:
+        mi.isbn = sorted(isbns, key=len)[-1]
+    mi.all_isbns = isbns
+
+    # Tags
+    try:
+        btags = [x.text for x in subject(extra) if x.text]
+        tags = []
+        for t in btags:
+            atags = [y.strip() for y in t.split('/')]
+            for tag in atags:
+                if tag not in tags:
+                    tags.append(tag)
+    except:
+        log.exception('Failed to parse tags:')
+        tags = []
+    if tags:
+        mi.tags = [x.replace(',', ';') for x in tags]
+
+    # pubdate
+    pubdate = get_text(extra, date)
+    if pubdate:
+        try:
+            default = utcnow().replace(day=15)
+            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
+        except:
+            log.error('Failed to parse pubdate %r'%pubdate)
+
+    # Ratings
+    for x in rating(extra):
+        try:
+            mi.rating = float(x.get('average'))
+            if mi.rating > 5:
+                mi.rating /= 2
+        except:
+            log.exception('Failed to parse rating')
+
+    # Cover
+    mi.has_google_cover = None
+    for x in extra.xpath(
+            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
+        mi.has_google_cover = x.get('href')
+        break
+
+    return mi
+# }}}
+
+class Douban(Source):
+
+    name = 'Douban Books'
+    author = _('Li Fanxi')
+
+    description = _('Downloads metadata from Douban.com')
+
+    capabilities = frozenset(['identify', 'cover'])
+    touched_fields = frozenset(['title', 'authors', 'tags',
+        'comments', 'publisher', 'identifier:isbn', 'rating',
+        'identifier:douban']) # language currently disabled
+    supports_gzip_transfer_encoding = True
+    cached_cover_url_is_reliable = True
+
+    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+# GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
+
+# DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])
+
+    def get_book_url(self, identifiers): # {{{
+        db = identifiers.get('douban', None)
+        if db is not None:
+            return db
+        else:
+            return None
+    # }}}
+
+    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
+        SEARCH_URL = 'http://api.douban.com/book/subjects?'
+ ISBN_URL = 'http://api.douban.com/book/subject/isbn/' + + q = '' + t = None + isbn = check_isbn(identifiers.get('isbn', None)) + if isbn is not None: + q = isbn + t = 'isbn' + elif title or authors: + def build_term(prefix, parts): + return ' '.join(x for x in parts) + title_tokens = list(self.get_title_tokens(title)) + if title_tokens: + q += build_term('title', title_tokens) + author_tokens = self.get_author_tokens(authors, + only_first_author=True) + if author_tokens: + q += ((' ' if q != '' else '') + + build_term('author', author_tokens)) + t = 'search' + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + print(q) + url = None + if t == "isbn": + url = ISBN_URL + q + else: + url = SEARCH_URL + urlencode({ + 'q': q, + }) + if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': + url = url + "?apikey=" + self.DOUBAN_API_KEY + print(url) + return url + # }}} + + def download_cover(self, log, result_queue, abort, # {{{ + title=None, authors=None, identifiers={}, timeout=30): + cached_url = self.get_cached_cover_url(identifiers) + if cached_url is None: + log.info('No cached cover found, running identify') + rq = Queue() + self.identify(log, rq, abort, title=title, authors=authors, + identifiers=identifiers) + if abort.is_set(): + return + results = [] + while True: + try: + results.append(rq.get_nowait()) + except Empty: + break + results.sort(key=self.identify_results_keygen( + title=title, authors=authors, identifiers=identifiers)) + for mi in results: + cached_url = self.get_cached_cover_url(mi.identifiers) + if cached_url is not None: + break + if cached_url is None: + log.info('No cover found') + return + + if abort.is_set(): + return + br = self.browser + log('Downloading cover from:', cached_url) + try: + cdata = br.open_novisit(cached_url, timeout=timeout).read() + if cdata: + if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5: + log.warning('Google returned a dummy image, ignoring') + else: + result_queue.put((self, cdata)) + except: + log.exception('Failed to download cover from:', cached_url) + + # }}} + + def get_cached_cover_url(self, identifiers): # {{{ + url = None + goog = identifiers.get('google', None) + if goog is None: + isbn = identifiers.get('isbn', None) + if isbn is not None: + goog = self.cached_isbn_to_identifier(isbn) + if goog is not None: + url = self.cached_identifier_to_cover_url(goog) + + return url + # }}} + + def get_all_details(self, br, log, entries, abort, # {{{ + result_queue, timeout): + for relevance, i in enumerate(entries): + try: + ans = to_metadata(br, log, i, timeout) + if isinstance(ans, Metadata): + ans.source_relevance = relevance + goog = ans.identifiers['google'] + for isbn in getattr(ans, 'all_isbns', []): + self.cache_isbn_to_identifier(isbn, goog) + if ans.has_google_cover: + self.cache_identifier_to_cover_url(goog, + self.GOOGLE_COVER%goog) + self.clean_downloaded_metadata(ans) + result_queue.put(ans) + except: + log.exception( + 'Failed to get metadata for identify entry:', + etree.tostring(i)) + if abort.is_set(): + break + # }}} + + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ + identifiers={}, timeout=30): + query = self.create_query(log, title=title, authors=authors, + identifiers=identifiers) + if not query: + log.error('Insufficient metadata to construct query') + return + br = self.browser + try: + raw = br.open_novisit(query, timeout=timeout).read() + except Exception as e: + log.exception('Failed to make identify query: %r'%query) + return as_unicode(e) + + 
try: + parser = etree.XMLParser(recover=True, no_network=True) + feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), + strip_encoding_pats=True)[0], parser=parser) + entries = entry(feed) + except Exception as e: + log.exception('Failed to parse identify results') + return as_unicode(e) + + if not entries and identifiers and title and authors and \ + not abort.is_set(): + return self.identify(log, result_queue, abort, title=title, + authors=authors, timeout=timeout) + + # There is no point running these queries in threads as google + # throttles requests returning 403 Forbidden errors + self.get_all_details(br, log, entries, abort, result_queue, timeout) + + return None + # }}} + +if __name__ == '__main__': # tests {{{ + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py + from calibre.ebooks.metadata.sources.test import (test_identify_plugin, + title_test, authors_test) + test_identify_plugin(GoogleBooks.name, + [ + + + ( + {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby', + 'authors':['Fitzgerald']}, + [title_test('The great gatsby', exact=True), + authors_test(['Francis Scott Fitzgerald'])] + ), + + ( + {'title': 'Flatland', 'authors':['Abbott']}, + [title_test('Flatland', exact=False)] + ), + ]) +# }}} + From ea4b5b9054765bb737179d904c9168846def2e45 Mon Sep 17 00:00:00 2001 From: Byron Li Date: Fri, 29 Apr 2011 16:29:57 +0800 Subject: [PATCH 2/3] First working version of Douban book plugin. --- src/calibre/ebooks/metadata/sources/douban.py | 83 +++++++++---------- 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index b50bb6ff85..8f1794b33f 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -25,14 +25,8 @@ from calibre import as_unicode NAMESPACES = { 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', 'atom' : 'http://www.w3.org/2005/Atom', - 'dc' : 'http://purl.org/dc/terms', - 'gd' : 'http://schemas.google.com/g/2005' - } - -NAMESPACES = { - 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', - 'atom' : 'http://www.w3.org/2005/Atom', - 'db': 'http://www.douban.com/xmlns/' + 'db': 'http://www.douban.com/xmlns/', + 'gd': 'http://schemas.google.com/g/2005' } XPath = partial(etree.XPath, namespaces=NAMESPACES) total_results = XPath('//openSearch:totalResults') @@ -47,6 +41,8 @@ isbn = XPath("descendant::db:attribute[@name='isbn13']") date = XPath("descendant::db:attribute[@name='pubdate']") creator = XPath("descendant::db:attribute[@name='author']") tag = XPath("descendant::db:tag") +rating = XPath("descendant::gd:rating[@name='average']") +cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") def get_details(browser, url, timeout): # {{{ try: @@ -77,7 +73,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ id_url = entry_id(entry_)[0].text - google_id = id_url.split('/')[-1] + douban_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: @@ -87,7 +83,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ return None mi = Metadata(title_, authors) - mi.identifiers = {'google':google_id} + mi.identifiers = {'douban':douban_id} try: raw = get_details(browser, id_url, timeout) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), @@ -103,13 +99,9 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # 
ISBN isbns = [] - for x in identifier(extra): - t = str(x.text).strip() - if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): - if t[:5].upper() == 'ISBN:': - t = check_isbn(t[5:]) - if t: - isbns.append(t) + for x in [t.text for t in isbn(extra)]: + if check_isbn(x): + isbns.append(x) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns @@ -139,21 +131,23 @@ def to_metadata(browser, log, entry_, timeout): # {{{ log.error('Failed to parse pubdate %r'%pubdate) # Ratings - for x in rating(extra): + if rating(extra): try: - mi.rating = float(x.get('average')) - if mi.rating > 5: - mi.rating /= 2 + mi.rating = float(rating(extra).text) / 2.0 except: log.exception('Failed to parse rating') + mi.rating = 0 # Cover - mi.has_google_cover = None - for x in extra.xpath( - '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'): - mi.has_google_cover = x.get('href') - break - + mi.has_douban_cover = None + u = cover_url(extra) + print(u) + if u: + u = u[0].replace('/spic/', '/lpic/'); + print(u) + # If URL contains "book-default", the book doesn't have a cover + if u.find('book-default') == -1: + mi.has_douban_cover = u return mi # }}} @@ -172,6 +166,7 @@ class Douban(Source): cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' + DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s' # GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1' # DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657']) @@ -179,7 +174,7 @@ class Douban(Source): def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: - return db + return DOUBAN_ID_URL % db else: return None # }}} @@ -206,11 +201,11 @@ class Douban(Source): q += ((' ' if q != '' else '') + build_term('author', author_tokens)) t = 'search' + q = q.strip() if isinstance(q, unicode): q = q.encode('utf-8') if not q: return None - print(q) url = None if t == "isbn": url = ISBN_URL + q @@ -220,7 +215,6 @@ class Douban(Source): }) if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': url = url + "?apikey=" + self.DOUBAN_API_KEY - print(url) return url # }}} @@ -257,10 +251,7 @@ class Douban(Source): try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: - if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5: - log.warning('Google returned a dummy image, ignoring') - else: - result_queue.put((self, cdata)) + result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) @@ -268,13 +259,13 @@ class Douban(Source): def get_cached_cover_url(self, identifiers): # {{{ url = None - goog = identifiers.get('google', None) - if goog is None: + db = identifiers.get('douban', None) + if db is None: isbn = identifiers.get('isbn', None) if isbn is not None: - goog = self.cached_isbn_to_identifier(isbn) - if goog is not None: - url = self.cached_identifier_to_cover_url(goog) + db = self.cached_isbn_to_identifier(isbn) + if db is not None: + url = self.cached_identifier_to_cover_url(db) return url # }}} @@ -286,12 +277,12 @@ class Douban(Source): ans = to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): ans.source_relevance = relevance - goog = ans.identifiers['google'] + db = ans.identifiers['douban'] for isbn in getattr(ans, 'all_isbns', []): - self.cache_isbn_to_identifier(isbn, goog) - if ans.has_google_cover: - self.cache_identifier_to_cover_url(goog, - self.GOOGLE_COVER%goog) + self.cache_isbn_to_identifier(isbn, db) + if ans.has_douban_cover: + 
self.cache_identifier_to_cover_url(db, + ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: @@ -315,7 +306,6 @@ class Douban(Source): except Exception as e: log.exception('Failed to make identify query: %r'%query) return as_unicode(e) - try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), @@ -324,7 +314,8 @@ class Douban(Source): except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) - + if not title: + title = "" if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, From 4bdbab22ca6e8818b76e0ae98ec30094dd00622d Mon Sep 17 00:00:00 2001 From: Li Fanxi Date: Sun, 8 May 2011 22:28:47 +0800 Subject: [PATCH 3/3] Finish the Douban.com books metadata source plugin --- src/calibre/ebooks/metadata/sources/douban.py | 55 +++++++++---------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py index 8f1794b33f..7a8619261b 100644 --- a/src/calibre/ebooks/metadata/sources/douban.py +++ b/src/calibre/ebooks/metadata/sources/douban.py @@ -40,8 +40,8 @@ publisher = XPath("descendant::db:attribute[@name='publisher']") isbn = XPath("descendant::db:attribute[@name='isbn13']") date = XPath("descendant::db:attribute[@name='pubdate']") creator = XPath("descendant::db:attribute[@name='author']") -tag = XPath("descendant::db:tag") -rating = XPath("descendant::gd:rating[@name='average']") +booktag = XPath("descendant::db:tag/attribute::name") +rating = XPath("descendant::gd:rating/attribute::average") cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") def get_details(browser, url, timeout): # {{{ @@ -51,7 +51,7 @@ def get_details(browser, url, timeout): # {{{ gc = getattr(e, 'getcode', lambda : -1) if gc() != 403: raise - # Google is throttling us, wait a little + # Douban is throttling us, wait a little time.sleep(2) raw = browser.open_novisit(url, timeout=timeout).read() @@ -59,7 +59,6 @@ def get_details(browser, url, timeout): # {{{ # }}} def to_metadata(browser, log, entry_, timeout): # {{{ - def get_text(extra, x): try: ans = x(extra) @@ -71,7 +70,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{ log.exception('Programming error:') return None - id_url = entry_id(entry_)[0].text douban_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() @@ -92,9 +90,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ except: log.exception('Failed to get additional details for', mi.title) return mi - mi.comments = get_text(extra, description) - #mi.language = get_text(extra, language) mi.publisher = get_text(extra, publisher) # ISBN @@ -108,7 +104,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # Tags try: - btags = [x.text for x in subject(extra) if x.text] + btags = [x for x in booktag(extra) if x] tags = [] for t in btags: atags = [y.strip() for y in t.split('/')] @@ -120,7 +116,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] - + # pubdate pubdate = get_text(extra, date) if pubdate: @@ -133,7 +129,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{ # Ratings if rating(extra): try: - mi.rating = float(rating(extra).text) / 2.0 + mi.rating = float(rating(extra)[0]) / 2.0 except: log.exception('Failed to parse rating') mi.rating = 0 
@@ -141,10 +137,8 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     # Cover
     mi.has_douban_cover = None
     u = cover_url(extra)
-    print(u)
     if u:
         u = u[0].replace('/spic/', '/lpic/');
-        print(u)
         # If URL contains "book-default", the book doesn't have a cover
         if u.find('book-default') == -1:
             mi.has_douban_cover = u
@@ -155,26 +149,24 @@ class Douban(Source):
 
     name = 'Douban Books'
     author = _('Li Fanxi')
+    version = (2, 0, 0)
 
     description = _('Downloads metadata from Douban.com')
 
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'tags',
-        'comments', 'publisher', 'identifier:isbn', 'rating',
+        'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
         'identifier:douban']) # language currently disabled
     supports_gzip_transfer_encoding = True
     cached_cover_url_is_reliable = True
 
     DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
-    DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s'
-# GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
-
-# DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])
+    DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
 
     def get_book_url(self, identifiers): # {{{
         db = identifiers.get('douban', None)
         if db is not None:
-            return DOUBAN_ID_URL % db
+            return ('douban', db, self.DOUBAN_BOOK_URL%db)
         else:
             return None
     # }}}
@@ -182,13 +174,18 @@ class Douban(Source):
     def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
         SEARCH_URL = 'http://api.douban.com/book/subjects?'
         ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+        SUBJECT_URL = 'http://api.douban.com/book/subject/'
 
         q = ''
         t = None
         isbn = check_isbn(identifiers.get('isbn', None))
+        subject = identifiers.get('douban', None)
         if isbn is not None:
             q = isbn
             t = 'isbn'
+        elif subject is not None:
+            q = subject
+            t = 'subject'
         elif title or authors:
             def build_term(prefix, parts):
                 return ' '.join(x for x in parts)
@@ -209,6 +206,8 @@ class Douban(Source):
         url = None
         if t == "isbn":
             url = ISBN_URL + q
+        elif t == 'subject':
+            url = SUBJECT_URL + q
         else:
             url = SEARCH_URL + urlencode({
                 'q': q,
             })
@@ -314,14 +313,12 @@ class Douban(Source):
         except Exception as e:
             log.exception('Failed to parse identify results')
             return as_unicode(e)
-        if not title:
-            title = ""
         if not entries and identifiers and title and authors and \
                 not abort.is_set():
             return self.identify(log, result_queue, abort, title=title,
                     authors=authors, timeout=timeout)
 
-        # There is no point running these queries in threads as google
+        # There is no point running these queries in threads as douban
         # throttles requests returning 403 Forbidden errors
         self.get_all_details(br, log, entries, abort, result_queue, timeout)
 
@@ -329,23 +326,23 @@ class Douban(Source):
     # }}}
 
 if __name__ == '__main__': # tests {{{
-    # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
+    # To run these tests, use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
             title_test, authors_test)
-    test_identify_plugin(GoogleBooks.name,
+    test_identify_plugin(Douban.name,
         [
 
 
             (
-                {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby',
-                    'authors':['Fitzgerald']},
-                [title_test('The great gatsby', exact=True),
-                    authors_test(['Francis Scott Fitzgerald'])]
+                {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
+                    'authors':['刘慈欣']},
+                [title_test('三体', exact=True),
+                    authors_test(['刘慈欣'])]
             ),
 
             (
-                {'title': 'Flatland', 'authors':['Abbott']},
-                [title_test('Flatland', exact=False)]
+                {'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
+                [title_test('Linux内核修炼之道', exact=False)]
             ),
         ])
# }}}
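
Appendix (not part of the patches): the parsing technique the series settles on, namespace-qualified XPath over the Atom feed of the old Douban API v1, can be exercised offline. The sketch below is illustrative only; the sample entry is hand-written to match the shape the plugin's XPath expressions expect, and the subject id, rating value and field set are assumed rather than captured from the live API.

    # Offline sketch of the plugin's core parsing step. The sample entry is
    # hypothetical, not a recorded Douban API response.
    from functools import partial
    from lxml import etree

    NAMESPACES = {
        'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
        'atom': 'http://www.w3.org/2005/Atom',
        'db': 'http://www.douban.com/xmlns/',
        'gd': 'http://schemas.google.com/g/2005',
    }
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    # The same expressions patch 3 installs at module level.
    entry = XPath('//atom:entry')
    title = XPath('descendant::atom:title')
    creator = XPath("descendant::db:attribute[@name='author']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    rating = XPath('descendant::gd:rating/attribute::average')

    SAMPLE = '''<feed xmlns="http://www.w3.org/2005/Atom"
          xmlns:db="http://www.douban.com/xmlns/"
          xmlns:gd="http://schemas.google.com/g/2005">
      <entry>
        <id>http://api.douban.com/book/subject/2567698</id>
        <title>三体</title>
        <db:attribute name="author">刘慈欣</db:attribute>
        <db:attribute name="isbn13">9787536692930</db:attribute>
        <gd:rating average="8.9" max="10" min="0"/>
      </entry>
    </feed>'''

    feed = etree.fromstring(SAMPLE)
    entry_ = entry(feed)[0]
    print(title(entry_)[0].text)              # 三体
    print([x.text for x in creator(entry_)])  # ['刘慈欣']
    print(isbn(entry_)[0].text)               # 9787536692930
    print(float(rating(entry_)[0]) / 2.0)     # 4.45: 10-point average halved to calibre's 5-star scale

The final print mirrors patch 3's rating handling, float(rating(extra)[0]) / 2.0, which maps Douban's 10-point average onto calibre's 5-star scale. One loose end worth a follow-up patch: in the search branch of create_query(), SEARCH_URL already ends in '?', so appending "?apikey=" produces a URL with two '?' separators; the isbn and subject branches, whose base URLs carry no query string, are unaffected.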