Finish the Douban.com books metadata source plugin

This commit is contained in:
Li Fanxi 2011-05-08 22:28:47 +08:00
parent 7bd9cd20fe
commit 4bdbab22ca

View File

@ -40,8 +40,8 @@ publisher = XPath("descendant::db:attribute[@name='publisher']")
isbn = XPath("descendant::db:attribute[@name='isbn13']") isbn = XPath("descendant::db:attribute[@name='isbn13']")
date = XPath("descendant::db:attribute[@name='pubdate']") date = XPath("descendant::db:attribute[@name='pubdate']")
creator = XPath("descendant::db:attribute[@name='author']") creator = XPath("descendant::db:attribute[@name='author']")
tag = XPath("descendant::db:tag") booktag = XPath("descendant::db:tag/attribute::name")
rating = XPath("descendant::gd:rating[@name='average']") rating = XPath("descendant::gd:rating/attribute::average")
cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href") cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
def get_details(browser, url, timeout): # {{{ def get_details(browser, url, timeout): # {{{
@ -51,7 +51,7 @@ def get_details(browser, url, timeout): # {{{
gc = getattr(e, 'getcode', lambda : -1) gc = getattr(e, 'getcode', lambda : -1)
if gc() != 403: if gc() != 403:
raise raise
# Google is throttling us, wait a little # Douban is throttling us, wait a little
time.sleep(2) time.sleep(2)
raw = browser.open_novisit(url, timeout=timeout).read() raw = browser.open_novisit(url, timeout=timeout).read()
@ -59,7 +59,6 @@ def get_details(browser, url, timeout): # {{{
# }}} # }}}
def to_metadata(browser, log, entry_, timeout): # {{{ def to_metadata(browser, log, entry_, timeout): # {{{
def get_text(extra, x): def get_text(extra, x):
try: try:
ans = x(extra) ans = x(extra)
@ -71,7 +70,6 @@ def to_metadata(browser, log, entry_, timeout): # {{{
log.exception('Programming error:') log.exception('Programming error:')
return None return None
id_url = entry_id(entry_)[0].text id_url = entry_id(entry_)[0].text
douban_id = id_url.split('/')[-1] douban_id = id_url.split('/')[-1]
title_ = ': '.join([x.text for x in title(entry_)]).strip() title_ = ': '.join([x.text for x in title(entry_)]).strip()
@ -92,9 +90,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
except: except:
log.exception('Failed to get additional details for', mi.title) log.exception('Failed to get additional details for', mi.title)
return mi return mi
mi.comments = get_text(extra, description) mi.comments = get_text(extra, description)
#mi.language = get_text(extra, language)
mi.publisher = get_text(extra, publisher) mi.publisher = get_text(extra, publisher)
# ISBN # ISBN
@ -108,7 +104,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
# Tags # Tags
try: try:
btags = [x.text for x in subject(extra) if x.text] btags = [x for x in booktag(extra) if x]
tags = [] tags = []
for t in btags: for t in btags:
atags = [y.strip() for y in t.split('/')] atags = [y.strip() for y in t.split('/')]
@ -120,7 +116,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
tags = [] tags = []
if tags: if tags:
mi.tags = [x.replace(',', ';') for x in tags] mi.tags = [x.replace(',', ';') for x in tags]
# pubdate # pubdate
pubdate = get_text(extra, date) pubdate = get_text(extra, date)
if pubdate: if pubdate:
@ -133,7 +129,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
# Ratings # Ratings
if rating(extra): if rating(extra):
try: try:
mi.rating = float(rating(extra).text) / 2.0 mi.rating = float(rating(extra)[0]) / 2.0
except: except:
log.exception('Failed to parse rating') log.exception('Failed to parse rating')
mi.rating = 0 mi.rating = 0
@ -141,10 +137,8 @@ def to_metadata(browser, log, entry_, timeout): # {{{
# Cover # Cover
mi.has_douban_cover = None mi.has_douban_cover = None
u = cover_url(extra) u = cover_url(extra)
print(u)
if u: if u:
u = u[0].replace('/spic/', '/lpic/'); u = u[0].replace('/spic/', '/lpic/');
print(u)
# If URL contains "book-default", the book doesn't have a cover # If URL contains "book-default", the book doesn't have a cover
if u.find('book-default') == -1: if u.find('book-default') == -1:
mi.has_douban_cover = u mi.has_douban_cover = u
@ -155,26 +149,24 @@ class Douban(Source):
name = 'Douban Books' name = 'Douban Books'
author = _('Li Fanxi') author = _('Li Fanxi')
version = (2, 0, 0)
description = _('Downloads metadata from Douban.com') description = _('Downloads metadata from Douban.com')
capabilities = frozenset(['identify', 'cover']) capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', touched_fields = frozenset(['title', 'authors', 'tags',
'comments', 'publisher', 'identifier:isbn', 'rating', 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
'identifier:douban']) # language currently disabled 'identifier:douban']) # language currently disabled
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
cached_cover_url_is_reliable = True cached_cover_url_is_reliable = True
DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s' DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
# GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
# DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])
def get_book_url(self, identifiers): # {{{ def get_book_url(self, identifiers): # {{{
db = identifiers.get('douban', None) db = identifiers.get('douban', None)
if db is not None: if db is not None:
return DOUBAN_ID_URL % db return ('douban', db, self.DOUBAN_BOOK_URL%db)
else: else:
return None return None
# }}} # }}}
@ -182,13 +174,18 @@ class Douban(Source):
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
SEARCH_URL = 'http://api.douban.com/book/subjects?' SEARCH_URL = 'http://api.douban.com/book/subjects?'
ISBN_URL = 'http://api.douban.com/book/subject/isbn/' ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
SUBJECT_URL = 'http://api.douban.com/book/subject/'
q = '' q = ''
t = None t = None
isbn = check_isbn(identifiers.get('isbn', None)) isbn = check_isbn(identifiers.get('isbn', None))
subject = identifiers.get('douban', None)
if isbn is not None: if isbn is not None:
q = isbn q = isbn
t = 'isbn' t = 'isbn'
elif subject is not None:
q = subject
t = 'subject'
elif title or authors: elif title or authors:
def build_term(prefix, parts): def build_term(prefix, parts):
return ' '.join(x for x in parts) return ' '.join(x for x in parts)
@ -209,6 +206,8 @@ class Douban(Source):
url = None url = None
if t == "isbn": if t == "isbn":
url = ISBN_URL + q url = ISBN_URL + q
elif t == 'subject':
url = SUBJECT_URL + q
else: else:
url = SEARCH_URL + urlencode({ url = SEARCH_URL + urlencode({
'q': q, 'q': q,
@ -314,14 +313,12 @@ class Douban(Source):
except Exception as e: except Exception as e:
log.exception('Failed to parse identify results') log.exception('Failed to parse identify results')
return as_unicode(e) return as_unicode(e)
if not title:
title = ""
if not entries and identifiers and title and authors and \ if not entries and identifiers and title and authors and \
not abort.is_set(): not abort.is_set():
return self.identify(log, result_queue, abort, title=title, return self.identify(log, result_queue, abort, title=title,
authors=authors, timeout=timeout) authors=authors, timeout=timeout)
# There is no point running these queries in threads as google # There is no point running these queries in threads as douban
# throttles requests returning 403 Forbidden errors # throttles requests returning 403 Forbidden errors
self.get_all_details(br, log, entries, abort, result_queue, timeout) self.get_all_details(br, log, entries, abort, result_queue, timeout)
@ -329,23 +326,23 @@ class Douban(Source):
# }}} # }}}
if __name__ == '__main__': # tests {{{ if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin, from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test) title_test, authors_test)
test_identify_plugin(GoogleBooks.name, test_identify_plugin(Douban.name,
[ [
( (
{'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby', {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
'authors':['Fitzgerald']}, 'authors':['刘慈欣']},
[title_test('The great gatsby', exact=True), [title_test('三体', exact=True),
authors_test(['Francis Scott Fitzgerald'])] authors_test(['刘慈欣'])]
), ),
( (
{'title': 'Flatland', 'authors':['Abbott']}, {'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
[title_test('Flatland', exact=False)] [title_test('Linux内核修炼之道', exact=False)]
), ),
]) ])
# }}} # }}}