Fix & update Douban API

This commit is contained in:
xcffl 2020-03-02 22:08:38 +08:00
parent 6bdbc6f07d
commit 78f858a875
No known key found for this signature in database
GPG Key ID: C64681FA6C2FA680

View File

@ -3,7 +3,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
@ -14,27 +14,26 @@ try:
except ImportError: except ImportError:
from Queue import Empty, Queue from Queue import Empty, Queue
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Option, Source from calibre.ebooks.metadata.sources.base import Option, Source
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre import as_unicode from calibre import as_unicode
NAMESPACES = { NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', 'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
'atom' : 'http://www.w3.org/2005/Atom', 'atom': 'http://www.w3.org/2005/Atom',
'db': 'https://www.douban.com/xmlns/', 'db': 'https://www.douban.com/xmlns/',
'gd': 'http://schemas.google.com/g/2005' 'gd': 'http://schemas.google.com/g/2005'
} }
def get_details(browser, url, timeout): # {{{ def get_details(browser, url, timeout): # {{{
try: try:
if Douban.DOUBAN_API_KEY and Douban.DOUBAN_API_KEY != '': if Douban.DOUBAN_API_KEY:
url = url + "?apikey=" + Douban.DOUBAN_API_KEY url = url + "?apikey=" + Douban.DOUBAN_API_KEY
raw = browser.open_novisit(url, timeout=timeout).read() raw = browser.open_novisit(url, timeout=timeout).read()
except Exception as e: except Exception as e:
gc = getattr(e, 'getcode', lambda : -1) gc = getattr(e, 'getcode', lambda: -1)
if gc() != 403: if gc() != 403:
raise raise
# Douban is throttling us, wait a little # Douban is throttling us, wait a little
@ -42,97 +41,73 @@ def get_details(browser, url, timeout): # {{{
raw = browser.open_novisit(url, timeout=timeout).read() raw = browser.open_novisit(url, timeout=timeout).read()
return raw return raw
# }}} # }}}
class Douban(Source): class Douban(Source):
name = 'Douban Books' name = 'Douban Books'
author = 'Li Fanxi' author = 'Li Fanxi, xcffl'
version = (2, 1, 2) version = (3, 0, 0)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
description = _('Downloads metadata and covers from Douban.com. ' description = _(
'Useful only for Chinese language books.') 'Downloads metadata and covers from Douban.com. '
'Useful only for Chinese language books.'
)
capabilities = frozenset(['identify', 'cover']) capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', touched_fields = frozenset([
'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
'identifier:douban']) # language currently disabled 'identifier:isbn', 'rating', 'identifier:douban'
]) # language currently disabled
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
cached_cover_url_is_reliable = True cached_cover_url_is_reliable = True
DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' DOUBAN_API_KEY = '0df993c66c0c636e29ecbb5344252a4a'
DOUBAN_API_URL = 'https://api.douban.com/v2/book/search'
DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/' DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'
options = ( options = (
Option('include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'), Option(
_('Whether to append subtitle in the book title.')), 'include_subtitle_in_title', 'bool', True,
_('Include subtitle in book title:'),
_('Whether to append subtitle in the book title.')
),
) )
def to_metadata(self, browser, log, entry_, timeout): # {{{ def to_metadata(self, browser, log, entry_, timeout): # {{{
from lxml import etree
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars
XPath = partial(etree.XPath, namespaces=NAMESPACES) douban_id = entry_.get('id')
entry = XPath('//atom:entry') title = entry_.get('title')
entry_id = XPath('descendant::atom:id') description = entry_.get('summary')
title = XPath('descendant::atom:title') # subtitle = entry_.get('subtitle') # TODO: std metadata doesn't have this field
description = XPath('descendant::atom:summary') publisher = entry_.get('publisher')
subtitle = XPath("descendant::db:attribute[@name='subtitle']") isbns = entry_.get('isbn13') # ISBN10 is obsolete, use ISBN13
publisher = XPath("descendant::db:attribute[@name='publisher']") pubdate = entry_.get('pubdate')
isbn = XPath("descendant::db:attribute[@name='isbn13']") authors = entry_.get('author')
date = XPath("descendant::db:attribute[@name='pubdate']") book_tags = entry_.get('tags')
creator = XPath("descendant::db:attribute[@name='author']") rating = entry_.get('rating')
booktag = XPath("descendant::db:tag/attribute::name") cover_url = entry_.get('image')
rating = XPath("descendant::gd:rating/attribute::average") series = entry_.get('series')
cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
def get_text(extra, x):
try:
ans = x(extra)
if ans:
ans = ans[0].text
if ans and ans.strip():
return ans.strip()
except:
log.exception('Programming error:')
return None
id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
douban_id = id_url.split('/')[-1]
title_ = ': '.join([x.text for x in title(entry_)]).strip()
subtitle = ': '.join([x.text for x in subtitle(entry_)]).strip()
if self.prefs['include_subtitle_in_title'] and len(subtitle) > 0:
title_ = title_ + ' - ' + subtitle
authors = [x.text.strip() for x in creator(entry_) if x.text]
if not authors: if not authors:
authors = [_('Unknown')] authors = [_('Unknown')]
if not id_url or not title: if not douban_id or not title:
# Silently discard this entry # Silently discard this entry
return None return None
mi = Metadata(title_, authors) mi = Metadata(title, authors)
mi.identifiers = {'douban':douban_id} mi.identifiers = {'douban': douban_id}
try: mi.publisher = publisher
log.info(id_url) mi.comments = description
raw = get_details(browser, id_url, timeout) # mi.subtitle = subtitle
feed = etree.fromstring(
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
)
extra = entry(feed)[0]
except:
log.exception('Failed to get additional details for', mi.title)
return mi
mi.comments = get_text(extra, description)
mi.publisher = get_text(extra, publisher)
# ISBN # ISBN
isbns = [] for x in isbns:
for x in [t.text for t in isbn(extra)]:
if check_isbn(x): if check_isbn(x):
isbns.append(x) isbns.append(x)
if isbns: if isbns:
@ -140,52 +115,45 @@ class Douban(Source):
mi.all_isbns = isbns mi.all_isbns = isbns
# Tags # Tags
try: mi.tags = [tag['name'] for tag in book_tags]
btags = [x for x in booktag(extra) if x]
tags = []
for t in btags:
atags = [y.strip() for y in t.split('/')]
for tag in atags:
if tag not in tags:
tags.append(tag)
except:
log.exception('Failed to parse tags:')
tags = []
if tags:
mi.tags = [x.replace(',', ';') for x in tags]
# pubdate # pubdate
pubdate = get_text(extra, date)
if pubdate: if pubdate:
try: try:
default = utcnow().replace(day=15) default = utcnow().replace(day=15)
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
except: except:
log.error('Failed to parse pubdate %r'%pubdate) log.error('Failed to parse pubdate %r' % pubdate)
# Ratings # Ratings
if rating(extra): if rating:
try: try:
mi.rating = float(rating(extra)[0]) / 2.0 mi.rating = float(rating['average']) / 2.0
except: except:
log.exception('Failed to parse rating') log.exception('Failed to parse rating')
mi.rating = 0 mi.rating = 0
# Cover # Cover
mi.has_douban_cover = None mi.has_douban_cover = None
u = cover_url(extra) u = cover_url
if u: if u:
u = u[0].replace('/spic/', '/lpic/')
# If URL contains "book-default", the book doesn't have a cover # If URL contains "book-default", the book doesn't have a cover
if u.find('book-default') == -1: if u.find('book-default') == -1:
mi.has_douban_cover = u mi.has_douban_cover = u
# Series
if series:
mi.series = series['title']
return mi return mi
# }}} # }}}
def get_book_url(self, identifiers):  # {{{
    """Map a calibre identifiers dict to a Douban book page.

    Returns the ('douban', id, url) triple expected by calibre's
    metadata-source framework, or None when no douban id is present.
    """
    douban_id = identifiers.get('douban', None)
    if douban_id is None:
        return None
    return ('douban', douban_id, self.DOUBAN_BOOK_URL % douban_id)
# }}}
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
@ -193,9 +161,9 @@ class Douban(Source):
from urllib.parse import urlencode from urllib.parse import urlencode
except ImportError: except ImportError:
from urllib import urlencode from urllib import urlencode
SEARCH_URL = 'https://api.douban.com/book/subjects?' SEARCH_URL = 'https://api.douban.com/v2/book/search?count=10&'
ISBN_URL = 'https://api.douban.com/book/subject/isbn/' ISBN_URL = 'https://api.douban.com/v2/book/isbn/'
SUBJECT_URL = 'https://api.douban.com/book/subject/' SUBJECT_URL = 'https://api.douban.com/v2/book/'
q = '' q = ''
t = None t = None
@ -208,16 +176,18 @@ class Douban(Source):
q = subject q = subject
t = 'subject' t = 'subject'
elif title or authors: elif title or authors:
def build_term(prefix, parts): def build_term(prefix, parts):
return ' '.join(x for x in parts) return ' '.join(x for x in parts)
title_tokens = list(self.get_title_tokens(title)) title_tokens = list(self.get_title_tokens(title))
if title_tokens: if title_tokens:
q += build_term('title', title_tokens) q += build_term('title', title_tokens)
author_tokens = list(self.get_author_tokens(authors, author_tokens = list(
only_first_author=True)) self.get_author_tokens(authors, only_first_author=True)
)
if author_tokens: if author_tokens:
q += ((' ' if q != '' else '') + q += ((' ' if q != '' else '') + build_term('author', author_tokens))
build_term('author', author_tokens))
t = 'search' t = 'search'
q = q.strip() q = q.strip()
if isinstance(q, type(u'')): if isinstance(q, type(u'')):
@ -231,24 +201,40 @@ class Douban(Source):
url = SUBJECT_URL + q url = SUBJECT_URL + q
else: else:
url = SEARCH_URL + urlencode({ url = SEARCH_URL + urlencode({
'q': q, 'q': q,
}) })
if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
if t == "isbn" or t == "subject": if t == "isbn" or t == "subject":
url = url + "?apikey=" + self.DOUBAN_API_KEY url = url + "?apikey=" + self.DOUBAN_API_KEY
else: else:
url = url + "&apikey=" + self.DOUBAN_API_KEY url = url + "&apikey=" + self.DOUBAN_API_KEY
return url return url
# }}} # }}}
def download_cover(self, log, result_queue, abort, # {{{ def download_cover(
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): self,
log,
result_queue,
abort, # {{{
title=None,
authors=None,
identifiers={},
timeout=30,
get_best_cover=False
):
cached_url = self.get_cached_cover_url(identifiers) cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None: if cached_url is None:
log.info('No cached cover found, running identify') log.info('No cached cover found, running identify')
rq = Queue() rq = Queue()
self.identify(log, rq, abort, title=title, authors=authors, self.identify(
identifiers=identifiers) log,
rq,
abort,
title=title,
authors=authors,
identifiers=identifiers
)
if abort.is_set(): if abort.is_set():
return return
results = [] results = []
@ -257,8 +243,11 @@ class Douban(Source):
results.append(rq.get_nowait()) results.append(rq.get_nowait())
except Empty: except Empty:
break break
results.sort(key=self.identify_results_keygen( results.sort(
title=title, authors=authors, identifiers=identifiers)) key=self.identify_results_keygen(
title=title, authors=authors, identifiers=identifiers
)
)
for mi in results: for mi in results:
cached_url = self.get_cached_cover_url(mi.identifiers) cached_url = self.get_cached_cover_url(mi.identifiers)
if cached_url is not None: if cached_url is not None:
@ -291,11 +280,18 @@ class Douban(Source):
url = self.cached_identifier_to_cover_url(db) url = self.cached_identifier_to_cover_url(db)
return url return url
# }}} # }}}
def get_all_details(self, br, log, entries, abort, # {{{ def get_all_details(
result_queue, timeout): self,
from lxml import etree br,
log,
entries,
abort, # {{{
result_queue,
timeout
):
for relevance, i in enumerate(entries): for relevance, i in enumerate(entries):
try: try:
ans = self.to_metadata(br, log, i, timeout) ans = self.to_metadata(br, log, i, timeout)
@ -305,29 +301,31 @@ class Douban(Source):
for isbn in getattr(ans, 'all_isbns', []): for isbn in getattr(ans, 'all_isbns', []):
self.cache_isbn_to_identifier(isbn, db) self.cache_isbn_to_identifier(isbn, db)
if ans.has_douban_cover: if ans.has_douban_cover:
self.cache_identifier_to_cover_url(db, self.cache_identifier_to_cover_url(db, ans.has_douban_cover)
ans.has_douban_cover)
self.clean_downloaded_metadata(ans) self.clean_downloaded_metadata(ans)
result_queue.put(ans) result_queue.put(ans)
except: except:
log.exception( log.exception('Failed to get metadata for identify entry:', i)
'Failed to get metadata for identify entry:',
etree.tostring(i))
if abort.is_set(): if abort.is_set():
break break
# }}} # }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ def identify(
identifiers={}, timeout=30): self,
from lxml import etree log,
from calibre.ebooks.chardet import xml_to_unicode result_queue,
from calibre.utils.cleantext import clean_ascii_chars abort,
title=None,
authors=None, # {{{
identifiers={},
timeout=30
):
import json
XPath = partial(etree.XPath, namespaces=NAMESPACES) query = self.create_query(
entry = XPath('//atom:entry') log, title=title, authors=authors, identifiers=identifiers
)
query = self.create_query(log, title=title, authors=authors,
identifiers=identifiers)
if not query: if not query:
log.error('Insufficient metadata to construct query') log.error('Insufficient metadata to construct query')
return return
@ -335,45 +333,51 @@ class Douban(Source):
try: try:
raw = br.open_novisit(query, timeout=timeout).read() raw = br.open_novisit(query, timeout=timeout).read()
except Exception as e: except Exception as e:
log.exception('Failed to make identify query: %r'%query) log.exception('Failed to make identify query: %r' % query)
return as_unicode(e) return as_unicode(e)
try: try:
parser = etree.XMLParser(recover=True, no_network=True) entries = json.loads(raw)['books']
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
strip_encoding_pats=True)[0], parser=parser)
entries = entry(feed)
except Exception as e: except Exception as e:
log.exception('Failed to parse identify results') log.exception('Failed to parse identify results')
return as_unicode(e) return as_unicode(e)
if not entries and identifiers and title and authors and \ if not entries and identifiers and title and authors and \
not abort.is_set(): not abort.is_set():
return self.identify(log, result_queue, abort, title=title, return self.identify(
authors=authors, timeout=timeout) log,
result_queue,
abort,
title=title,
authors=authors,
timeout=timeout
)
# There is no point running these queries in threads as douban # There is no point running these queries in threads as douban
# throttles requests returning 403 Forbidden errors # throttles requests returning 403 Forbidden errors
self.get_all_details(br, log, entries, abort, result_queue, timeout) self.get_all_details(br, log, entries, abort, result_queue, timeout)
return None return None
# }}} # }}}
if __name__ == '__main__':  # tests {{{
    # To run these tests use:
    #   calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
    from calibre.ebooks.metadata.sources.test import (
        test_identify_plugin, title_test, authors_test
    )

    test_identify_plugin(
        Douban.name, [
            ({
                'identifiers': {'isbn': '9787536692930'},
                'title': '三体',
                'authors': ['刘慈欣'],
            }, [
                title_test('三体', exact=True),
                authors_test(['刘慈欣']),
            ]),
            ({
                'title': 'Linux内核修炼之道',
                'authors': ['任桥伟'],
            }, [
                title_test('Linux内核修炼之道', exact=False),
            ]),
        ]
    )
# }}}