mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix & update Douban API
This commit is contained in:
parent
6bdbc6f07d
commit
78f858a875
@ -3,7 +3,7 @@
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
@ -14,27 +14,26 @@ try:
|
||||
except ImportError:
|
||||
from Queue import Empty, Queue
|
||||
|
||||
|
||||
from calibre.ebooks.metadata import check_isbn
|
||||
from calibre.ebooks.metadata.sources.base import Option, Source
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre import as_unicode
|
||||
|
||||
NAMESPACES = {
|
||||
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
|
||||
'atom' : 'http://www.w3.org/2005/Atom',
|
||||
'db': 'https://www.douban.com/xmlns/',
|
||||
'gd': 'http://schemas.google.com/g/2005'
|
||||
}
|
||||
'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
|
||||
'atom': 'http://www.w3.org/2005/Atom',
|
||||
'db': 'https://www.douban.com/xmlns/',
|
||||
'gd': 'http://schemas.google.com/g/2005'
|
||||
}
|
||||
|
||||
|
||||
def get_details(browser, url, timeout): # {{{
|
||||
try:
|
||||
if Douban.DOUBAN_API_KEY and Douban.DOUBAN_API_KEY != '':
|
||||
if Douban.DOUBAN_API_KEY:
|
||||
url = url + "?apikey=" + Douban.DOUBAN_API_KEY
|
||||
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||
except Exception as e:
|
||||
gc = getattr(e, 'getcode', lambda : -1)
|
||||
gc = getattr(e, 'getcode', lambda: -1)
|
||||
if gc() != 403:
|
||||
raise
|
||||
# Douban is throttling us, wait a little
|
||||
@ -42,97 +41,73 @@ def get_details(browser, url, timeout): # {{{
|
||||
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||
|
||||
return raw
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
class Douban(Source):
|
||||
|
||||
name = 'Douban Books'
|
||||
author = 'Li Fanxi'
|
||||
version = (2, 1, 2)
|
||||
author = 'Li Fanxi, xcffl'
|
||||
version = (3, 0, 0)
|
||||
minimum_calibre_version = (2, 80, 0)
|
||||
|
||||
description = _('Downloads metadata and covers from Douban.com. '
|
||||
'Useful only for Chinese language books.')
|
||||
description = _(
|
||||
'Downloads metadata and covers from Douban.com. '
|
||||
'Useful only for Chinese language books.'
|
||||
)
|
||||
|
||||
capabilities = frozenset(['identify', 'cover'])
|
||||
touched_fields = frozenset(['title', 'authors', 'tags',
|
||||
'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
|
||||
'identifier:douban']) # language currently disabled
|
||||
touched_fields = frozenset([
|
||||
'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
|
||||
'identifier:isbn', 'rating', 'identifier:douban'
|
||||
]) # language currently disabled
|
||||
supports_gzip_transfer_encoding = True
|
||||
cached_cover_url_is_reliable = True
|
||||
|
||||
DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
|
||||
DOUBAN_API_KEY = '0df993c66c0c636e29ecbb5344252a4a'
|
||||
DOUBAN_API_URL = 'https://api.douban.com/v2/book/search'
|
||||
DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'
|
||||
|
||||
options = (
|
||||
Option('include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'),
|
||||
_('Whether to append subtitle in the book title.')),
|
||||
Option(
|
||||
'include_subtitle_in_title', 'bool', True,
|
||||
_('Include subtitle in book title:'),
|
||||
_('Whether to append subtitle in the book title.')
|
||||
),
|
||||
)
|
||||
|
||||
def to_metadata(self, browser, log, entry_, timeout): # {{{
|
||||
from lxml import etree
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
|
||||
XPath = partial(etree.XPath, namespaces=NAMESPACES)
|
||||
entry = XPath('//atom:entry')
|
||||
entry_id = XPath('descendant::atom:id')
|
||||
title = XPath('descendant::atom:title')
|
||||
description = XPath('descendant::atom:summary')
|
||||
subtitle = XPath("descendant::db:attribute[@name='subtitle']")
|
||||
publisher = XPath("descendant::db:attribute[@name='publisher']")
|
||||
isbn = XPath("descendant::db:attribute[@name='isbn13']")
|
||||
date = XPath("descendant::db:attribute[@name='pubdate']")
|
||||
creator = XPath("descendant::db:attribute[@name='author']")
|
||||
booktag = XPath("descendant::db:tag/attribute::name")
|
||||
rating = XPath("descendant::gd:rating/attribute::average")
|
||||
cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
|
||||
douban_id = entry_.get('id')
|
||||
title = entry_.get('title')
|
||||
description = entry_.get('summary')
|
||||
# subtitle = entry_.get('subtitle') # TODO: standard metadata doesn't have this field
|
||||
publisher = entry_.get('publisher')
|
||||
isbns = entry_.get('isbn13') # ISBN-10 is obsolete, use ISBN-13
|
||||
pubdate = entry_.get('pubdate')
|
||||
authors = entry_.get('author')
|
||||
book_tags = entry_.get('tags')
|
||||
rating = entry_.get('rating')
|
||||
cover_url = entry_.get('image')
|
||||
series = entry_.get('series')
|
||||
|
||||
def get_text(extra, x):
|
||||
try:
|
||||
ans = x(extra)
|
||||
if ans:
|
||||
ans = ans[0].text
|
||||
if ans and ans.strip():
|
||||
return ans.strip()
|
||||
except:
|
||||
log.exception('Programming error:')
|
||||
return None
|
||||
|
||||
id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
|
||||
douban_id = id_url.split('/')[-1]
|
||||
title_ = ': '.join([x.text for x in title(entry_)]).strip()
|
||||
subtitle = ': '.join([x.text for x in subtitle(entry_)]).strip()
|
||||
if self.prefs['include_subtitle_in_title'] and len(subtitle) > 0:
|
||||
title_ = title_ + ' - ' + subtitle
|
||||
authors = [x.text.strip() for x in creator(entry_) if x.text]
|
||||
if not authors:
|
||||
authors = [_('Unknown')]
|
||||
if not id_url or not title:
|
||||
if not douban_id or not title:
|
||||
# Silently discard this entry
|
||||
return None
|
||||
|
||||
mi = Metadata(title_, authors)
|
||||
mi.identifiers = {'douban':douban_id}
|
||||
try:
|
||||
log.info(id_url)
|
||||
raw = get_details(browser, id_url, timeout)
|
||||
feed = etree.fromstring(
|
||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||
)
|
||||
extra = entry(feed)[0]
|
||||
except:
|
||||
log.exception('Failed to get additional details for', mi.title)
|
||||
return mi
|
||||
mi.comments = get_text(extra, description)
|
||||
mi.publisher = get_text(extra, publisher)
|
||||
mi = Metadata(title, authors)
|
||||
mi.identifiers = {'douban': douban_id}
|
||||
mi.publisher = publisher
|
||||
mi.comments = description
|
||||
# mi.subtitle = subtitle
|
||||
|
||||
# ISBN
|
||||
isbns = []
|
||||
for x in [t.text for t in isbn(extra)]:
|
||||
for x in isbns:
|
||||
if check_isbn(x):
|
||||
isbns.append(x)
|
||||
if isbns:
|
||||
@ -140,52 +115,45 @@ class Douban(Source):
|
||||
mi.all_isbns = isbns
|
||||
|
||||
# Tags
|
||||
try:
|
||||
btags = [x for x in booktag(extra) if x]
|
||||
tags = []
|
||||
for t in btags:
|
||||
atags = [y.strip() for y in t.split('/')]
|
||||
for tag in atags:
|
||||
if tag not in tags:
|
||||
tags.append(tag)
|
||||
except:
|
||||
log.exception('Failed to parse tags:')
|
||||
tags = []
|
||||
if tags:
|
||||
mi.tags = [x.replace(',', ';') for x in tags]
|
||||
mi.tags = [tag['name'] for tag in book_tags]
|
||||
|
||||
# pubdate
|
||||
pubdate = get_text(extra, date)
|
||||
if pubdate:
|
||||
try:
|
||||
default = utcnow().replace(day=15)
|
||||
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
|
||||
except:
|
||||
log.error('Failed to parse pubdate %r'%pubdate)
|
||||
log.error('Failed to parse pubdate %r' % pubdate)
|
||||
|
||||
# Ratings
|
||||
if rating(extra):
|
||||
if rating:
|
||||
try:
|
||||
mi.rating = float(rating(extra)[0]) / 2.0
|
||||
mi.rating = float(rating['average']) / 2.0
|
||||
except:
|
||||
log.exception('Failed to parse rating')
|
||||
mi.rating = 0
|
||||
|
||||
# Cover
|
||||
mi.has_douban_cover = None
|
||||
u = cover_url(extra)
|
||||
u = cover_url
|
||||
if u:
|
||||
u = u[0].replace('/spic/', '/lpic/')
|
||||
# If URL contains "book-default", the book doesn't have a cover
|
||||
if u.find('book-default') == -1:
|
||||
mi.has_douban_cover = u
|
||||
|
||||
# Series
|
||||
if series:
|
||||
mi.series = series['title']
|
||||
|
||||
return mi
|
||||
|
||||
# }}}
|
||||
|
||||
def get_book_url(self, identifiers): # {{{
|
||||
db = identifiers.get('douban', None)
|
||||
if db is not None:
|
||||
return ('douban', db, self.DOUBAN_BOOK_URL%db)
|
||||
return ('douban', db, self.DOUBAN_BOOK_URL % db)
|
||||
|
||||
# }}}
|
||||
|
||||
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
||||
@ -193,9 +161,9 @@ class Douban(Source):
|
||||
from urllib.parse import urlencode
|
||||
except ImportError:
|
||||
from urllib import urlencode
|
||||
SEARCH_URL = 'https://api.douban.com/book/subjects?'
|
||||
ISBN_URL = 'https://api.douban.com/book/subject/isbn/'
|
||||
SUBJECT_URL = 'https://api.douban.com/book/subject/'
|
||||
SEARCH_URL = 'https://api.douban.com/v2/book/search?count=10&'
|
||||
ISBN_URL = 'https://api.douban.com/v2/book/isbn/'
|
||||
SUBJECT_URL = 'https://api.douban.com/v2/book/'
|
||||
|
||||
q = ''
|
||||
t = None
|
||||
@ -208,16 +176,18 @@ class Douban(Source):
|
||||
q = subject
|
||||
t = 'subject'
|
||||
elif title or authors:
|
||||
|
||||
def build_term(prefix, parts):
|
||||
return ' '.join(x for x in parts)
|
||||
|
||||
title_tokens = list(self.get_title_tokens(title))
|
||||
if title_tokens:
|
||||
q += build_term('title', title_tokens)
|
||||
author_tokens = list(self.get_author_tokens(authors,
|
||||
only_first_author=True))
|
||||
author_tokens = list(
|
||||
self.get_author_tokens(authors, only_first_author=True)
|
||||
)
|
||||
if author_tokens:
|
||||
q += ((' ' if q != '' else '') +
|
||||
build_term('author', author_tokens))
|
||||
q += ((' ' if q != '' else '') + build_term('author', author_tokens))
|
||||
t = 'search'
|
||||
q = q.strip()
|
||||
if isinstance(q, type(u'')):
|
||||
@ -231,24 +201,40 @@ class Douban(Source):
|
||||
url = SUBJECT_URL + q
|
||||
else:
|
||||
url = SEARCH_URL + urlencode({
|
||||
'q': q,
|
||||
})
|
||||
'q': q,
|
||||
})
|
||||
if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
|
||||
if t == "isbn" or t == "subject":
|
||||
url = url + "?apikey=" + self.DOUBAN_API_KEY
|
||||
else:
|
||||
url = url + "&apikey=" + self.DOUBAN_API_KEY
|
||||
return url
|
||||
|
||||
# }}}
|
||||
|
||||
def download_cover(self, log, result_queue, abort, # {{{
|
||||
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
|
||||
def download_cover(
|
||||
self,
|
||||
log,
|
||||
result_queue,
|
||||
abort, # {{{
|
||||
title=None,
|
||||
authors=None,
|
||||
identifiers={},
|
||||
timeout=30,
|
||||
get_best_cover=False
|
||||
):
|
||||
cached_url = self.get_cached_cover_url(identifiers)
|
||||
if cached_url is None:
|
||||
log.info('No cached cover found, running identify')
|
||||
rq = Queue()
|
||||
self.identify(log, rq, abort, title=title, authors=authors,
|
||||
identifiers=identifiers)
|
||||
self.identify(
|
||||
log,
|
||||
rq,
|
||||
abort,
|
||||
title=title,
|
||||
authors=authors,
|
||||
identifiers=identifiers
|
||||
)
|
||||
if abort.is_set():
|
||||
return
|
||||
results = []
|
||||
@ -257,8 +243,11 @@ class Douban(Source):
|
||||
results.append(rq.get_nowait())
|
||||
except Empty:
|
||||
break
|
||||
results.sort(key=self.identify_results_keygen(
|
||||
title=title, authors=authors, identifiers=identifiers))
|
||||
results.sort(
|
||||
key=self.identify_results_keygen(
|
||||
title=title, authors=authors, identifiers=identifiers
|
||||
)
|
||||
)
|
||||
for mi in results:
|
||||
cached_url = self.get_cached_cover_url(mi.identifiers)
|
||||
if cached_url is not None:
|
||||
@ -291,11 +280,18 @@ class Douban(Source):
|
||||
url = self.cached_identifier_to_cover_url(db)
|
||||
|
||||
return url
|
||||
|
||||
# }}}
|
||||
|
||||
def get_all_details(self, br, log, entries, abort, # {{{
|
||||
result_queue, timeout):
|
||||
from lxml import etree
|
||||
def get_all_details(
|
||||
self,
|
||||
br,
|
||||
log,
|
||||
entries,
|
||||
abort, # {{{
|
||||
result_queue,
|
||||
timeout
|
||||
):
|
||||
for relevance, i in enumerate(entries):
|
||||
try:
|
||||
ans = self.to_metadata(br, log, i, timeout)
|
||||
@ -305,29 +301,31 @@ class Douban(Source):
|
||||
for isbn in getattr(ans, 'all_isbns', []):
|
||||
self.cache_isbn_to_identifier(isbn, db)
|
||||
if ans.has_douban_cover:
|
||||
self.cache_identifier_to_cover_url(db,
|
||||
ans.has_douban_cover)
|
||||
self.cache_identifier_to_cover_url(db, ans.has_douban_cover)
|
||||
self.clean_downloaded_metadata(ans)
|
||||
result_queue.put(ans)
|
||||
except:
|
||||
log.exception(
|
||||
'Failed to get metadata for identify entry:',
|
||||
etree.tostring(i))
|
||||
log.exception('Failed to get metadata for identify entry:', i)
|
||||
if abort.is_set():
|
||||
break
|
||||
|
||||
# }}}
|
||||
|
||||
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
|
||||
identifiers={}, timeout=30):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
def identify(
|
||||
self,
|
||||
log,
|
||||
result_queue,
|
||||
abort,
|
||||
title=None,
|
||||
authors=None, # {{{
|
||||
identifiers={},
|
||||
timeout=30
|
||||
):
|
||||
import json
|
||||
|
||||
XPath = partial(etree.XPath, namespaces=NAMESPACES)
|
||||
entry = XPath('//atom:entry')
|
||||
|
||||
query = self.create_query(log, title=title, authors=authors,
|
||||
identifiers=identifiers)
|
||||
query = self.create_query(
|
||||
log, title=title, authors=authors, identifiers=identifiers
|
||||
)
|
||||
if not query:
|
||||
log.error('Insufficient metadata to construct query')
|
||||
return
|
||||
@ -335,45 +333,51 @@ class Douban(Source):
|
||||
try:
|
||||
raw = br.open_novisit(query, timeout=timeout).read()
|
||||
except Exception as e:
|
||||
log.exception('Failed to make identify query: %r'%query)
|
||||
log.exception('Failed to make identify query: %r' % query)
|
||||
return as_unicode(e)
|
||||
try:
|
||||
parser = etree.XMLParser(recover=True, no_network=True)
|
||||
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
||||
strip_encoding_pats=True)[0], parser=parser)
|
||||
entries = entry(feed)
|
||||
entries = json.loads(raw)['books']
|
||||
except Exception as e:
|
||||
log.exception('Failed to parse identify results')
|
||||
return as_unicode(e)
|
||||
if not entries and identifiers and title and authors and \
|
||||
not abort.is_set():
|
||||
return self.identify(log, result_queue, abort, title=title,
|
||||
authors=authors, timeout=timeout)
|
||||
|
||||
return self.identify(
|
||||
log,
|
||||
result_queue,
|
||||
abort,
|
||||
title=title,
|
||||
authors=authors,
|
||||
timeout=timeout
|
||||
)
|
||||
# There is no point running these queries in threads as douban
|
||||
# throttles requests returning 403 Forbidden errors
|
||||
self.get_all_details(br, log, entries, abort, result_queue, timeout)
|
||||
|
||||
return None
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
if __name__ == '__main__': # tests {{{
|
||||
# To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
|
||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||
title_test, authors_test)
|
||||
test_identify_plugin(Douban.name,
|
||||
[
|
||||
(
|
||||
{'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
|
||||
'authors':['刘慈欣']},
|
||||
[title_test('三体', exact=True),
|
||||
authors_test(['刘慈欣'])]
|
||||
),
|
||||
|
||||
(
|
||||
{'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
|
||||
[title_test('Linux内核修炼之道', exact=False)]
|
||||
),
|
||||
])
|
||||
from calibre.ebooks.metadata.sources.test import (
|
||||
test_identify_plugin, title_test, authors_test
|
||||
)
|
||||
test_identify_plugin(
|
||||
Douban.name, [
|
||||
({
|
||||
'identifiers': {
|
||||
'isbn': '9787536692930'
|
||||
},
|
||||
'title': '三体',
|
||||
'authors': ['刘慈欣']
|
||||
}, [title_test('三体', exact=True),
|
||||
authors_test(['刘慈欣'])]),
|
||||
({
|
||||
'title': 'Linux内核修炼之道',
|
||||
'authors': ['任桥伟']
|
||||
}, [title_test('Linux内核修炼之道', exact=False)]),
|
||||
]
|
||||
)
|
||||
# }}}
|
||||
|
Loading…
x
Reference in New Issue
Block a user