From d1a6113a8a8ae09c6dba43e7ac093e554f29e48b Mon Sep 17 00:00:00 2001
From: Li Fanxi
Date: Wed, 5 Jun 2019 23:23:01 +0800
Subject: [PATCH] Support subtitle in Douban metadata plugin

---
 src/calibre/ebooks/metadata/sources/douban.py | 228 +++++++++---------
 1 file changed, 118 insertions(+), 110 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/douban.py b/src/calibre/ebooks/metadata/sources/douban.py
index b8cd22a67a..ecf0f3c059
--- a/src/calibre/ebooks/metadata/sources/douban.py
+++ b/src/calibre/ebooks/metadata/sources/douban.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
 from __future__ import absolute_import, division, print_function, unicode_literals

 __license__ = 'GPL v3'
@@ -15,7 +16,7 @@ except ImportError:

 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import Source
+from calibre.ebooks.metadata.sources.base import Option, Source
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre import as_unicode

@@ -44,112 +45,6 @@ def get_details(browser, url, timeout): # {{{
 # }}}


-def to_metadata(browser, log, entry_, timeout): # {{{
-    from lxml import etree
-    from calibre.ebooks.chardet import xml_to_unicode
-    from calibre.utils.date import parse_date, utcnow
-    from calibre.utils.cleantext import clean_ascii_chars
-
-    XPath = partial(etree.XPath, namespaces=NAMESPACES)
-    entry = XPath('//atom:entry')
-    entry_id = XPath('descendant::atom:id')
-    title = XPath('descendant::atom:title')
-    description = XPath('descendant::atom:summary')
-    publisher = XPath("descendant::db:attribute[@name='publisher']")
-    isbn = XPath("descendant::db:attribute[@name='isbn13']")
-    date = XPath("descendant::db:attribute[@name='pubdate']")
-    creator = XPath("descendant::db:attribute[@name='author']")
-    booktag = XPath("descendant::db:tag/attribute::name")
-    rating = XPath("descendant::gd:rating/attribute::average")
-    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
-
-    def get_text(extra, x):
-        try:
-            ans = x(extra)
-            if ans:
-                ans = ans[0].text
-                if ans and ans.strip():
-                    return ans.strip()
-        except:
-            log.exception('Programming error:')
-        return None
-
-    id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
-    douban_id = id_url.split('/')[-1]
-    title_ = ': '.join([x.text for x in title(entry_)]).strip()
-    authors = [x.text.strip() for x in creator(entry_) if x.text]
-    if not authors:
-        authors = [_('Unknown')]
-    if not id_url or not title:
-        # Silently discard this entry
-        return None
-
-    mi = Metadata(title_, authors)
-    mi.identifiers = {'douban':douban_id}
-    try:
-        raw = get_details(browser, id_url, timeout)
-        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
-            strip_encoding_pats=True)[0])
-        extra = entry(feed)[0]
-    except:
-        log.exception('Failed to get additional details for', mi.title)
-        return mi
-    mi.comments = get_text(extra, description)
-    mi.publisher = get_text(extra, publisher)
-
-    # ISBN
-    isbns = []
-    for x in [t.text for t in isbn(extra)]:
-        if check_isbn(x):
-            isbns.append(x)
-    if isbns:
-        mi.isbn = sorted(isbns, key=len)[-1]
-    mi.all_isbns = isbns
-
-    # Tags
-    try:
-        btags = [x for x in booktag(extra) if x]
-        tags = []
-        for t in btags:
-            atags = [y.strip() for y in t.split('/')]
-            for tag in atags:
-                if tag not in tags:
-                    tags.append(tag)
-    except:
-        log.exception('Failed to parse tags:')
-        tags = []
-    if tags:
-        mi.tags = [x.replace(',', ';') for x in tags]
-
-    # pubdate
-    pubdate = get_text(extra, date)
-    if pubdate:
-        try:
-            default = utcnow().replace(day=15)
-            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
-        except:
-            log.error('Failed to parse pubdate %r'%pubdate)
-
-    # Ratings
-    if rating(extra):
-        try:
-            mi.rating = float(rating(extra)[0]) / 2.0
-        except:
-            log.exception('Failed to parse rating')
-            mi.rating = 0
-
-    # Cover
-    mi.has_douban_cover = None
-    u = cover_url(extra)
-    if u:
-        u = u[0].replace('/spic/', '/lpic/')
-        # If URL contains "book-default", the book doesn't have a cover
-        if u.find('book-default') == -1:
-            mi.has_douban_cover = u
-    return mi
-# }}}
-
-
 class Douban(Source):

     name = 'Douban Books'
@@ -170,6 +65,121 @@ class Douban(Source):
     DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
     DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'

+    options = (
+        Option('include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'),
+               _('Whether to append the subtitle to the book title.')),
+    )
+
+    def to_metadata(self, browser, log, entry_, timeout): # {{{
+        from lxml import etree
+        from calibre.ebooks.chardet import xml_to_unicode
+        from calibre.utils.date import parse_date, utcnow
+        from calibre.utils.cleantext import clean_ascii_chars
+
+        XPath = partial(etree.XPath, namespaces=NAMESPACES)
+        entry = XPath('//atom:entry')
+        entry_id = XPath('descendant::atom:id')
+        title = XPath('descendant::atom:title')
+        description = XPath('descendant::atom:summary')
+        subtitle = XPath("descendant::db:attribute[@name='subtitle']")
+        publisher = XPath("descendant::db:attribute[@name='publisher']")
+        isbn = XPath("descendant::db:attribute[@name='isbn13']")
+        date = XPath("descendant::db:attribute[@name='pubdate']")
+        creator = XPath("descendant::db:attribute[@name='author']")
+        booktag = XPath("descendant::db:tag/attribute::name")
+        rating = XPath("descendant::gd:rating/attribute::average")
+        cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+
+        def get_text(extra, x):
+            try:
+                ans = x(extra)
+                if ans:
+                    ans = ans[0].text
+                    if ans and ans.strip():
+                        return ans.strip()
+            except:
+                log.exception('Programming error:')
+            return None
+
+        id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
+        douban_id = id_url.split('/')[-1]
+        title_ = ': '.join([x.text for x in title(entry_)]).strip()
+        subtitle = ': '.join([x.text for x in subtitle(entry_)]).strip()
+        if self.prefs['include_subtitle_in_title'] and len(subtitle) > 0:
+            title_ = title_ + ' - ' + subtitle
+        authors = [x.text.strip() for x in creator(entry_) if x.text]
+        if not authors:
+            authors = [_('Unknown')]
+        if not id_url or not title:
+            # Silently discard this entry
+            return None
+
+        mi = Metadata(title_, authors)
+        mi.identifiers = {'douban':douban_id}
+        try:
+            log.info(id_url)
+            raw = get_details(browser, id_url, timeout)
+            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
+                strip_encoding_pats=True)[0])
+            extra = entry(feed)[0]
+        except:
+            log.exception('Failed to get additional details for', mi.title)
+            return mi
+        mi.comments = get_text(extra, description)
+        mi.publisher = get_text(extra, publisher)
+
+        # ISBN
+        isbns = []
+        for x in [t.text for t in isbn(extra)]:
+            if check_isbn(x):
+                isbns.append(x)
+        if isbns:
+            mi.isbn = sorted(isbns, key=len)[-1]
+        mi.all_isbns = isbns
+
+        # Tags
+        try:
+            btags = [x for x in booktag(extra) if x]
+            tags = []
+            for t in btags:
+                atags = [y.strip() for y in t.split('/')]
+                for tag in atags:
+                    if tag not in tags:
+                        tags.append(tag)
+        except:
+            log.exception('Failed to parse tags:')
+            tags = []
+        if tags:
+            mi.tags = [x.replace(',', ';') for x in tags]
+
+        # pubdate
+        pubdate = get_text(extra, date)
+        if pubdate:
+            try:
+                default = utcnow().replace(day=15)
+                mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
+            except:
+                log.error('Failed to parse pubdate %r'%pubdate)
+
+        # Ratings
+        if rating(extra):
+            try:
+                mi.rating = float(rating(extra)[0]) / 2.0
+            except:
+                log.exception('Failed to parse rating')
+                mi.rating = 0
+
+        # Cover
+        mi.has_douban_cover = None
+        u = cover_url(extra)
+        if u:
+            u = u[0].replace('/spic/', '/lpic/')
+            # If URL contains "book-default", the book doesn't have a cover
+            if u.find('book-default') == -1:
+                mi.has_douban_cover = u
+        return mi
+    # }}}
+
     def get_book_url(self, identifiers): # {{{
         db = identifiers.get('douban', None)
         if db is not None:
@@ -286,7 +296,7 @@ class Douban(Source):
         from lxml import etree
         for relevance, i in enumerate(entries):
             try:
-                ans = to_metadata(br, log, i, timeout)
+                ans = self.to_metadata(br, log, i, timeout)
                 if isinstance(ans, Metadata):
                     ans.source_relevance = relevance
                     db = ans.identifiers['douban']
@@ -352,8 +362,6 @@ if __name__ == '__main__': # tests {{{
             title_test, authors_test)
     test_identify_plugin(Douban.name, [
-
-
             (
                 {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
                     'authors':['刘慈欣']},