mirror of https://github.com/kovidgoyal/calibre.git
Add a douban.com plugin stub. Not working yet.
This commit is contained in:
parent 1acc3716b6
commit fabef627e3
@@ -628,8 +628,9 @@ if test_eight_code:
     from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
     from calibre.ebooks.metadata.sources.isbndb import ISBNDB
     from calibre.ebooks.metadata.sources.overdrive import OverDrive
+    from calibre.ebooks.metadata.sources.douban import Douban
 
-    plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
+    plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
 
     # }}}
 else:
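With Douban registered in the plugins list, calibre's identify machinery can drive it like any other metadata source. A minimal sketch of how it would be exercised once the stub works (not part of this commit; instantiating the plugin with None and using default_log are assumptions about the plugin API):

from threading import Event
from Queue import Queue

from calibre.utils.logging import default_log
from calibre.ebooks.metadata.sources.douban import Douban

plugin = Douban(None)  # assumption: Source plugins accept a plugin path
rq, abort = Queue(), Event()
plugin.identify(default_log, rq, abort, title='Flatland', authors=['Abbott'])
while not rq.empty():
    mi = rq.get_nowait()
    print(mi.title, mi.authors)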
361	src/calibre/ebooks/metadata/sources/douban.py	Normal file
@@ -0,0 +1,361 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'

import time, hashlib
from urllib import urlencode
from functools import partial
from Queue import Queue, Empty

from lxml import etree

from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars
from calibre import as_unicode

NAMESPACES = {
    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom' : 'http://www.w3.org/2005/Atom',
    'db' : 'http://www.douban.com/xmlns/',
    # dc and gd are only needed by the XPaths carried over from the Google
    # Books source this stub was copied from; drop them once the port is done
    'dc' : 'http://purl.org/dc/terms',
    'gd' : 'http://schemas.google.com/g/2005'
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)
total_results = XPath('//openSearch:totalResults')
start_index = XPath('//openSearch:startIndex')
items_per_page = XPath('//openSearch:itemsPerPage')
entry = XPath('//atom:entry')
entry_id = XPath('descendant::atom:id')
title = XPath('descendant::atom:title')
description = XPath('descendant::atom:summary')
publisher = XPath("descendant::db:attribute[@name='publisher']")
isbn = XPath("descendant::db:attribute[@name='isbn13']")
date = XPath("descendant::db:attribute[@name='pubdate']")
creator = XPath("descendant::db:attribute[@name='author']")
tag = XPath("descendant::db:tag")
# The next three are still referenced by to_metadata() below; they are
# assumed from the Google Books source, pending the port to Douban
identifier = XPath('descendant::dc:identifier')
subject = XPath('descendant::dc:subject')
rating = XPath('descendant::gd:rating')
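
# For reference, the db: XPaths above assume Douban Atom entries shaped
# roughly like this (a hedged sketch, not verbatim API output):
#
#   <entry xmlns="http://www.w3.org/2005/Atom"
#          xmlns:db="http://www.douban.com/xmlns/">
#     <id>http://api.douban.com/book/subject/1234567</id>
#     <title>Some Title</title>
#     <summary>Description text</summary>
#     <db:attribute name="author">Some Author</db:attribute>
#     <db:attribute name="publisher">Some Publisher</db:attribute>
#     <db:attribute name="isbn13">9780000000000</db:attribute>
#     <db:attribute name="pubdate">2011-01</db:attribute>
#     <db:tag name="fiction" count="100"/>
#   </entry>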

def get_details(browser, url, timeout): # {{{
    try:
        raw = browser.open_novisit(url, timeout=timeout).read()
    except Exception as e:
        gc = getattr(e, 'getcode', lambda : -1)
        if gc() != 403:
            raise
        # The server is throttling us, wait a little
        time.sleep(2)
        raw = browser.open_novisit(url, timeout=timeout).read()

    return raw
# }}}

def to_metadata(browser, log, entry_, timeout): # {{{

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    # The google identifier is a leftover from the source this stub was
    # copied from; it should become a douban identifier once the port is done
    mi.identifiers = {'google':google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    #mi.language = get_text(extra, language)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
# }}}

class Douban(Source):

    name = 'Douban Books'
    author = _('Li Fanxi')
    description = _('Downloads metadata from Douban.com')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'tags',
        'comments', 'publisher', 'identifier:isbn', 'rating',
        'identifier:douban']) # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
    # Carried over from the Google Books source; still referenced by the
    # cover handling below, so left defined until that path is ported
    GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
    DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])

    def get_book_url(self, identifiers): # {{{
        db = identifiers.get('douban', None)
        if db is not None:
            return db
        else:
            return None
    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
        SEARCH_URL = 'http://api.douban.com/book/subjects?'
        ISBN_URL = 'http://api.douban.com/book/subject/isbn/'

        q = ''
        t = None
        isbn = check_isbn(identifiers.get('isbn', None))
        if isbn is not None:
            q = isbn
            t = 'isbn'
        elif title or authors:
            def build_term(prefix, parts):
                # Douban's q parameter takes plain keywords, so unlike the
                # Google Books source the prefix is currently unused
                return ' '.join(x for x in parts)
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term('title', title_tokens)
            author_tokens = self.get_author_tokens(authors,
                    only_first_author=True)
            if author_tokens:
                q += ((' ' if q != '' else '') +
                    build_term('author', author_tokens))
            t = 'search'
        if isinstance(q, unicode):
            q = q.encode('utf-8')
        if not q:
            return None
        url = None
        if t == "isbn":
            url = ISBN_URL + q
        else:
            url = SEARCH_URL + urlencode({
                'q': q,
            })
        if self.DOUBAN_API_KEY:
            # The search URL already contains a '?', so only the ISBN URL
            # needs one here
            url += ('?' if t == 'isbn' else '&') + 'apikey=' + self.DOUBAN_API_KEY
        log('Douban query URL:', url)
        return url
    # }}}
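
    # For illustration (hedged; actual API behaviour may differ), the URLs
    # built above look like:
    #   ISBN lookup: http://api.douban.com/book/subject/isbn/0743273567?apikey=<key>
    #   Search:      http://api.douban.com/book/subjects?q=flatland+abbott&apikey=<key>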

    def download_cover(self, log, result_queue, abort, # {{{
            title=None, authors=None, identifiers={}, timeout=30):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:
                    log.warning('The server returned a dummy image, ignoring')
                else:
                    result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers): # {{{
        url = None
        # The cover cache is still keyed by the google identifier carried
        # over from the Google Books source this stub was copied from
        goog = identifiers.get('google', None)
        if goog is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                goog = self.cached_isbn_to_identifier(isbn)
        if goog is not None:
            url = self.cached_identifier_to_cover_url(goog)

        return url
    # }}}

    def get_all_details(self, br, log, entries, abort, # {{{
            result_queue, timeout):
        for relevance, i in enumerate(entries):
            try:
                ans = to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    goog = ans.identifiers['google']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, goog)
                    if ans.has_google_cover:
                        self.cache_identifier_to_cover_url(goog,
                                self.GOOGLE_COVER%goog)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
                log.exception(
                    'Failed to get metadata for identify entry:',
                    etree.tostring(i))
            if abort.is_set():
                break
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
            identifiers={}, timeout=30):
        query = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                strip_encoding_pats=True)[0], parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

        if not entries and identifiers and title and authors and \
                not abort.is_set():
            # Retry as a plain title/author search, without the identifiers
            return self.identify(log, result_queue, abort, title=title,
                    authors=authors, timeout=timeout)

        # There is no point running these queries in threads as the server
        # throttles requests, returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
    # }}}

if __name__ == '__main__': # tests {{{
    # To run these tests, use:
    #   calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
    # Note: these test cases are carried over from the Google Books source
    # and will need Douban-appropriate replacements
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test, authors_test)
    test_identify_plugin(Douban.name,
        [

            (
                {'identifiers':{'isbn': '0743273567'}, 'title':'Great Gatsby',
                    'authors':['Fitzgerald']},
                [title_test('The great gatsby', exact=True),
                    authors_test(['Francis Scott Fitzgerald'])]
            ),

            (
                {'title': 'Flatland', 'authors':['Abbott']},
                [title_test('Flatland', exact=False)]
            ),
        ])
# }}}
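
A quick way to eyeball the stub without the full test harness is to call create_query() directly (a sketch; Douban(None) and default_log are assumptions, not part of this commit):

from calibre.ebooks.metadata.sources.douban import Douban
from calibre.utils.logging import default_log

d = Douban(None)
# Search URL built from title/author tokens
print(d.create_query(default_log, title='Flatland', authors=['Abbott']))
# Direct ISBN lookup URL
print(d.create_query(default_log, identifiers={'isbn': '0743273567'}))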