First working version of Douban book plugin.

commit ea4b5b9054
parent fabef627e3
Author: Byron Li
Date:   2011-04-29 16:29:57 +08:00


@@ -25,14 +25,8 @@ from calibre import as_unicode
-NAMESPACES = {
-    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
-    'atom' : 'http://www.w3.org/2005/Atom',
-    'dc' : 'http://purl.org/dc/terms',
-    'gd' : 'http://schemas.google.com/g/2005'
-}
 NAMESPACES = {
     'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
     'atom' : 'http://www.w3.org/2005/Atom',
-    'db': 'http://www.douban.com/xmlns/'
+    'db': 'http://www.douban.com/xmlns/',
     'gd': 'http://schemas.google.com/g/2005'
 }
 XPath = partial(etree.XPath, namespaces=NAMESPACES)
 total_results = XPath('//openSearch:totalResults')
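Everything below leans on this merged namespace map: partial(etree.XPath, namespaces=NAMESPACES) pre-binds the prefixes so each selector is compiled once at import time. A minimal standalone sketch of the pattern (the sample feed is invented, not real Douban output):

from functools import partial
from lxml import etree

NAMESPACES = {
    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom': 'http://www.w3.org/2005/Atom',
    'db': 'http://www.douban.com/xmlns/',
    'gd': 'http://schemas.google.com/g/2005',
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)
total_results = XPath('//openSearch:totalResults')

feed = etree.fromstring(
    b'<feed xmlns="http://www.w3.org/2005/Atom" '
    b'xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/">'
    b'<openSearch:totalResults>2</openSearch:totalResults>'
    b'</feed>')
print(total_results(feed)[0].text)  # -> '2'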
@@ -47,6 +41,8 @@ isbn = XPath("descendant::db:attribute[@name='isbn13']")
 date = XPath("descendant::db:attribute[@name='pubdate']")
 creator = XPath("descendant::db:attribute[@name='author']")
 tag = XPath("descendant::db:tag")
+rating = XPath("descendant::gd:rating[@name='average']")
+cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

 def get_details(browser, url, timeout): # {{{
     try:
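The two new selectors pull the average-rating element and the cover link href. A sketch of how they evaluate, with an invented entry fragment shaped to match the XPath expressions above (real Douban entries may differ):

from lxml import etree

ns = {'gd': 'http://schemas.google.com/g/2005',
      'atom': 'http://www.w3.org/2005/Atom'}
rating = etree.XPath("descendant::gd:rating[@name='average']", namespaces=ns)
cover_url = etree.XPath("descendant::atom:link[@rel='image']/attribute::href",
                        namespaces=ns)

entry = etree.fromstring(
    b'<entry xmlns="http://www.w3.org/2005/Atom" '
    b'xmlns:gd="http://schemas.google.com/g/2005">'
    b'<gd:rating name="average">8.6</gd:rating>'
    b'<link rel="image" href="http://img.example.org/spic/s123.jpg"/>'
    b'</entry>')
print(rating(entry)[0].text)  # '8.6' (Douban rates on a 10-point scale)
print(cover_url(entry)[0])    # the raw href string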
@@ -77,7 +73,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     id_url = entry_id(entry_)[0].text
-    google_id = id_url.split('/')[-1]
+    douban_id = id_url.split('/')[-1]
     title_ = ': '.join([x.text for x in title(entry_)]).strip()
     authors = [x.text.strip() for x in creator(entry_) if x.text]
     if not authors:
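Only the identifier name changes here; the entry id is still a URL whose last path segment is the subject id. For example (id value invented, URL shape taken from DOUBAN_ID_URL below):

id_url = 'http://api.douban.com/book/subject/1234567'
douban_id = id_url.split('/')[-1]  # -> '1234567'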
@@ -87,7 +83,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
         return None
     mi = Metadata(title_, authors)
-    mi.identifiers = {'google':google_id}
+    mi.identifiers = {'douban':douban_id}
     try:
         raw = get_details(browser, id_url, timeout)
         feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
@@ -103,13 +99,9 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     # ISBN
     isbns = []
-    for x in identifier(extra):
-        t = str(x.text).strip()
-        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
-            if t[:5].upper() == 'ISBN:':
-                t = check_isbn(t[5:])
-                if t:
-                    isbns.append(t)
+    for x in [t.text for t in isbn(extra)]:
+        if check_isbn(x):
+            isbns.append(x)
     if isbns:
         mi.isbn = sorted(isbns, key=len)[-1]
     mi.all_isbns = isbns
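The rewritten loop trusts the db:attribute[@name='isbn13'] elements and keeps only values that pass check_isbn; sorted(isbns, key=len)[-1] then prefers an ISBN-13 over an ISBN-10 when both validate. Illustrative values:

isbns = ['0306406152', '9780306406157']  # same book, ISBN-10 and ISBN-13
print(sorted(isbns, key=len)[-1])        # '9780306406157'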
@@ -139,21 +131,23 @@ def to_metadata(browser, log, entry_, timeout): # {{{
             log.error('Failed to parse pubdate %r'%pubdate)

     # Ratings
-    for x in rating(extra):
+    if rating(extra):
         try:
-            mi.rating = float(x.get('average'))
-            if mi.rating > 5:
-                mi.rating /= 2
+            # Douban reports a 0-10 average as element text, so halve it
+            mi.rating = float(rating(extra)[0].text) / 2.0
         except:
             log.exception('Failed to parse rating')
+            mi.rating = 0

     # Cover
-    mi.has_google_cover = None
-    for x in extra.xpath(
-            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
-        mi.has_google_cover = x.get('href')
-        break
+    mi.has_douban_cover = None
+    u = cover_url(extra)
+    if u:
+        # Ask for the large cover instead of the small thumbnail
+        u = u[0].replace('/spic/', '/lpic/')
+        # If the URL contains "book-default", the book has no real cover
+        if u.find('book-default') == -1:
+            mi.has_douban_cover = u

     return mi
 # }}}
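Two conversions worth noting: Douban's average is on a 0-10 scale while calibre ratings run 0-5, hence the /2.0, and the /spic/ to /lpic/ rewrite swaps the small-thumbnail path for the large cover. Sketch with invented values (the URL shape is assumed from the replace call above):

avg = '8.6'
print(float(avg) / 2.0)     # 4.3, a calibre-scale rating

u = 'http://img3.douban.com/spic/s1070959.jpg'
u = u.replace('/spic/', '/lpic/')
print(u)                    # http://img3.douban.com/lpic/s1070959.jpg
print('book-default' in u)  # False, so this counts as a real cover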
@@ -172,6 +166,7 @@ class Douban(Source):
     cached_cover_url_is_reliable = True

+    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s'
     # GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
     # DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])
@@ -179,7 +174,7 @@ class Douban(Source):
     def get_book_url(self, identifiers): # {{{
         db = identifiers.get('douban', None)
         if db is not None:
-            return db
+            return self.DOUBAN_ID_URL % db
         else:
             return None
     # }}}
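The fix here is twofold: return a real URL instead of the bare id, and qualify the constant with self., since DOUBAN_ID_URL is a class attribute rather than a module global. The formatting itself is plain %-interpolation:

DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s'
print(DOUBAN_ID_URL % '1234567')
# http://api.douban.com/book/subject/1234567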
@@ -206,11 +201,11 @@ class Douban(Source):
             q += ((' ' if q != '' else '') +
                     build_term('author', author_tokens))
+            t = 'search'
         q = q.strip()
         if isinstance(q, unicode):
             q = q.encode('utf-8')
         if not q:
             return None
-        print(q)
         url = None
         if t == "isbn":
             url = ISBN_URL + q
@@ -220,7 +215,6 @@ class Douban(Source):
             })
         if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
             url = url + "?apikey=" + self.DOUBAN_API_KEY
-        print(url)
         return url
     # }}}
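End to end, the ISBN branch produces an API-key-suffixed URL like the one below. ISBN_URL is defined earlier in the file and not shown in this hunk, so its value here is an assumption; the key is the one committed above:

ISBN_URL = 'http://api.douban.com/book/subject/isbn/'  # assumed prefix
DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'

url = ISBN_URL + '9780306406157'
if DOUBAN_API_KEY:  # plain truthiness already covers the != '' check
    url += '?apikey=' + DOUBAN_API_KEY
print(url)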
@@ -257,10 +251,7 @@ class Douban(Source):
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
             if cdata:
-                if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:
-                    log.warning('Google returned a dummy image, ignoring')
-                else:
-                    result_queue.put((self, cdata))
+                result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
@@ -268,13 +259,13 @@ class Douban(Source):
     def get_cached_cover_url(self, identifiers): # {{{
         url = None
-        goog = identifiers.get('google', None)
-        if goog is None:
+        db = identifiers.get('douban', None)
+        if db is None:
             isbn = identifiers.get('isbn', None)
             if isbn is not None:
-                goog = self.cached_isbn_to_identifier(isbn)
-        if goog is not None:
-            url = self.cached_identifier_to_cover_url(goog)
+                db = self.cached_isbn_to_identifier(isbn)
+        if db is not None:
+            url = self.cached_identifier_to_cover_url(db)
         return url
     # }}}
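The lookup order survives the rename: an explicit douban identifier wins, otherwise the ISBN is translated through the cache that identify() fills. A standalone mimic with plain dicts standing in for the Source caches (all values hypothetical):

isbn_to_id = {'9780306406157': '1234567'}
id_to_cover = {'1234567': 'http://img3.douban.com/lpic/s1070959.jpg'}

def cached_cover_url(identifiers):
    db = identifiers.get('douban')
    if db is None:
        isbn = identifiers.get('isbn')
        if isbn is not None:
            db = isbn_to_id.get(isbn)
    return id_to_cover.get(db) if db is not None else None

print(cached_cover_url({'isbn': '9780306406157'}))  # the lpic URL
print(cached_cover_url({}))                         # None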
@@ -286,12 +277,12 @@ class Douban(Source):
                 ans = to_metadata(br, log, i, timeout)
                 if isinstance(ans, Metadata):
                     ans.source_relevance = relevance
-                    goog = ans.identifiers['google']
+                    db = ans.identifiers['douban']
                     for isbn in getattr(ans, 'all_isbns', []):
-                        self.cache_isbn_to_identifier(isbn, goog)
-                    if ans.has_google_cover:
-                        self.cache_identifier_to_cover_url(goog,
-                                self.GOOGLE_COVER%goog)
+                        self.cache_isbn_to_identifier(isbn, db)
+                    if ans.has_douban_cover:
+                        self.cache_identifier_to_cover_url(db,
+                                ans.has_douban_cover)
                     self.clean_downloaded_metadata(ans)
                     result_queue.put(ans)
             except:
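One behavioural difference hides in the cover caching: has_douban_cover already holds the full image URL taken from the feed, so it is cached as-is, whereas the Google code synthesized the URL from the GOOGLE_COVER template. Dict stand-ins for the two plugin caches (values hypothetical):

isbn_cache, cover_cache = {}, {}
db = '1234567'
has_douban_cover = 'http://img3.douban.com/lpic/s1070959.jpg'
for isbn in ['0306406152', '9780306406157']:
    isbn_cache[isbn] = db               # later resolved by get_cached_cover_url
if has_douban_cover:
    cover_cache[db] = has_douban_cover  # URL stored directly, no template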
@@ -315,7 +306,6 @@ class Douban(Source):
         except Exception as e:
             log.exception('Failed to make identify query: %r'%query)
             return as_unicode(e)
-
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
             feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
@@ -324,7 +314,8 @@ class Douban(Source):
         except Exception as e:
             log.exception('Failed to parse identify results')
             return as_unicode(e)
+        if not title:
+            title = ""
         if not entries and identifiers and title and authors and \
                 not abort.is_set():
             return self.identify(log, result_queue, abort, title=title,