First working version of Douban book plugin.

commit ea4b5b9054
parent fabef627e3
Author: Byron Li
Date:   2011-04-29 16:29:57 +08:00


@@ -25,14 +25,8 @@ from calibre import as_unicode
-NAMESPACES = {
-    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
-    'atom' : 'http://www.w3.org/2005/Atom',
-    'dc' : 'http://purl.org/dc/terms',
-    'gd' : 'http://schemas.google.com/g/2005'
-}
 NAMESPACES = {
     'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
     'atom' : 'http://www.w3.org/2005/Atom',
-    'db': 'http://www.douban.com/xmlns/'
+    'db': 'http://www.douban.com/xmlns/',
     'gd': 'http://schemas.google.com/g/2005'
 }
 XPath = partial(etree.XPath, namespaces=NAMESPACES)
 total_results = XPath('//openSearch:totalResults')
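Everything below leans on this merged namespace map: partial(etree.XPath, namespaces=NAMESPACES) pre-binds the prefixes so each selector is compiled once at import time. A minimal standalone sketch of the pattern (the sample feed is invented, not real Douban output):

from functools import partial
from lxml import etree

NAMESPACES = {
    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom': 'http://www.w3.org/2005/Atom',
    'db': 'http://www.douban.com/xmlns/',
    'gd': 'http://schemas.google.com/g/2005',
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)
total_results = XPath('//openSearch:totalResults')

feed = etree.fromstring(
    b'<feed xmlns="http://www.w3.org/2005/Atom" '
    b'xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/">'
    b'<openSearch:totalResults>2</openSearch:totalResults>'
    b'</feed>')
print(total_results(feed)[0].text)  # -> '2'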
@@ -47,6 +41,8 @@ isbn = XPath("descendant::db:attribute[@name='isbn13']")
 date = XPath("descendant::db:attribute[@name='pubdate']")
 creator = XPath("descendant::db:attribute[@name='author']")
 tag = XPath("descendant::db:tag")
+rating = XPath("descendant::gd:rating[@name='average']")
+cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

 def get_details(browser, url, timeout): # {{{
     try:
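The two new selectors pull the average-rating element and the cover link href. A sketch of how they evaluate, with an invented entry fragment shaped to match the XPath expressions above (real Douban entries may differ):

from lxml import etree

ns = {'gd': 'http://schemas.google.com/g/2005',
      'atom': 'http://www.w3.org/2005/Atom'}
rating = etree.XPath("descendant::gd:rating[@name='average']", namespaces=ns)
cover_url = etree.XPath("descendant::atom:link[@rel='image']/attribute::href",
                        namespaces=ns)

entry = etree.fromstring(
    b'<entry xmlns="http://www.w3.org/2005/Atom" '
    b'xmlns:gd="http://schemas.google.com/g/2005">'
    b'<gd:rating name="average">8.6</gd:rating>'
    b'<link rel="image" href="http://img.example.org/spic/s123.jpg"/>'
    b'</entry>')
print(rating(entry)[0].text)  # '8.6' (Douban rates on a 10-point scale)
print(cover_url(entry)[0])    # the raw href string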
@@ -77,7 +73,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     id_url = entry_id(entry_)[0].text
-    google_id = id_url.split('/')[-1]
+    douban_id = id_url.split('/')[-1]
     title_ = ': '.join([x.text for x in title(entry_)]).strip()
     authors = [x.text.strip() for x in creator(entry_) if x.text]
     if not authors:
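Only the identifier name changes here; the entry id is still a URL whose last path segment is the subject id. For example (id value invented, URL shape taken from DOUBAN_ID_URL below):

id_url = 'http://api.douban.com/book/subject/1234567'
douban_id = id_url.split('/')[-1]  # -> '1234567'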
@@ -87,7 +83,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
         return None
     mi = Metadata(title_, authors)
-    mi.identifiers = {'google':google_id}
+    mi.identifiers = {'douban':douban_id}
     try:
         raw = get_details(browser, id_url, timeout)
         feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
@@ -103,13 +99,9 @@ def to_metadata(browser, log, entry_, timeout): # {{{
     # ISBN
     isbns = []
-    for x in identifier(extra):
-        t = str(x.text).strip()
-        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
-            if t[:5].upper() == 'ISBN:':
-                t = check_isbn(t[5:])
-                if t:
-                    isbns.append(t)
+    for x in [t.text for t in isbn(extra)]:
+        if check_isbn(x):
+            isbns.append(x)
     if isbns:
         mi.isbn = sorted(isbns, key=len)[-1]
     mi.all_isbns = isbns
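The rewritten loop trusts the db:attribute[@name='isbn13'] elements and keeps only values that pass check_isbn; sorted(isbns, key=len)[-1] then prefers an ISBN-13 over an ISBN-10 when both validate. Illustrative values:

isbns = ['0306406152', '9780306406157']  # same book, ISBN-10 and ISBN-13
print(sorted(isbns, key=len)[-1])        # '9780306406157'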
@@ -139,21 +131,23 @@ def to_metadata(browser, log, entry_, timeout): # {{{
             log.error('Failed to parse pubdate %r'%pubdate)

     # Ratings
-    for x in rating(extra):
+    if rating(extra):
         try:
-            mi.rating = float(x.get('average'))
-            if mi.rating > 5:
-                mi.rating /= 2
+            # Douban reports a 0-10 average as element text, so halve it
+            mi.rating = float(rating(extra)[0].text) / 2.0
         except:
             log.exception('Failed to parse rating')
+            mi.rating = 0

     # Cover
-    mi.has_google_cover = None
-    for x in extra.xpath(
-            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
-        mi.has_google_cover = x.get('href')
-        break
+    mi.has_douban_cover = None
+    u = cover_url(extra)
+    if u:
+        # Ask for the large cover instead of the small thumbnail
+        u = u[0].replace('/spic/', '/lpic/')
+        # If the URL contains "book-default", the book has no real cover
+        if u.find('book-default') == -1:
+            mi.has_douban_cover = u

     return mi
 # }}}
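Two conversions worth noting: Douban's average is on a 0-10 scale while calibre ratings run 0-5, hence the /2.0, and the /spic/ to /lpic/ rewrite swaps the small-thumbnail path for the large cover. Sketch with invented values (the URL shape is assumed from the replace call above):

avg = '8.6'
print(float(avg) / 2.0)     # 4.3, a calibre-scale rating

u = 'http://img3.douban.com/spic/s1070959.jpg'
u = u.replace('/spic/', '/lpic/')
print(u)                    # http://img3.douban.com/lpic/s1070959.jpg
print('book-default' in u)  # False, so this counts as a real cover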
@@ -172,6 +166,7 @@ class Douban(Source):
     cached_cover_url_is_reliable = True

+    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+    DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s'
     # GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
     # DUMMY_IMAGE_MD5 = frozenset(['0de4383ebad0adad5eeb8975cd796657'])
@@ -179,7 +174,7 @@ class Douban(Source):
     def get_book_url(self, identifiers): # {{{
         db = identifiers.get('douban', None)
         if db is not None:
-            return db
+            return self.DOUBAN_ID_URL % db
         else:
             return None
     # }}}
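The fix here is twofold: return a real URL instead of the bare id, and qualify the constant with self., since DOUBAN_ID_URL is a class attribute rather than a module global. The formatting itself is plain %-interpolation:

DOUBAN_ID_URL = 'http://api.douban.com/book/subject/%s'
print(DOUBAN_ID_URL % '1234567')
# http://api.douban.com/book/subject/1234567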
@@ -206,11 +201,11 @@ class Douban(Source):
             q += ((' ' if q != '' else '') +
                     build_term('author', author_tokens))
+            t = 'search'
         q = q.strip()
         if isinstance(q, unicode):
             q = q.encode('utf-8')
         if not q:
             return None
-        print(q)
         url = None
         if t == "isbn":
             url = ISBN_URL + q
@@ -220,7 +215,6 @@ class Douban(Source):
             })
         if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
             url = url + "?apikey=" + self.DOUBAN_API_KEY
-        print(url)
         return url
     # }}}
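End to end, the ISBN branch produces an API-key-suffixed URL like the one below. ISBN_URL is defined earlier in the file and not shown in this hunk, so its value here is an assumption; the key is the one committed above:

ISBN_URL = 'http://api.douban.com/book/subject/isbn/'  # assumed prefix
DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'

url = ISBN_URL + '9780306406157'
if DOUBAN_API_KEY:  # plain truthiness already covers the != '' check
    url += '?apikey=' + DOUBAN_API_KEY
print(url)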
@@ -257,10 +251,7 @@ class Douban(Source):
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
             if cdata:
-                if hashlib.md5(cdata).hexdigest() in self.DUMMY_IMAGE_MD5:
-                    log.warning('Google returned a dummy image, ignoring')
-                else:
-                    result_queue.put((self, cdata))
+                result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
@@ -268,13 +259,13 @@ class Douban(Source):
     def get_cached_cover_url(self, identifiers): # {{{
         url = None
-        goog = identifiers.get('google', None)
-        if goog is None:
+        db = identifiers.get('douban', None)
+        if db is None:
             isbn = identifiers.get('isbn', None)
             if isbn is not None:
-                goog = self.cached_isbn_to_identifier(isbn)
-        if goog is not None:
-            url = self.cached_identifier_to_cover_url(goog)
+                db = self.cached_isbn_to_identifier(isbn)
+        if db is not None:
+            url = self.cached_identifier_to_cover_url(db)
         return url
     # }}}
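The lookup order survives the rename: an explicit douban identifier wins, otherwise the ISBN is translated through the cache that identify() fills. A standalone mimic with plain dicts standing in for the Source caches (all values hypothetical):

isbn_to_id = {'9780306406157': '1234567'}
id_to_cover = {'1234567': 'http://img3.douban.com/lpic/s1070959.jpg'}

def cached_cover_url(identifiers):
    db = identifiers.get('douban')
    if db is None:
        isbn = identifiers.get('isbn')
        if isbn is not None:
            db = isbn_to_id.get(isbn)
    return id_to_cover.get(db) if db is not None else None

print(cached_cover_url({'isbn': '9780306406157'}))  # the lpic URL
print(cached_cover_url({}))                         # None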
@@ -286,12 +277,12 @@ class Douban(Source):
                 ans = to_metadata(br, log, i, timeout)
                 if isinstance(ans, Metadata):
                     ans.source_relevance = relevance
-                    goog = ans.identifiers['google']
+                    db = ans.identifiers['douban']
                     for isbn in getattr(ans, 'all_isbns', []):
-                        self.cache_isbn_to_identifier(isbn, goog)
-                    if ans.has_google_cover:
-                        self.cache_identifier_to_cover_url(goog,
-                                self.GOOGLE_COVER%goog)
+                        self.cache_isbn_to_identifier(isbn, db)
+                    if ans.has_douban_cover:
+                        self.cache_identifier_to_cover_url(db,
+                                ans.has_douban_cover)
                     self.clean_downloaded_metadata(ans)
                     result_queue.put(ans)
             except:
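One behavioural difference hides in the cover caching: has_douban_cover already holds the full image URL taken from the feed, so it is cached as-is, whereas the Google code synthesized the URL from the GOOGLE_COVER template. Dict stand-ins for the two plugin caches (values hypothetical):

isbn_cache, cover_cache = {}, {}
db = '1234567'
has_douban_cover = 'http://img3.douban.com/lpic/s1070959.jpg'
for isbn in ['0306406152', '9780306406157']:
    isbn_cache[isbn] = db               # later resolved by get_cached_cover_url
if has_douban_cover:
    cover_cache[db] = has_douban_cover  # URL stored directly, no template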
@@ -315,7 +306,6 @@ class Douban(Source):
         except Exception as e:
             log.exception('Failed to make identify query: %r'%query)
             return as_unicode(e)
-
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
             feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
@@ -324,7 +314,8 @@ class Douban(Source):
         except Exception as e:
             log.exception('Failed to parse identify results')
             return as_unicode(e)
+        if not title:
+            title = ""
         if not entries and identifiers and title and authors and \
                 not abort.is_set():
             return self.identify(log, result_queue, abort, title=title,