mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Get books: Update the Gutenberg plugin to adapt for changes to the website
This commit is contained in:
parent
b0e276435f
commit
62e9722478
@ -1,136 +1,105 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
store_version = 6 # Needed for dynamic plugin loading
|
store_version = 7 # Needed for dynamic plugin loading
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import base64
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import re
|
|
||||||
from contextlib import closing
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urllib import quote_plus
|
from urllib import quote_plus
|
||||||
|
|
||||||
|
from html5_parser import parse
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre import browser, url_slash_cleaner
|
from calibre import browser
|
||||||
from calibre.constants import __appname__, __version__
|
from calibre.gui2 import open_url
|
||||||
from calibre.gui2.store.basic_config import BasicStoreConfig
|
from calibre.gui2.store import StorePlugin
|
||||||
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
|
|
||||||
from calibre.gui2.store.search_result import SearchResult
|
from calibre.gui2.store.search_result import SearchResult
|
||||||
|
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
||||||
web_url = 'http://m.gutenberg.org/'
|
from css_selectors import Select
|
||||||
|
|
||||||
|
|
||||||
def absurl(href):
    '''
    Turn a relative or protocol-relative Project Gutenberg URL into an
    absolute https URL.

    :param href: URL (possibly relative) or None
    :return: absolute URL; empty/None values are returned unchanged
    '''
    if not href:
        # Guard against None/empty, e.g. a parsed tag with no href
        # attribute (a.get('href') returns None).
        return href
    if href.startswith('//'):
        # Protocol-relative: only the scheme is missing.
        href = 'https:' + href
    elif href.startswith('/'):
        # Site-relative: anchor at the Gutenberg host.
        href = 'https://www.gutenberg.org' + href
    return href
||||||
|
|
||||||
|
|
||||||
def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''
    Scrape the Project Gutenberg website for books matching *query*.

    Yields :class:`SearchResult` objects populated with title, author,
    detail page URL, cover thumbnail URL and a map of format -> direct
    download URL.

    :param query: the search terms
    :param max_results: stop after yielding this many results
    :param timeout: network timeout in seconds for each request
    :param write_raw_to: optional path; when set, the raw HTML of the
        results page is written there as a debugging aid
    '''
    search_url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(quote_plus(query))

    counter = max_results
    br = browser()
    # FIX: honour the timeout parameter; it was previously accepted but
    # never passed to the network calls.
    raw = br.open(search_url, timeout=timeout).read()
    if write_raw_to is not None:
        with open(write_raw_to, 'wb') as f:
            f.write(raw)
    root = parse(raw)
    CSSSelect = Select(root)
    for li in CSSSelect('li.booklink'):
        if counter <= 0:
            break
        counter -= 1

        s = SearchResult()
        a = next(CSSSelect('a.link', li))
        s.detail_item = absurl(a.get('href'))
        s.title = etree.tostring(next(CSSSelect('span.title', li)), method='text', encoding='unicode').strip()
        s.author = etree.tostring(next(CSSSelect('span.subtitle', li)), method='text', encoding='unicode').strip()
        for img in CSSSelect('img.cover-thumb', li):
            s.cover_url = absurl(img.get('src'))
            break

        # Get the formats and direct download links from the book's
        # detail page. open_novisit avoids recording browser history.
        details_doc = parse(br.open_novisit(s.detail_item, timeout=timeout).read())
        doc_select = Select(details_doc)
        for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
            # Only the first link in each row is the download link.
            for a in doc_select('a.link', tr):
                href = a.get('href')
                mime_type = a.get('type')  # renamed: don't shadow builtin ``type``
                # The type attribute may carry parameters ("; charset=...");
                # guess_extension only understands the bare media type.
                ext = mimetypes.guess_extension(mime_type.split(';')[0]) if mime_type else None
                if href and ext:
                    # Strip query parameters from the download URL.
                    download_url = absurl(href.split('?')[0])
                    ext = ext[1:].upper().strip()
                    # Keep only the first URL seen for each format.
                    if ext not in s.downloads:
                        s.downloads[ext] = download_url
                break

        s.formats = ', '.join(s.downloads.keys())
        if not s.formats:
            # Entry with no downloadable formats: not a useful result.
            continue

        yield s
class GutenbergStore(StorePlugin):

    '''
    Store plugin that searches the Project Gutenberg website directly
    (scraping the search results page rather than the OPDS feed).
    '''

    def search(self, query, max_results=10, timeout=60):
        # Delegate to the module-level scraper; this method only adapts
        # it to the StorePlugin interface.
        for res in search(query, max_results, timeout):
            yield res

    def open(self, parent=None, detail_item=None, external=False):
        # Fall back to the site root when no specific book was requested.
        url = detail_item or absurl('/')
        if external:
            # Hand the URL off to the system browser.
            open_url(url)
            return
        # Otherwise show the site inside calibre's embedded store dialog.
        dialog = WebStoreDialog(self.gui, url, parent, detail_item)
        dialog.setWindowTitle(self.name)
        dialog.exec_()
if __name__ == '__main__':
    # Manual test harness: search for the terms given on the command
    # line and print each result; the raw results page is saved to disk
    # for inspection.
    import sys
    query = ' '.join(sys.argv[1:])
    for result in search(query, write_raw_to='/t/gutenberg.html'):
        print(result)
Loading…
x
Reference in New Issue
Block a user