Get books: Update the Gutenberg plugin to adapt for changes to the website

2025-09-14 16:18:05 -04:00 · 2020-10-23 11:01:35 +05:30 · 2020-10-23 11:01:35 +05:30 · 62e9722478
commit 62e9722478
parent b0e276435f
1 changed file with 63 additions and 94 deletions
--- a/src/calibre/gui2/store/stores/gutenberg_plugin.py
+++ b/src/calibre/gui2/store/stores/gutenberg_plugin.py
@@ -1,136 +1,105 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function, unicode_literals

-store_version = 6  # Needed for dynamic plugin loading
+store_version = 7  # Needed for dynamic plugin loading

 __license__ = 'GPL 3'
 __copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-import base64
 import mimetypes
-import re
-from contextlib import closing
+
 try:
    from urllib.parse import quote_plus
 except ImportError:
    from urllib import quote_plus

+from html5_parser import parse
 from lxml import etree

-from calibre import browser, url_slash_cleaner
-from calibre.constants import __appname__, __version__
-from calibre.gui2.store.basic_config import BasicStoreConfig
-from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
+from calibre import browser
+from calibre.gui2 import open_url
+from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.search_result import SearchResult
-
-web_url = 'http://m.gutenberg.org/'
+from calibre.gui2.store.web_store_dialog import WebStoreDialog
+from css_selectors import Select


-def fix_url(url):
-    if url and url.startswith('//'):
-        url = 'http:' + url
-    return url
+def absurl(href):
+    if href.startswith('//'):
+        href = 'https:' + href
+    elif href.startswith('/'):
+        href = 'https://www.gutenberg.org' + href
+    return href


 def search(query, max_results=10, timeout=60, write_raw_to=None):
-    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)
+    url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(quote_plus(query))

    counter = max_results
-    br = browser(user_agent='calibre/'+__version__)
-    with closing(br.open(url, timeout=timeout)) as f:
-        raw = f.read()
-        if write_raw_to is not None:
-            with open(write_raw_to, 'wb') as f:
-                f.write(raw)
-        doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
-        for data in doc.xpath('//*[local-name() = "entry"]'):
-            if counter <= 0:
-                break
+    br = browser()
+    raw = br.open(url).read()

-            counter -= 1
+    if write_raw_to is not None:
+        with open(write_raw_to, 'wb') as f:
+            f.write(raw)

-            s = SearchResult()
+    root = parse(raw)
+    CSSSelect = Select(root)
+    for li in CSSSelect('li.booklink'):
+        if counter <= 0:
+            break
+        counter -= 1

-            # We could use the <link rel="alternate" type="text/html" ...> tag from the
-            # detail odps page but this is easier.
-            id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
-            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))
-            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
-            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
-            if not s.title or not s.author:
-                continue
+        s = SearchResult()
+        a = next(CSSSelect('a.link', li))
+        s.detail_item = absurl(a.get('href'))
+        s.title = etree.tostring(next(CSSSelect('span.title', li)), method='text', encoding='unicode').strip()
+        s.author = etree.tostring(next(CSSSelect('span.subtitle', li)), method='text', encoding='unicode').strip()
+        for img in CSSSelect('img.cover-thumb', li):
+            s.cover_url = absurl(img.get('src'))
+            break

-            # Get the formats and direct download links.
-            with closing(br.open(id, timeout=timeout/4)) as nf:
-                ndoc = etree.fromstring(nf.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
-                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
-                    type = link.get('type')
-                    href = link.get('href')
-                    if type:
-                        ext = mimetypes.guess_extension(type)
-                        if ext:
-                            ext = ext[1:].upper().strip()
-                            s.downloads[ext] = fix_url(href)
+        # Get the formats and direct download links.
+        details_doc = parse(br.open_novisit(s.detail_item).read())
+        doc_select = Select(details_doc)
+        for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
+            for a in doc_select('a.link', tr):
+                href = a.get('href')
+                type = a.get('type')
+                ext = mimetypes.guess_extension(type.split(';')[0]) if type else None
+                if href and ext:
+                    url = absurl(href.split('?')[0])
+                    ext = ext[1:].upper().strip()
+                    if ext not in s.downloads:
+                        s.downloads[ext] = url
+                    break

-            s.formats = ', '.join(s.downloads.keys())
-            if not s.formats:
-                continue
+        s.formats = ', '.join(s.downloads.keys())
+        if not s.formats:
+            continue

-            for link in data.xpath('./*[local-name() = "link"]'):
-                rel = link.get('rel')
-                href = link.get('href')
-                type = link.get('type')
-
-                if rel and href and type:
-                    href = fix_url(href)
-                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
-                        if href.startswith('data:image/png;base64,'):
-                            cdata = href.replace('data:image/png;base64,', '')
-                            if not isinstance(cdata, bytes):
-                                cdata = cdata.encode('ascii')
-                            s.cover_data = base64.b64decode(cdata)
-
-            yield s
+        yield s


-class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
-
-    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
-    web_url = web_url
-
-    def create_browser(self):
-        from calibre import browser
-        user_agent = '%s/%s' % (__appname__, __version__)
-        return browser(user_agent=user_agent)
+class GutenbergStore(StorePlugin):

    def search(self, query, max_results=10, timeout=60):
-        '''
-        Gutenberg's ODPS feed is poorly implmented and has a number of issues
-        which require very special handling to fix the results.
-
-        Issues:
-          * "Sort Alphabetically" and "Sort by Release Date" are returned
-            as book entries.
-          * The author is put into a "content" tag and not the author tag.
-          * The link to the book itself goes to an odps page which we need
-            to turn into a link to a web page.
-          * acquisition links are not part of the search result so we have
-            to go to the odps item itself. Detail item pages have a nasty
-            note saying:
-              DON'T USE THIS PAGE FOR SCRAPING.
-              Seriously. You'll only get your IP blocked.
-            We're using the ODPS feed because people are getting blocked with
-            the previous implementation so due to this using ODPS probably
-            won't solve this issue.
-          * Images are not links but base64 encoded strings. They are also not
-            real cover images but a little blue book thumbnail.
-        '''
        for result in search(query, max_results, timeout):
            yield result

+    def open(self, parent=None, detail_item=None, external=False):
+        url = detail_item or absurl('/')
+        if external:
+            open_url(url)
+            return
+        d = WebStoreDialog(self.gui, url, parent, detail_item)
+        d.setWindowTitle(self.name)
+        d.exec_()
+

 if __name__ == '__main__':
    import sys
+
    for result in search(' '.join(sys.argv[1:]), write_raw_to='/t/gutenberg.html'):
        print(result)