Get books: Update the Gutenberg plugin to adapt for changes to the website

This commit is contained in:
Kovid Goyal 2020-10-23 11:01:35 +05:30
parent b0e276435f
commit 62e9722478
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,136 +1,105 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
store_version = 6 # Needed for dynamic plugin loading
store_version = 7 # Needed for dynamic plugin loading
__license__ = 'GPL 3'
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import base64
import mimetypes
import re
from contextlib import closing
try:
from urllib.parse import quote_plus
except ImportError:
from urllib import quote_plus
from html5_parser import parse
from lxml import etree
from calibre import browser, url_slash_cleaner
from calibre.constants import __appname__, __version__
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre import browser
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult
web_url = 'http://m.gutenberg.org/'
from calibre.gui2.store.web_store_dialog import WebStoreDialog
from css_selectors import Select
def fix_url(url):
    """Return *url* with ``http:`` prepended when it is protocol-relative.

    Falsy values (``None``, ``''``) and URLs that do not start with ``//``
    are returned unchanged.
    """
    if not url or not url.startswith('//'):
        return url
    return 'http:' + url
def absurl(href):
    """Return *href* as an absolute URL on www.gutenberg.org.

    * Protocol-relative links (``//host/path``) get an ``https:`` scheme.
    * Site-relative links (``/path``) are prefixed with
      ``https://www.gutenberg.org``.
    * Anything else is returned unchanged.

    Falsy input (``None`` or ``''``) is returned as-is instead of raising
    ``AttributeError``; callers pass the result of ``element.get(...)``,
    which is ``None`` when the attribute is missing.
    """
    if not href:
        return href
    if href.startswith('//'):
        return 'https:' + href
    if href.startswith('/'):
        return 'https://www.gutenberg.org' + href
    return href
# NOTE(review): this span appears to be a rendered diff of search(): lines of
# an OPDS/XML-based implementation and of an HTML-scraping implementation are
# interleaved and the indentation has been lost. The comments below tag which
# approach each run of lines belongs to -- confirm against the real file.
#
# search() is a generator that yields SearchResult objects for `query`. It
# examines at most `max_results` entries (the counter is decremented before an
# entry can be skipped). When `write_raw_to` is not None the raw bytes of the
# search response are written to that path.
def search(query, max_results=10, timeout=60, write_raw_to=None):
# OPDS variant: search feed on the mobile site.
url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)
# HTML variant: regular search page on the main site.
url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(quote_plus(query))
counter = max_results
# OPDS variant: browser identifies itself as calibre/<version>; the request
# honours `timeout` and the response is closed via closing().
br = browser(user_agent='calibre/'+__version__)
with closing(br.open(url, timeout=timeout)) as f:
raw = f.read()
if write_raw_to is not None:
with open(write_raw_to, 'wb') as f:
f.write(raw)
# OPDS variant: lenient XML parse with network access and entity resolution
# disabled, then iterate over every element whose local name is "entry".
doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0:
break
# HTML variant: default browser; note that no timeout is passed to open() here.
br = browser()
raw = br.open(url).read()
counter -= 1
if write_raw_to is not None:
with open(write_raw_to, 'wb') as f:
f.write(raw)
s = SearchResult()
# HTML variant: parse the page with html5_parser and select one
# <li class="booklink"> element per book.
root = parse(raw)
CSSSelect = Select(root)
for li in CSSSelect('li.booklink'):
if counter <= 0:
break
counter -= 1
# We could use the <link rel="alternate" type="text/html" ...> tag from the
# detail OPDS page but this is easier.
# OPDS variant: build the web detail URL from the digits of the entry id and
# skip entries lacking a title or an author.
id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))
s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
if not s.title or not s.author:
continue
# HTML variant: next() raises StopIteration when a selector matches nothing.
# Title and author are the text content of span.title / span.subtitle; the
# cover URL comes from the first img.cover-thumb, if any.
s = SearchResult()
a = next(CSSSelect('a.link', li))
s.detail_item = absurl(a.get('href'))
s.title = etree.tostring(next(CSSSelect('span.title', li)), method='text', encoding='unicode').strip()
s.author = etree.tostring(next(CSSSelect('span.subtitle', li)), method='text', encoding='unicode').strip()
for img in CSSSelect('img.cover-thumb', li):
s.cover_url = absurl(img.get('src'))
break
# Get the formats and direct download links.
# OPDS variant: fetch the entry's own OPDS page (with a quarter of the
# timeout) and map each acquisition link's MIME type to an upper-case
# extension key in s.downloads.
with closing(br.open(id, timeout=timeout/4)) as nf:
ndoc = etree.fromstring(nf.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
type = link.get('type')
href = link.get('href')
if type:
ext = mimetypes.guess_extension(type)
if ext:
ext = ext[1:].upper().strip()
s.downloads[ext] = fix_url(href)
# Get the formats and direct download links.
# HTML variant: fetch the detail page and read the rows of the files table.
# Any ";parameter" suffix is dropped from the MIME type and any "?query" from
# the href; the first URL seen for a given extension is kept.
details_doc = parse(br.open_novisit(s.detail_item).read())
doc_select = Select(details_doc)
for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
for a in doc_select('a.link', tr):
href = a.get('href')
type = a.get('type')
ext = mimetypes.guess_extension(type.split(';')[0]) if type else None
if href and ext:
url = absurl(href.split('?')[0])
ext = ext[1:].upper().strip()
if ext not in s.downloads:
s.downloads[ext] = url
break
# Results with no downloadable format are skipped (this pair of statements
# appears twice in the span, presumably once per variant).
s.formats = ', '.join(s.downloads.keys())
if not s.formats:
continue
s.formats = ', '.join(s.downloads.keys())
if not s.formats:
continue
# OPDS variant: thumbnails arrive as base64 PNG data: URIs, decoded into
# s.cover_data.
for link in data.xpath('./*[local-name() = "link"]'):
rel = link.get('rel')
href = link.get('href')
type = link.get('type')
if rel and href and type:
href = fix_url(href)
if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
if href.startswith('data:image/png;base64,'):
cdata = href.replace('data:image/png;base64,', '')
if not isinstance(cdata, bytes):
cdata = cdata.encode('ascii')
s.cover_data = base64.b64decode(cdata)
yield s
yield s
class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
    """Project Gutenberg store built on the OpenSearch/OPDS store base class."""

    # OpenSearch description document for the Gutenberg catalog.
    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
    # Re-export the module-level web_url as a class attribute.
    web_url = web_url

    def create_browser(self):
        """Return a browser whose user agent is ``<appname>/<version>``."""
        from calibre import browser
        return browser(user_agent='%s/%s' % (__appname__, __version__))
class GutenbergStore(StorePlugin):
    """Store plugin for Project Gutenberg."""

    def search(self, query, max_results=10, timeout=60):
        '''
        Yield every result of the module-level ``search()`` for *query*.

        NOTE(review): the remarks below describe the OPDS feed; confirm they
        still apply to the module-level ``search()`` this method delegates to.

        Gutenberg's OPDS feed is poorly implemented and has a number of issues
        which require very special handling to fix the results.

        Issues:
          * "Sort Alphabetically" and "Sort by Release Date" are returned
            as book entries.
          * The author is put into a "content" tag and not the author tag.
          * The link to the book itself goes to an OPDS page which we need
            to turn into a link to a web page.
          * Acquisition links are not part of the search result so we have
            to go to the OPDS item itself. Detail item pages have a nasty
            note saying:
              DON'T USE THIS PAGE FOR SCRAPING.
              Seriously. You'll only get your IP blocked.
            We're using the OPDS feed because people are getting blocked with
            the previous implementation, so using OPDS probably won't solve
            this issue.
          * Images are not links but base64 encoded strings. They are also not
            real cover images but a little blue book thumbnail.
        '''
        for book in search(query, max_results, timeout):
            yield book

    def open(self, parent=None, detail_item=None, external=False):
        """Show *detail_item*, or ``absurl('/')`` when it is falsy.

        When *external* is true the URL is handed to ``open_url``; otherwise
        it is shown in a modal ``WebStoreDialog`` titled with ``self.name``.
        """
        url = detail_item if detail_item else absurl('/')
        if external:
            open_url(url)
        else:
            dialog = WebStoreDialog(self.gui, url, parent, detail_item)
            dialog.setWindowTitle(self.name)
            dialog.exec_()
if __name__ == '__main__':
    # Manual smoke test: search for the command-line words, dump the raw
    # search response to a scratch file and print every result.
    import sys
    query = ' '.join(sys.argv[1:])
    for book in search(query, write_raw_to='/t/gutenberg.html'):
        print(book)