mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Get books: Update the Gutenberg plugin to adapt for changes to the website
This commit is contained in:
parent
b0e276435f
commit
62e9722478
@ -1,136 +1,105 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
store_version = 6 # Needed for dynamic plugin loading
|
store_version = 7 # Needed for dynamic plugin loading
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import base64
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import re
|
|
||||||
from contextlib import closing
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urllib import quote_plus
|
from urllib import quote_plus
|
||||||
|
|
||||||
|
from html5_parser import parse
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre import browser, url_slash_cleaner
|
from calibre import browser
|
||||||
from calibre.constants import __appname__, __version__
|
from calibre.gui2 import open_url
|
||||||
from calibre.gui2.store.basic_config import BasicStoreConfig
|
from calibre.gui2.store import StorePlugin
|
||||||
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
|
|
||||||
from calibre.gui2.store.search_result import SearchResult
|
from calibre.gui2.store.search_result import SearchResult
|
||||||
|
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
||||||
web_url = 'http://m.gutenberg.org/'
|
from css_selectors import Select
|
||||||
|
|
||||||
|
|
||||||
def absurl(href):
    '''
    Turn a relative or protocol-relative Project Gutenberg URL into an
    absolute https URL.

    :param href: URL (possibly relative) or None
    :return: absolute URL; empty/None values are returned unchanged
    '''
    if not href:
        # Guard against None/empty, e.g. a parsed tag with no href
        # attribute (a.get('href') returns None).
        return href
    if href.startswith('//'):
        # Protocol-relative: only the scheme is missing.
        href = 'https:' + href
    elif href.startswith('/'):
        # Site-relative: anchor at the Gutenberg host.
        href = 'https://www.gutenberg.org' + href
    return href
||||||
|
|
||||||
|
|
||||||
def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''
    Scrape the Project Gutenberg website for books matching *query*.

    Yields :class:`SearchResult` objects populated with title, author,
    detail page URL, cover thumbnail URL and a map of format -> direct
    download URL.

    :param query: the search terms
    :param max_results: stop after yielding this many results
    :param timeout: network timeout in seconds for each request
    :param write_raw_to: optional path; when set, the raw HTML of the
        results page is written there as a debugging aid
    '''
    search_url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(quote_plus(query))

    counter = max_results
    br = browser()
    # FIX: honour the timeout parameter; it was previously accepted but
    # never passed to the network calls.
    raw = br.open(search_url, timeout=timeout).read()
    if write_raw_to is not None:
        with open(write_raw_to, 'wb') as f:
            f.write(raw)
    root = parse(raw)
    CSSSelect = Select(root)
    for li in CSSSelect('li.booklink'):
        if counter <= 0:
            break
        counter -= 1

        s = SearchResult()
        a = next(CSSSelect('a.link', li))
        s.detail_item = absurl(a.get('href'))
        s.title = etree.tostring(next(CSSSelect('span.title', li)), method='text', encoding='unicode').strip()
        s.author = etree.tostring(next(CSSSelect('span.subtitle', li)), method='text', encoding='unicode').strip()
        for img in CSSSelect('img.cover-thumb', li):
            s.cover_url = absurl(img.get('src'))
            break

        # Get the formats and direct download links from the book's
        # detail page. open_novisit avoids recording browser history.
        details_doc = parse(br.open_novisit(s.detail_item, timeout=timeout).read())
        doc_select = Select(details_doc)
        for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
            # Only the first link in each row is the download link.
            for a in doc_select('a.link', tr):
                href = a.get('href')
                mime_type = a.get('type')  # renamed: don't shadow builtin ``type``
                # The type attribute may carry parameters ("; charset=...");
                # guess_extension only understands the bare media type.
                ext = mimetypes.guess_extension(mime_type.split(';')[0]) if mime_type else None
                if href and ext:
                    # Strip query parameters from the download URL.
                    download_url = absurl(href.split('?')[0])
                    ext = ext[1:].upper().strip()
                    # Keep only the first URL seen for each format.
                    if ext not in s.downloads:
                        s.downloads[ext] = download_url
                break

        s.formats = ', '.join(s.downloads.keys())
        if not s.formats:
            # Entry with no downloadable formats: not a useful result.
            continue

        yield s
class GutenbergStore(StorePlugin):

    '''
    Store plugin that searches the Project Gutenberg website directly
    (scraping the search results page rather than the OPDS feed).
    '''

    def search(self, query, max_results=10, timeout=60):
        # Delegate to the module-level scraper; this method only adapts
        # it to the StorePlugin interface.
        for res in search(query, max_results, timeout):
            yield res

    def open(self, parent=None, detail_item=None, external=False):
        # Fall back to the site root when no specific book was requested.
        url = detail_item or absurl('/')
        if external:
            # Hand the URL off to the system browser.
            open_url(url)
            return
        # Otherwise show the site inside calibre's embedded store dialog.
        dialog = WebStoreDialog(self.gui, url, parent, detail_item)
        dialog.setWindowTitle(self.name)
        dialog.exec_()
if __name__ == '__main__':
    # Manual test harness: search for the terms given on the command
    # line and print each result; the raw results page is saved to disk
    # for inspection.
    import sys
    query = ' '.join(sys.argv[1:])
    for result in search(query, write_raw_to='/t/gutenberg.html'):
        print(result)
Loading…
x
Reference in New Issue
Block a user