diff --git a/src/calibre/gui2/store/stores/gutenberg_plugin.py b/src/calibre/gui2/store/stores/gutenberg_plugin.py
index 62852eeb79..9c5beb9bb0 100644
--- a/src/calibre/gui2/store/stores/gutenberg_plugin.py
+++ b/src/calibre/gui2/store/stores/gutenberg_plugin.py
@@ -1,136 +1,105 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-store_version = 6  # Needed for dynamic plugin loading
+store_version = 7  # Needed for dynamic plugin loading
 
 __license__ = 'GPL 3'
 __copyright__ = '2011, 2013, John Schember '
 __docformat__ = 'restructuredtext en'
 
-import base64
 import mimetypes
-import re
-from contextlib import closing
+
 try:
     from urllib.parse import quote_plus
 except ImportError:
     from urllib import quote_plus
 
+from html5_parser import parse
 from lxml import etree
 
-from calibre import browser, url_slash_cleaner
-from calibre.constants import __appname__, __version__
-from calibre.gui2.store.basic_config import BasicStoreConfig
-from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
+from calibre import browser
+from calibre.gui2 import open_url
+from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.search_result import SearchResult
-
-web_url = 'http://m.gutenberg.org/'
+from calibre.gui2.store.web_store_dialog import WebStoreDialog
+from css_selectors import Select
 
 
-def fix_url(url):
-    if url and url.startswith('//'):
-        url = 'http:' + url
-    return url
+def absurl(href):
+    if href.startswith('//'):
+        href = 'https:' + href
+    elif href.startswith('/'):
+        href = 'https://www.gutenberg.org' + href
+    return href
 
 
 def search(query, max_results=10, timeout=60, write_raw_to=None):
-    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)
+    url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(quote_plus(query))
 
     counter = max_results
-    br = browser(user_agent='calibre/'+__version__)
-    with closing(br.open(url, timeout=timeout)) as f:
-        raw = f.read()
-        if write_raw_to is not None:
-            with open(write_raw_to, 'wb') as f:
-                f.write(raw)
-        doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
-        for data in doc.xpath('//*[local-name() = "entry"]'):
-            if counter <= 0:
-                break
+    br = browser()
+    raw = br.open(url).read()
 
-            counter -= 1
+    if write_raw_to is not None:
+        with open(write_raw_to, 'wb') as f:
+            f.write(raw)
 
-            s = SearchResult()
+    root = parse(raw)
+    CSSSelect = Select(root)
+    for li in CSSSelect('li.booklink'):
+        if counter <= 0:
+            break
+        counter -= 1
 
-            # We could use the tag from the
-            # detail odps page but this is easier.
-            id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
-            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))
-            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
-            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
-            if not s.title or not s.author:
-                continue
+        s = SearchResult()
+        a = next(CSSSelect('a.link', li))
+        s.detail_item = absurl(a.get('href'))
+        s.title = etree.tostring(next(CSSSelect('span.title', li)), method='text', encoding='unicode').strip()
+        s.author = etree.tostring(next(CSSSelect('span.subtitle', li)), method='text', encoding='unicode').strip()
+        for img in CSSSelect('img.cover-thumb', li):
+            s.cover_url = absurl(img.get('src'))
+            break
 
-            # Get the formats and direct download links.
-            with closing(br.open(id, timeout=timeout/4)) as nf:
-                ndoc = etree.fromstring(nf.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
-                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
-                    type = link.get('type')
-                    href = link.get('href')
-                    if type:
-                        ext = mimetypes.guess_extension(type)
-                        if ext:
-                            ext = ext[1:].upper().strip()
-                            s.downloads[ext] = fix_url(href)
+        # Get the formats and direct download links.
+        details_doc = parse(br.open_novisit(s.detail_item).read())
+        doc_select = Select(details_doc)
+        for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
+            for a in doc_select('a.link', tr):
+                href = a.get('href')
+                type = a.get('type')
+                ext = mimetypes.guess_extension(type.split(';')[0]) if type else None
+                if href and ext:
+                    url = absurl(href.split('?')[0])
+                    ext = ext[1:].upper().strip()
+                    if ext not in s.downloads:
+                        s.downloads[ext] = url
+                break
 
-            s.formats = ', '.join(s.downloads.keys())
-            if not s.formats:
-                continue
+        s.formats = ', '.join(s.downloads.keys())
+        if not s.formats:
+            continue
 
-            for link in data.xpath('./*[local-name() = "link"]'):
-                rel = link.get('rel')
-                href = link.get('href')
-                type = link.get('type')
-
-                if rel and href and type:
-                    href = fix_url(href)
-                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
-                        if href.startswith('data:image/png;base64,'):
-                            cdata = href.replace('data:image/png;base64,', '')
-                            if not isinstance(cdata, bytes):
-                                cdata = cdata.encode('ascii')
-                            s.cover_data = base64.b64decode(cdata)
-
-            yield s
+        yield s
 
 
-class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
-
-    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
-    web_url = web_url
-
-    def create_browser(self):
-        from calibre import browser
-        user_agent = '%s/%s' % (__appname__, __version__)
-        return browser(user_agent=user_agent)
+class GutenbergStore(StorePlugin):
 
     def search(self, query, max_results=10, timeout=60):
-        '''
-        Gutenberg's ODPS feed is poorly implmented and has a number of issues
-        which require very special handling to fix the results.
-
-        Issues:
-            * "Sort Alphabetically" and "Sort by Release Date" are returned
-              as book entries.
-            * The author is put into a "content" tag and not the author tag.
-            * The link to the book itself goes to an odps page which we need
-              to turn into a link to a web page.
-            * acquisition links are not part of the search result so we have
-              to go to the odps item itself. Detail item pages have a nasty
-              note saying:
-                  DON'T USE THIS PAGE FOR SCRAPING.
-                  Seriously. You'll only get your IP blocked.
-              We're using the ODPS feed because people are getting blocked with
-              the previous implementation so due to this using ODPS probably
-              won't solve this issue.
-            * Images are not links but base64 encoded strings. They are also not
-              real cover images but a little blue book thumbnail.
-        '''
         for result in search(query, max_results, timeout):
             yield result
 
+    def open(self, parent=None, detail_item=None, external=False):
+        url = detail_item or absurl('/')
+        if external:
+            open_url(url)
+            return
+        d = WebStoreDialog(self.gui, url, parent, detail_item)
+        d.setWindowTitle(self.name)
+        d.exec_()
+
 
 if __name__ == '__main__':
     import sys
+
     for result in search(' '.join(sys.argv[1:]), write_raw_to='/t/gutenberg.html'):
         print(result)
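For reviewers who want to exercise the rewritten scraper outside the GUI, a minimal smoke test along these lines should work. This is only a sketch: it assumes a calibre source checkout where html5_parser and css_selectors are importable, the script name is made up, and the simplest way to run it is via calibre-debug -e.

    # smoke_test_gutenberg.py -- hypothetical helper, not part of this patch.
    # Run from a calibre checkout with: calibre-debug -e smoke_test_gutenberg.py
    from calibre.gui2.store.stores.gutenberg_plugin import search

    # Query the live site through the new HTML scraper and print what it finds.
    for result in search('dickens', max_results=3):
        print(result.title, '|', result.author)
        print('  detail page:', result.detail_item)
        print('  formats:', result.formats)

The module's own __main__ block does the same thing and additionally writes the raw search page to /t/gutenberg.html for inspection.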