From f5665069189ad58db96610b6ff8fe1cf051348a3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 10 Apr 2013 22:17:36 -0400 Subject: [PATCH] Store: Change Gutenberg plugin to use constant user agent string. Change plugin to use ODPS feed. --- .../gui2/store/stores/gutenberg_plugin.py | 133 ++++++++++-------- 1 file changed, 73 insertions(+), 60 deletions(-) diff --git a/src/calibre/gui2/store/stores/gutenberg_plugin.py b/src/calibre/gui2/store/stores/gutenberg_plugin.py index b057cfe50f..99d404c74c 100644 --- a/src/calibre/gui2/store/stores/gutenberg_plugin.py +++ b/src/calibre/gui2/store/stores/gutenberg_plugin.py @@ -1,91 +1,104 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 2 # Needed for dynamic plugin loading +store_version = 3 # Needed for dynamic plugin loading __license__ = 'GPL 3' -__copyright__ = '2011, John Schember ' +__copyright__ = '2011, 2013, John Schember ' __docformat__ = 'restructuredtext en' +import base64 import mimetypes +import re import urllib from contextlib import closing -from lxml import html +from lxml import etree -from PyQt4.Qt import QUrl - -from calibre import browser, random_user_agent, url_slash_cleaner -from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre import browser, url_slash_cleaner +from calibre.constants import __version__ from calibre.gui2.store.basic_config import BasicStoreConfig +from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore from calibre.gui2.store.search_result import SearchResult -from calibre.gui2.store.web_store_dialog import WebStoreDialog -class GutenbergStore(BasicStoreConfig, StorePlugin): +class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore): - def open(self, parent=None, detail_item=None, external=False): - url = 'http://gutenberg.org/' - - if detail_item: - detail_item = url_slash_cleaner(url + detail_item) - - if external or 
self.config.get('open_external', False): - open_url(QUrl(detail_item if detail_item else url)) - else: - d = WebStoreDialog(self.gui, url, parent, detail_item) - d.setWindowTitle(self.name) - d.set_tags(self.config.get('tags', '')) - d.exec_() + open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml' + web_url = 'http://m.gutenberg.org/' def search(self, query, max_results=10, timeout=60): - url = 'http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all&sort_order=title&query=' + urllib.quote_plus(query) + ''' + Gutenberg's OPDS feed is poorly implemented and has a number of issues + which require very special handling to fix the results. - br = browser(user_agent=random_user_agent()) + Issues: + * "Sort Alphabetically" and "Sort by Release Date" are returned + as book entries. + * The author is put into a "content" tag and not the author tag. + * The link to the book itself goes to an OPDS page which we need + to turn into a link to a web page. + * acquisition links are not part of the search result so we have + to go to the OPDS item itself. Detail item pages have a nasty + note saying: + DON'T USE THIS PAGE FOR SCRAPING. + Seriously. You'll only get your IP blocked. + We're using the OPDS feed because people are getting blocked with + the previous implementation so due to this using OPDS probably + won't solve this issue. + * Images are not links but base64 encoded strings. They are also not + real cover images but a little blue book thumbnail. 
+ ''' + + url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query) counter = max_results + br = browser(user_agent='calibre/'+__version__) with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//ol[@class="results"]/li[@class="booklink"]'): + doc = etree.fromstring(f.read()) + for data in doc.xpath('//*[local-name() = "entry"]'): if counter <= 0: break - id = ''.join(data.xpath('./a/@href')) - id = id.split('.mobile')[0] - - title = ''.join(data.xpath('.//span[@class="title"]/text()')) - author = ''.join(data.xpath('.//span[@class="subtitle"]/text()')) - counter -= 1 s = SearchResult() - s.cover_url = '' - s.detail_item = id.strip() - s.title = title.strip() - s.author = author.strip() - s.price = '$0.00' - s.drm = SearchResult.DRM_UNLOCKED + # We could use the tag from the + # detail OPDS page but this is easier. + id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip() + s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id))) + if not s.detail_item: + continue + + s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip() + s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip() + if not s.title or not s.author: + continue + + # Get the formats and direct download links. 
+ with closing(br.open(id, timeout=timeout/4)) as nf: + ndoc = etree.fromstring(nf.read()) + for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'): + type = link.get('type') + href = link.get('href') + if type: + ext = mimetypes.guess_extension(type) + if ext: + ext = ext[1:].upper().strip() + s.downloads[ext] = href + + s.formats = ', '.join(s.downloads.keys()) + if not s.formats: + continue + + for link in data.xpath('./*[local-name() = "link"]'): + rel = link.get('rel') + href = link.get('href') + type = link.get('type') + + if rel and href and type: + if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'): + if href.startswith('data:image/png;base64,'): + s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', '')) yield s - - def get_details(self, search_result, timeout): - url = url_slash_cleaner('http://m.gutenberg.org/' + search_result.detail_item) - - br = browser(user_agent=random_user_agent()) - with closing(br.open(url, timeout=timeout)) as nf: - doc = html.fromstring(nf.read()) - - for save_item in doc.xpath('//li[contains(@class, "icon_save")]/a'): - type = save_item.get('type') - href = save_item.get('href') - - if type: - ext = mimetypes.guess_extension(type) - if ext: - ext = ext[1:].upper().strip() - search_result.downloads[ext] = href - - search_result.formats = ', '.join(search_result.downloads.keys()) - - return True