From f5665069189ad58db96610b6ff8fe1cf051348a3 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Wed, 10 Apr 2013 22:17:36 -0400
Subject: [PATCH] Store: Change Gutenberg plugin to use constant user agent
 string. Change plugin to use OPDS feed.

---
 .../gui2/store/stores/gutenberg_plugin.py     | 133 ++++++++++--------
 1 file changed, 73 insertions(+), 60 deletions(-)

diff --git a/src/calibre/gui2/store/stores/gutenberg_plugin.py b/src/calibre/gui2/store/stores/gutenberg_plugin.py
index b057cfe50f..99d404c74c 100644
--- a/src/calibre/gui2/store/stores/gutenberg_plugin.py
+++ b/src/calibre/gui2/store/stores/gutenberg_plugin.py
@@ -1,91 +1,104 @@
 # -*- coding: utf-8 -*-
 
 from __future__ import (unicode_literals, division, absolute_import, print_function)
-store_version = 2 # Needed for dynamic plugin loading
+store_version = 3 # Needed for dynamic plugin loading
 
 __license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+import base64
 import mimetypes
+import re
 import urllib
 from contextlib import closing
 
-from lxml import html
+from lxml import etree
 
-from PyQt4.Qt import QUrl
-
-from calibre import browser, random_user_agent, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
+from calibre import browser, url_slash_cleaner
+from calibre.constants import __version__
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
 
-class GutenbergStore(BasicStoreConfig, StorePlugin):
+class GutenbergStore(BasicStoreConfig, OpenSearchOPDSStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://gutenberg.org/'
-
-        if detail_item:
-            detail_item = url_slash_cleaner(url + detail_item)
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(detail_item if detail_item else url))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_item)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://www.gutenberg.org/catalog/osd-books.xml'
+    web_url = 'http://m.gutenberg.org/'
 
     def search(self, query, max_results=10, timeout=60):
-        url = 'http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all&sort_order=title&query=' + urllib.quote_plus(query)
+        '''
+        Gutenberg's OPDS feed is poorly implemented and has a number of issues
+        which require very special handling to fix the results.
 
-        br = browser(user_agent=random_user_agent())
+        Issues:
+          * "Sort Alphabetically" and "Sort by Release Date" are returned
+            as book entries.
+          * The author is put into a "content" tag and not the author tag.
+          * The link to the book itself goes to an OPDS page which we need
+            to turn into a link to a web page.
+          * acquisition links are not part of the search result so we have
+            to go to the OPDS item itself. Detail item pages have a nasty
+            note saying:
+              DON'T USE THIS PAGE FOR SCRAPING.
+              Seriously. You'll only get your IP blocked.
+            We're using the OPDS feed because people are getting blocked with
+            the previous implementation, but given this note, using OPDS
+            probably won't solve that issue.
+          * Images are not links but base64 encoded strings. They are also not
+            real cover images but a little blue book thumbnail.
+        '''
+
+        url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)
 
         counter = max_results
+        br = browser(user_agent='calibre/'+__version__)
         with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//ol[@class="results"]/li[@class="booklink"]'):
+            doc = etree.fromstring(f.read())
+            for data in doc.xpath('//*[local-name() = "entry"]'):
                 if counter <= 0:
                     break
 
-                id = ''.join(data.xpath('./a/@href'))
-                id = id.split('.mobile')[0]
-
-                title = ''.join(data.xpath('.//span[@class="title"]/text()'))
-                author = ''.join(data.xpath('.//span[@class="subtitle"]/text()'))
-
                 counter -= 1
 
                 s = SearchResult()
-                s.cover_url = ''
 
-                s.detail_item = id.strip()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = '$0.00'
-                s.drm = SearchResult.DRM_UNLOCKED
+                # We could use the <link rel="alternate" type="text/html" ...> tag from the
+                # detail OPDS page but this is easier.
+                id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
+                s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (self.web_url, re.sub('[^\d]', '', id)))
+                if not s.detail_item:
+                    continue
+
+                s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
+                s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
+                if not s.title or not s.author:
+                    continue
+
+                # Get the formats and direct download links.
+                with closing(br.open(id, timeout=timeout/4)) as nf:
+                    ndoc = etree.fromstring(nf.read())
+                    for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
+                        type = link.get('type')
+                        href = link.get('href')
+                        if type:
+                            ext = mimetypes.guess_extension(type)
+                            if ext:
+                                ext = ext[1:].upper().strip()
+                                s.downloads[ext] = href
+
+                s.formats = ', '.join(s.downloads.keys())
+                if not s.formats:
+                    continue
+
+                for link in data.xpath('./*[local-name() = "link"]'):
+                    rel = link.get('rel')
+                    href = link.get('href')
+                    type = link.get('type')
+
+                    if rel and href and type:
+                        if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
+                            if href.startswith('data:image/png;base64,'):
+                                s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))
 
                 yield s
-
-    def get_details(self, search_result, timeout):
-        url = url_slash_cleaner('http://m.gutenberg.org/' + search_result.detail_item)
-
-        br = browser(user_agent=random_user_agent())
-        with closing(br.open(url, timeout=timeout)) as nf:
-            doc = html.fromstring(nf.read())
-
-            for save_item in doc.xpath('//li[contains(@class, "icon_save")]/a'):
-                type = save_item.get('type')
-                href = save_item.get('href')
-
-                if type:
-                    ext = mimetypes.guess_extension(type)
-                    if ext:
-                        ext = ext[1:].upper().strip()
-                        search_result.downloads[ext] = href
-
-                search_result.formats = ', '.join(search_result.downloads.keys())
-
-        return True