From 3e0797872c0eaa08f2a4f93927e16be87aa834ae Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 3 Jul 2011 10:59:54 -0400
Subject: [PATCH] Store: Manybooks uses opds feed (faster, more accurate, fixes
 covers not showing in many cases, fix formats list). Opensearch: support
 creating search urls from Stanza catalogs. Store: opensearch based classes
 don't need to quote the search terms as the opensearch module does this
 already.

---
 src/calibre/gui2/store/opensearch_store.py    |   7 +-
 .../gui2/store/stores/manybooks_plugin.py     | 144 ++++++++++--------
 src/calibre/utils/opensearch/__init__.py      |   2 +-
 src/calibre/utils/opensearch/description.py   |  19 ++-
 4 files changed, 96 insertions(+), 76 deletions(-)

diff --git a/src/calibre/gui2/store/opensearch_store.py b/src/calibre/gui2/store/opensearch_store.py
index 6e8f5de7ba..bcc92b25f1 100644
--- a/src/calibre/gui2/store/opensearch_store.py
+++ b/src/calibre/gui2/store/opensearch_store.py
@@ -7,7 +7,6 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
 import mimetypes
-import urllib
 from contextlib import closing
 
 from lxml import etree
@@ -50,7 +49,7 @@ class OpenSearchOPDSStore(StorePlugin):
         oquery = Query(url_template)
 
         # set up initial values
-        oquery.searchTerms = urllib.quote_plus(query)
+        oquery.searchTerms = query
         oquery.count = max_results
         url = oquery.url()
         
@@ -99,7 +98,3 @@ class OpenSearchOPDSStore(StorePlugin):
                 
 
                 yield s
-
-class OpenSearchOPDSDetailStore(OpenSearchOPDSStore):
-    
-    pass
diff --git a/src/calibre/gui2/store/stores/manybooks_plugin.py b/src/calibre/gui2/store/stores/manybooks_plugin.py
index 829a97012f..c7dbf0a608 100644
--- a/src/calibre/gui2/store/stores/manybooks_plugin.py
+++ b/src/calibre/gui2/store/stores/manybooks_plugin.py
@@ -6,89 +6,101 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import re
-import urllib
+import mimetypes
 from contextlib import closing
 
-from lxml import html
+from lxml import etree
 
-from PyQt4.Qt import QUrl
-
-from calibre import browser, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
+from calibre import browser
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
+from calibre.utils.opensearch.description import Description
+from calibre.utils.opensearch.query import Query
 
-class ManyBooksStore(BasicStoreConfig, StorePlugin):
+class ManyBooksStore(BasicStoreConfig, OpenSearchOPDSStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://manybooks.net/'
-
-        detail_url = None
-        if detail_item:
-            detail_url = url + detail_item
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url)))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_url)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://www.manybooks.net/opds/'
+    web_url = 'http://manybooks.net'
 
     def search(self, query, max_results=10, timeout=60):
-        # ManyBooks website separates results for title and author.
-        # It also doesn't do a clear job of references authors and
-        # secondary titles. Google is also faster.
-        # Using a google search so we can search on both fields at once.
-        url = 'http://www.google.com/xhtml?q=site:manybooks.net+' + urllib.quote_plus(query)
+        '''
+        Manybooks uses a very strange opds feed. The opds
+        main feed is structured like a stanza feed. The 
+        search result entries give very little information
+        and requires you to go to a detail link. The detail
+        link has the wrong type specified (text/html instead
+        of application/atom+xml).
+        '''
+        if not hasattr(self, 'open_search_url'):
+            return
 
-        br = browser()
+        description = Description(self.open_search_url)
+        url_template = description.get_best_template()
+        if not url_template:
+            return
+        oquery = Query(url_template)
 
+        # set up initial values
+        oquery.searchTerms = query
+        oquery.count = max_results
+        url = oquery.url()
+        
         counter = max_results
+        br = browser()
         with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//div[@class="edewpi"]//div[@class="r ld"]'):
+            doc = etree.fromstring(f.read())
+            for data in doc.xpath('//*[local-name() = "entry"]'):
                 if counter <= 0:
                     break
-
-                url = ''
-                url_a = data.xpath('div[@class="jd"]/a')
-                if url_a:
-                    url_a = url_a[0]
-                    url = url_a.get('href', None)
-                if url:
-                    url = url.split('u=')[-1][:-2]
-                if '/titles/' not in url:
-                    continue
-                id = url.split('/')[-1]
-                id = id.strip()
-
-                url_a = html.fromstring(html.tostring(url_a))
-                heading = ''.join(url_a.xpath('//text()'))
-                title, _, author = heading.rpartition('by ')
-                author = author.split('-')[0]
-                price = '$0.00'
-
-                cover_url = ''
-                mo = re.match('^\D+', id)
-                if mo:
-                    cover_name = mo.group()
-                    cover_name = cover_name.replace('etext', '')
-                    cover_id = id.split('.')[0]
-                    cover_url = 'http://www.manybooks.net/images/' + id[0] + '/' + cover_name + '/' + cover_id + '-thumb.jpg'
-
+            
                 counter -= 1
-
+    
                 s = SearchResult()
-                s.cover_url = cover_url
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = '/titles/' + id
+                
+                detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]')
+                if not detail_links:
+                    continue
+                detail_link = detail_links[0]
+                detail_href = detail_link.get('href')
+                if not detail_href:
+                    continue
+
+                s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html'
+                # These can have HTML inside of them. We are going to get them again later
+                # just in case.
+                s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
+                s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip()
+                
+                # Follow the detail link to get the rest of the info.
+                with closing(br.open(detail_href, timeout=timeout/4)) as df:
+                    ddoc = etree.fromstring(df.read())
+                    ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
+                    if ddata:
+                        ddata = ddata[0]
+
+                        # This is the real title and author info we want. We got
+                        # it previously just in case it's not specified here for some reason.
+                        s.title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip()
+                        s.author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip()
+                        if s.author.startswith(','):
+                            s.author = s.author[1:]
+                        if s.author.endswith(','):
+                            s.author = s.author[:-1]
+                        
+                        s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip()
+                        
+                        for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
+                            type = link.get('type')
+                            href = link.get('href')
+                            if type:
+                                ext = mimetypes.guess_extension(type)
+                                if ext:
+                                    ext = ext[1:].upper().strip()
+                                    s.downloads[ext] = href
+
+                s.price = '$0.00'
                 s.drm = SearchResult.DRM_UNLOCKED
-                s.formts = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
+                s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
 
                 yield s
diff --git a/src/calibre/utils/opensearch/__init__.py b/src/calibre/utils/opensearch/__init__.py
index 3d0c4d8787..62bd0e0236 100644
--- a/src/calibre/utils/opensearch/__init__.py
+++ b/src/calibre/utils/opensearch/__init__.py
@@ -28,7 +28,7 @@ if not url_template:
 query = Query(url_template)
 
 # set up initial values.
-query.searchTerms = urllib.quote_plus(search_terms)
+query.searchTerms = search_terms
 # Note the count is ignored by some feeds.
 query.count = max_results
 
diff --git a/src/calibre/utils/opensearch/description.py b/src/calibre/utils/opensearch/description.py
index 0b5afd8a7e..d5922d0c2b 100644
--- a/src/calibre/utils/opensearch/description.py
+++ b/src/calibre/utils/opensearch/description.py
@@ -40,7 +40,7 @@ class Description(object):
         with closing(br.open(url, timeout=15)) as f:
             doc = etree.fromstring(f.read())
         
-        # version 1.1 has repeating Url elements
+        # version 1.1 has repeating Url elements.
         self.urls = []
         for element in doc.xpath('//*[local-name() = "Url"]'):
             template = element.get('template')
@@ -50,9 +50,22 @@ class Description(object):
                 url.template = template
                 url.type = type
                 self.urls.append(url)
+        # Stanza catalogs.
+        for element in doc.xpath('//*[local-name() = "link"]'):
+            if element.get('rel') != 'search':
+                continue
+            href = element.get('href')
+            type = element.get('type')
+            if href and type:
+                url = URL()
+                url.template = href
+                url.type = type
+                self.urls.append(url)
 
-        # this is version 1.0 specific
-        self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
+        # this is version 1.0 specific.
+        self.url = ''
+        if not self.urls:
+            self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
         self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()'))
 
         self.shortname = ''.join(doc.xpath('//*[local-name() = "ShortName"][1]//text()'))