From 3e0797872c0eaa08f2a4f93927e16be87aa834ae Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 3 Jul 2011 10:59:54 -0400 Subject: [PATCH] Store: Manybooks uses opds feed (faster, more accurate, fixes covers not showing in many cases, fix formats list). Opensearch: support creating search urls from Stanza catalogs. Store: opensearch based classes don't need to quote the search terms as the opensearch module does this already. --- src/calibre/gui2/store/opensearch_store.py | 7 +- .../gui2/store/stores/manybooks_plugin.py | 144 ++++++++++-------- src/calibre/utils/opensearch/__init__.py | 2 +- src/calibre/utils/opensearch/description.py | 19 ++- 4 files changed, 96 insertions(+), 76 deletions(-) diff --git a/src/calibre/gui2/store/opensearch_store.py b/src/calibre/gui2/store/opensearch_store.py index 6e8f5de7ba..bcc92b25f1 100644 --- a/src/calibre/gui2/store/opensearch_store.py +++ b/src/calibre/gui2/store/opensearch_store.py @@ -7,7 +7,6 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' import mimetypes -import urllib from contextlib import closing from lxml import etree @@ -50,7 +49,7 @@ class OpenSearchOPDSStore(StorePlugin): oquery = Query(url_template) # set up initial values - oquery.searchTerms = urllib.quote_plus(query) + oquery.searchTerms = query oquery.count = max_results url = oquery.url() @@ -99,7 +98,3 @@ class OpenSearchOPDSStore(StorePlugin): yield s - -class OpenSearchOPDSDetailStore(OpenSearchOPDSStore): - - pass diff --git a/src/calibre/gui2/store/stores/manybooks_plugin.py b/src/calibre/gui2/store/stores/manybooks_plugin.py index 829a97012f..c7dbf0a608 100644 --- a/src/calibre/gui2/store/stores/manybooks_plugin.py +++ b/src/calibre/gui2/store/stores/manybooks_plugin.py @@ -6,89 +6,101 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re -import urllib +import mimetypes from contextlib import closing -from lxml import html +from lxml import etree -from PyQt4.Qt import QUrl - -from calibre import browser, url_slash_cleaner -from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre import browser from calibre.gui2.store.basic_config import BasicStoreConfig +from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore from calibre.gui2.store.search_result import SearchResult -from calibre.gui2.store.web_store_dialog import WebStoreDialog +from calibre.utils.opensearch.description import Description +from calibre.utils.opensearch.query import Query -class ManyBooksStore(BasicStoreConfig, StorePlugin): +class ManyBooksStore(BasicStoreConfig, OpenSearchOPDSStore): - def open(self, parent=None, detail_item=None, external=False): - url = 'http://manybooks.net/' - - detail_url = None - if detail_item: - detail_url = url + detail_item - - if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) - else: - d = WebStoreDialog(self.gui, url, parent, detail_url) - d.setWindowTitle(self.name) - d.set_tags(self.config.get('tags', '')) - d.exec_() + open_search_url = 'http://www.manybooks.net/opds/' + web_url = 'http://manybooks.net' def search(self, query, max_results=10, timeout=60): - # ManyBooks website separates results for title and author. - # It also doesn't do a clear job of references authors and - # secondary titles. Google is also faster. - # Using a google search so we can search on both fields at once. - url = 'http://www.google.com/xhtml?q=site:manybooks.net+' + urllib.quote_plus(query) + ''' + Manybooks uses a very strange opds feed. The opds + main feed is structured like a stanza feed. The + search result entries give very little information + and requires you to go to a detail link. The detail + link has the wrong type specified (text/html instead + of application/atom+xml). + ''' + if not hasattr(self, 'open_search_url'): + return - br = browser() + description = Description(self.open_search_url) + url_template = description.get_best_template() + if not url_template: + return + oquery = Query(url_template) + # set up initial values + oquery.searchTerms = query + oquery.count = max_results + url = oquery.url() + counter = max_results + br = browser() with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@class="edewpi"]//div[@class="r ld"]'): + doc = etree.fromstring(f.read()) + for data in doc.xpath('//*[local-name() = "entry"]'): if counter <= 0: break - - url = '' - url_a = data.xpath('div[@class="jd"]/a') - if url_a: - url_a = url_a[0] - url = url_a.get('href', None) - if url: - url = url.split('u=')[-1][:-2] - if '/titles/' not in url: - continue - id = url.split('/')[-1] - id = id.strip() - - url_a = html.fromstring(html.tostring(url_a)) - heading = ''.join(url_a.xpath('//text()')) - title, _, author = heading.rpartition('by ') - author = author.split('-')[0] - price = '$0.00' - - cover_url = '' - mo = re.match('^\D+', id) - if mo: - cover_name = mo.group() - cover_name = cover_name.replace('etext', '') - cover_id = id.split('.')[0] - cover_url = 'http://www.manybooks.net/images/' + id[0] + '/' + cover_name + '/' + cover_id + '-thumb.jpg' - + counter -= 1 - + s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = '/titles/' + id + + detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]') + if not detail_links: + continue + detail_link = detail_links[0] + detail_href = detail_link.get('href') + if not detail_href: + continue + + s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html' + # These can have HTML inside of them. We are going to get them again later + # just in case. + s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip() + s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip() + + # Follow the detail link to get the rest of the info. + with closing(br.open(detail_href, timeout=timeout/4)) as df: + ddoc = etree.fromstring(df.read()) + ddata = ddoc.xpath('//*[local-name() = "entry"][1]') + if ddata: + ddata = ddata[0] + + # This is the real title and author info we want. We got + # it previously just in case it's not specified here for some reason. + s.title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip() + s.author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip() + if s.author.startswith(','): + s.author = s.author[1:] + if s.author.endswith(','): + s.author = s.author[:-1] + + s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip() + + for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'): + type = link.get('type') + href = link.get('href') + if type: + ext = mimetypes.guess_extension(type) + if ext: + ext = ext[1:].upper().strip() + s.downloads[ext] = href + + s.price = '$0.00' s.drm = SearchResult.DRM_UNLOCKED - s.formts = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR' + s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR' yield s diff --git a/src/calibre/utils/opensearch/__init__.py b/src/calibre/utils/opensearch/__init__.py index 3d0c4d8787..62bd0e0236 100644 --- a/src/calibre/utils/opensearch/__init__.py +++ b/src/calibre/utils/opensearch/__init__.py @@ -28,7 +28,7 @@ if not url_template: query = Query(url_template) # set up initial values. -query.searchTerms = urllib.quote_plus(search_terms) +query.searchTerms = search_terms # Note the count is ignored by some feeds. query.count = max_results diff --git a/src/calibre/utils/opensearch/description.py b/src/calibre/utils/opensearch/description.py index 0b5afd8a7e..d5922d0c2b 100644 --- a/src/calibre/utils/opensearch/description.py +++ b/src/calibre/utils/opensearch/description.py @@ -40,7 +40,7 @@ class Description(object): with closing(br.open(url, timeout=15)) as f: doc = etree.fromstring(f.read()) - # version 1.1 has repeating Url elements + # version 1.1 has repeating Url elements. self.urls = [] for element in doc.xpath('//*[local-name() = "Url"]'): template = element.get('template') @@ -50,9 +50,22 @@ class Description(object): url.template = template url.type = type self.urls.append(url) + # Stanza catalogs. + for element in doc.xpath('//*[local-name() = "link"]'): + if element.get('rel') != 'search': + continue + href = element.get('href') + type = element.get('type') + if href and type: + url = URL() + url.template = href + url.type = type + self.urls.append(url) - # this is version 1.0 specific - self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()')) + # this is version 1.0 specific. + self.url = '' + if not self.urls: + self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()')) self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()')) self.shortname = ''.join(doc.xpath('//*[local-name() = "ShortName"][1]//text()'))