Store: Manybooks now uses the OPDS feed (faster, more accurate, fixes covers not showing in many cases, and fixes the formats list). Opensearch: support creating search URLs from Stanza catalogs. Store: opensearch-based classes no longer need to quote the search terms, as the opensearch module does this already.
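For illustration, a minimal sketch of the double-quoting bug the last item fixes, using the same urllib.quote_plus call the diffs below remove; the query string is invented. Quoting terms that the opensearch module will quote again escapes the first pass's '+' separators to '%2B':

    import urllib

    query = 'war and peace'
    once = urllib.quote_plus(query)   # 'war+and+peace' -- what the feed expects
    twice = urllib.quote_plus(once)   # 'war%2Band%2Bpeace' -- the old, double-quoted form
    print(once)
    print(twice)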

John Schember 2011-07-03 10:59:54 -04:00
parent 58ca9bc7d0
commit 3e0797872c
4 changed files with 96 additions and 76 deletions

View File

@@ -7,7 +7,6 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
 import mimetypes
-import urllib
 
 from contextlib import closing
 
 from lxml import etree
@@ -50,7 +49,7 @@ class OpenSearchOPDSStore(StorePlugin):
         oquery = Query(url_template)
 
         # set up initial values
-        oquery.searchTerms = urllib.quote_plus(query)
+        oquery.searchTerms = query
         oquery.count = max_results
 
         url = oquery.url()
@@ -99,7 +98,3 @@ class OpenSearchOPDSStore(StorePlugin):
 
                 yield s
-
-
-class OpenSearchOPDSDetailStore(OpenSearchOPDSStore):
-    pass
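In other words, callers now hand Query the raw terms; per the commit message, the opensearch module does the quoting itself when the template is expanded. A minimal sketch of the corrected flow, using only the Query attributes visible in this hunk (the helper name is invented):

    from calibre.utils.opensearch.query import Query

    def build_search_url(url_template, query, max_results=10):
        # Raw, unquoted terms; Query is expected to quote them once when
        # filling the {searchTerms} placeholder in the template.
        oquery = Query(url_template)
        oquery.searchTerms = query
        oquery.count = max_results
        return oquery.url()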

View File

@@ -6,89 +6,101 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import re
-import urllib
+import mimetypes
 
 from contextlib import closing
 
-from lxml import html
+from lxml import etree
 
-from PyQt4.Qt import QUrl
-
-from calibre import browser, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
+from calibre import browser
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
+from calibre.utils.opensearch.description import Description
+from calibre.utils.opensearch.query import Query
 
-class ManyBooksStore(BasicStoreConfig, StorePlugin):
+class ManyBooksStore(BasicStoreConfig, OpenSearchOPDSStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://manybooks.net/'
-
-        detail_url = None
-        if detail_item:
-            detail_url = url + detail_item
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url)))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_url)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://www.manybooks.net/opds/'
+    web_url = 'http://manybooks.net'
 
     def search(self, query, max_results=10, timeout=60):
-        # ManyBooks website separates results for title and author.
-        # It also doesn't do a clear job of references authors and
-        # secondary titles. Google is also faster.
-        # Using a google search so we can search on both fields at once.
-        url = 'http://www.google.com/xhtml?q=site:manybooks.net+' + urllib.quote_plus(query)
-
-        br = browser()
+        '''
+        Manybooks uses a very strange opds feed. The opds
+        main feed is structured like a stanza feed. The
+        search result entries give very little information
+        and requires you to go to a detail link. The detail
+        link has the wrong type specified (text/html instead
+        of application/atom+xml).
+        '''
+        if not hasattr(self, 'open_search_url'):
+            return
+
+        description = Description(self.open_search_url)
+        url_template = description.get_best_template()
+        if not url_template:
+            return
+
+        oquery = Query(url_template)
+
+        # set up initial values
+        oquery.searchTerms = query
+        oquery.count = max_results
+        url = oquery.url()
 
         counter = max_results
+        br = browser()
         with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//div[@class="edewpi"]//div[@class="r ld"]'):
+            doc = etree.fromstring(f.read())
+            for data in doc.xpath('//*[local-name() = "entry"]'):
                 if counter <= 0:
                     break
 
-                url = ''
-                url_a = data.xpath('div[@class="jd"]/a')
-                if url_a:
-                    url_a = url_a[0]
-                    url = url_a.get('href', None)
-                if url:
-                    url = url.split('u=')[-1][:-2]
-                if '/titles/' not in url:
-                    continue
-                id = url.split('/')[-1]
-                id = id.strip()
-
-                url_a = html.fromstring(html.tostring(url_a))
-                heading = ''.join(url_a.xpath('//text()'))
-                title, _, author = heading.rpartition('by ')
-                author = author.split('-')[0]
-                price = '$0.00'
-
-                cover_url = ''
-                mo = re.match('^\D+', id)
-                if mo:
-                    cover_name = mo.group()
-                    cover_name = cover_name.replace('etext', '')
-                    cover_id = id.split('.')[0]
-                    cover_url = 'http://www.manybooks.net/images/' + id[0] + '/' + cover_name + '/' + cover_id + '-thumb.jpg'
-
                 counter -= 1
 
                 s = SearchResult()
-                s.cover_url = cover_url
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = '/titles/' + id
+
+                detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]')
+                if not detail_links:
+                    continue
+                detail_link = detail_links[0]
+                detail_href = detail_link.get('href')
+                if not detail_href:
+                    continue
+                s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html'
+
+                # These can have HTML inside of them. We are going to get them again later
+                # just in case.
+                s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
+                s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip()
+
+                # Follow the detail link to get the rest of the info.
+                with closing(br.open(detail_href, timeout=timeout/4)) as df:
+                    ddoc = etree.fromstring(df.read())
+                    ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
+                    if ddata:
+                        ddata = ddata[0]
+
+                        # This is the real title and author info we want. We got
+                        # it previously just in case it's not specified here for some reason.
+                        s.title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip()
+                        s.author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip()
+                        if s.author.startswith(','):
+                            s.author = s.author[1:]
+                        if s.author.endswith(','):
+                            s.author = s.author[:-1]
+
+                        s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip()
+
+                        for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
+                            type = link.get('type')
+                            href = link.get('href')
+                            if type:
+                                ext = mimetypes.guess_extension(type)
+                                if ext:
+                                    ext = ext[1:].upper().strip()
+                                    s.downloads[ext] = href
+
+                s.price = '$0.00'
                 s.drm = SearchResult.DRM_UNLOCKED
-                s.formts = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
+                s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
 
                 yield s
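The local-name() XPath expressions above are what make the parsing namespace-agnostic: they match Atom elements without binding the feed's namespace prefix. A standalone sketch of the same technique against an invented sample entry (the title, href, and tid value are made up):

    from lxml import etree

    feed = '''<feed xmlns="http://www.w3.org/2005/Atom">
      <entry>
        <title>Pride and Prejudice</title>
        <link type="text/html" href="http://manybooks.net/titles/?tid=austenjetext94pandp10"/>
      </entry>
    </feed>'''

    doc = etree.fromstring(feed)
    for entry in doc.xpath('//*[local-name() = "entry"]'):
        title = ''.join(entry.xpath('./*[local-name() = "title"]//text()')).strip()
        links = entry.xpath('./*[local-name() = "link" and @type = "text/html"]')
        if links:
            print(title + ' -> ' + links[0].get('href'))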

View File

@@ -28,7 +28,7 @@ if not url_template:
         query = Query(url_template)
 
         # set up initial values.
-        query.searchTerms = urllib.quote_plus(search_terms)
+        query.searchTerms = search_terms
         # Note the count is ignored by some feeds.
         query.count = max_results

View File

@@ -40,7 +40,7 @@ class Description(object):
         with closing(br.open(url, timeout=15)) as f:
             doc = etree.fromstring(f.read())
 
-        # version 1.1 has repeating Url elements
+        # version 1.1 has repeating Url elements.
         self.urls = []
         for element in doc.xpath('//*[local-name() = "Url"]'):
             template = element.get('template')
@@ -50,8 +50,21 @@ class Description(object):
                 url.template = template
                 url.type = type
                 self.urls.append(url)
 
+        # Stanza catalogs.
+        for element in doc.xpath('//*[local-name() = "link"]'):
+            if element.get('rel') != 'search':
+                continue
+            href = element.get('href')
+            type = element.get('type')
+            if href and type:
+                url = URL()
+                url.template = href
+                url.type = type
+                self.urls.append(url)
+
-        # this is version 1.0 specific
-        self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
+        # this is version 1.0 specific.
+        self.url = ''
+        if not self.urls:
+            self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
         self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()'))
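A Stanza catalog advertises its search endpoint with an Atom <link rel="search"> element rather than an OpenSearch <Url> element, which is what the new loop picks up. A standalone sketch of that fallback, mirroring the loop above against an invented catalog document (the example.com URL is made up):

    from lxml import etree

    stanza = '''<feed xmlns="http://www.w3.org/2005/Atom">
      <link rel="search" type="application/atom+xml"
            href="http://example.com/opds/search?q={searchTerms}"/>
    </feed>'''

    doc = etree.fromstring(stanza)
    for element in doc.xpath('//*[local-name() = "link"]'):
        if element.get('rel') != 'search':
            continue
        href = element.get('href')
        type = element.get('type')
        if href and type:
            # this href/type pair would become a URL() entry in self.urls
            print(type + ' ' + href)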