Store: OpenSearch-based store base class. OpenSearch module added. Make some OPDS stores use the new OpenSearchStore class.

John Schember 2011-06-26 10:32:17 -04:00
parent 4c6aa0364f
commit 8ae7d310e8
11 changed files with 3311 additions and 135 deletions

View File (ArchiveOrgStore plugin)

@@ -6,84 +6,35 @@
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import urllib
 from contextlib import closing
 
 from lxml import html
 
-from PyQt4.Qt import QUrl
-
-from calibre import browser, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
+from calibre import browser
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
 
-class ArchiveOrgStore(BasicStoreConfig, StorePlugin):
+class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://www.archive.org/details/texts'
-
-        if detail_item:
-            detail_item = url_slash_cleaner('http://www.archive.org' + detail_item)
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_item)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://bookserver.archive.org/catalog/opensearch.xml'
+    web_url = 'http://www.archive.org/details/texts'
+
+    # http://bookserver.archive.org/catalog/
 
     def search(self, query, max_results=10, timeout=60):
-        query = query + ' AND mediatype:texts'
-        url = 'http://www.archive.org/search.php?query=' + urllib.quote(query)
-
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//td[@class="hitCell"]'):
-                if counter <= 0:
-                    break
-
-                id = ''.join(data.xpath('.//a[@class="titleLink"]/@href'))
-                if not id:
-                    continue
-
-                title = ''.join(data.xpath('.//a[@class="titleLink"]//text()'))
-                authors = data.xpath('.//text()')
-                if not authors:
-                    continue
-                author = None
-                for a in authors:
-                    if '-' in a:
-                        author = a.replace('-', ' ').strip()
-                        if author:
-                            break
-                if not author:
-                    continue
-
-                counter -= 1
-
-                s = SearchResult()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = '$0.00'
-                s.detail_item = id.strip()
-                s.drm = SearchResult.DRM_UNLOCKED
-
-                yield s
+        for s in OpenSearchStore.search(self, query, max_results, timeout):
+            s.detail_item = 'http://www.archive.org/details/' + s.detail_item.split(':')[-1]
+            s.price = '$0.00'
+            s.drm = SearchResult.DRM_UNLOCKED
+            yield s
 
+    '''
     def get_details(self, search_result, timeout):
-        url = url_slash_cleaner('http://www.archive.org' + search_result.detail_item)
-
         br = browser()
-        with closing(br.open(url, timeout=timeout)) as nf:
+        with closing(br.open(search_result.detail_item, timeout=timeout)) as nf:
             idata = html.fromstring(nf.read())
             formats = ', '.join(idata.xpath('//p[@id="dl" and @class="content"]//a/text()'))
             search_result.formats = formats.upper()
         return True
+    '''
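Note: the rewritten search() above assumes the OPDS entry id from the Archive.org catalog is a colon-separated urn whose last component is the item identifier; the id value in this sketch is hypothetical, not taken from a live feed:

    # hypothetical urn-style id; real values come from bookserver.archive.org
    entry_id = 'urn:x-internet-archive:item:exampleitem00'
    detail_url = 'http://www.archive.org/details/' + entry_id.split(':')[-1]
    # -> 'http://www.archive.org/details/exampleitem00'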

View File (calibre/gui2/store/opensearch_store.py)

@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-

from __future__ import (unicode_literals, division, absolute_import, print_function)

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import mimetypes
import urllib

from PyQt4.Qt import QUrl

from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.opensearch import Client

class OpenSearchStore(StorePlugin):

    # subclasses must set both of these
    open_search_url = ''
    web_url = ''

    def open(self, parent=None, detail_item=None, external=False):
        if external or self.config.get('open_external', False):
            open_url(QUrl(detail_item if detail_item else self.web_url))
        else:
            d = WebStoreDialog(self.gui, self.web_url, parent, detail_item)
            d.setWindowTitle(self.name)
            d.set_tags(self.config.get('tags', ''))
            d.exec_()

    def search(self, query, max_results=10, timeout=60):
        if not self.open_search_url:
            return

        client = Client(self.open_search_url)
        results = client.search(urllib.quote_plus(query), max_results)

        counter = max_results
        for r in results:
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()

            s.detail_item = r.get('id', '')

            for l in r.get('links') or []:
                if l.get('rel', None):
                    if l['rel'] == u'http://opds-spec.org/image/thumbnail':
                        s.cover_url = l.get('href', '')
                    elif l['rel'] == u'http://opds-spec.org/acquisition/buy':
                        s.detail_item = l.get('href', s.detail_item)
                    elif l['rel'] == u'http://opds-spec.org/acquisition':
                        s.downloads.append((l.get('type', ''), l.get('href', '')))

            formats = []
            for mime, url in s.downloads:
                ext = mimetypes.guess_extension(mime)
                if ext:
                    formats.append(ext[1:])
            s.formats = ', '.join(formats)

            s.title = r.get('title', '')
            s.author = r.get('author', '')
            s.price = r.get('price', '')

            yield s
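Note: a minimal sketch of how a store plugin can now build on the new base class, mirroring the ArchiveOrgStore and PragmaticBookshelfStore conversions in this commit; the class name and both URLs are placeholders, not a real store:

    from calibre.gui2.store.basic_config import BasicStoreConfig
    from calibre.gui2.store.opensearch_store import OpenSearchStore
    from calibre.gui2.store.search_result import SearchResult

    class ExampleOPDSStore(BasicStoreConfig, OpenSearchStore):

        # placeholder URLs: point these at a real OpenSearch
        # description file and storefront
        open_search_url = 'http://books.example.com/opensearch.xml'
        web_url = 'http://books.example.com/'

        def search(self, query, max_results=10, timeout=60):
            # post-process the generic results with store-specific
            # knowledge, as the converted plugins do
            for s in OpenSearchStore.search(self, query, max_results, timeout):
                s.drm = SearchResult.DRM_UNLOCKED
                yield s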

View File (PragmaticBookshelfStore plugin)

@@ -6,79 +6,19 @@
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import urllib
-from contextlib import closing
-
-from lxml import html
-
-from PyQt4.Qt import QUrl
-
-from calibre import browser, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
 
-class PragmaticBookshelfStore(BasicStoreConfig, StorePlugin):
+class PragmaticBookshelfStore(BasicStoreConfig, OpenSearchStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://pragprog.com/'
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_item)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://pragprog.com/catalog/search-description'
+    web_url = 'http://pragprog.com/'
+
+    # http://pragprog.com/catalog.opds
 
     def search(self, query, max_results=10, timeout=60):
-        '''
-        OPDS based search.
-
-        We really should get the catelog from http://pragprog.com/catalog.opds
-        and look for the application/opensearchdescription+xml entry.
-        Then get the opensearch description to get the search url and
-        format. However, we are going to be lazy and hard code it.
-        '''
-        url = 'http://pragprog.com/catalog/search?q=' + urllib.quote_plus(query)
-
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            # Use html instead of etree as html allows us
-            # to ignore the namespace easily.
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//entry'):
-                if counter <= 0:
-                    break
-
-                id = ''.join(data.xpath('.//link[@rel="http://opds-spec.org/acquisition/buy"]/@href'))
-                if not id:
-                    continue
-
-                price = ''.join(data.xpath('.//price/@currencycode')).strip()
-                price += ' '
-                price += ''.join(data.xpath('.//price/text()')).strip()
-                if not price.strip():
-                    continue
-
-                cover_url = ''.join(data.xpath('.//link[@rel="http://opds-spec.org/cover"]/@href'))
-
-                title = ''.join(data.xpath('.//title/text()'))
-                author = ''.join(data.xpath('.//author//text()'))
-
-                counter -= 1
-
-                s = SearchResult()
-                s.cover_url = cover_url
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = id.strip()
-                s.drm = SearchResult.DRM_UNLOCKED
-                s.formats = 'EPUB, PDF, MOBI'
-
-                yield s
+        for s in OpenSearchStore.search(self, query, max_results, timeout):
+            s.drm = SearchResult.DRM_UNLOCKED
+            s.formats = 'EPUB, PDF, MOBI'
+            yield s

View File (calibre/gui2/store/search_result.py)

@@ -22,6 +22,7 @@ class SearchResult(object):
         self.detail_item = ''
         self.drm = None
         self.formats = ''
+        self.downloads = []
         self.affiliate = False
         self.plugin_author = ''
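Note: the new downloads list holds (MIME type, URL) pairs, appended by OpenSearchStore.search() above from OPDS acquisition links; a small illustration of how formats are then derived (all values here are made up):

    import mimetypes

    downloads = [('application/pdf', 'http://example.com/book.pdf'),
                 ('application/epub+zip', 'http://example.com/book.epub')]
    formats = []
    for mime, url in downloads:
        ext = mimetypes.guess_extension(mime)  # None for types the platform doesn't know
        if ext:
            formats.append(ext[1:])
    print ', '.join(formats)  # e.g. 'pdf' (epub may be unknown to stock mimetypes)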

View File (calibre/utils/opensearch/__init__.py)

@@ -0,0 +1,4 @@
from description import Description
from query import Query
from client import Client
from results import Results

View File (calibre/utils/opensearch/client.py)

@@ -0,0 +1,39 @@
from description import Description
from query import Query
from results import Results

class Client:

    """This is the class you'll probably want to be using. You simply
    pass the constructor the url for the service description file and
    issue a search, and you get back results as an iterable Results object.

    The neat thing about a Results object is that it seamlessly handles
    fetching more results from the opensearch server when it can, so you
    just need to iterate and let the paging be taken care of for you.

        from opensearch import Client
        client = Client(description_url)
        results = client.search("computer")
        for result in results:
            print result.title
    """

    def __init__(self, url, agent="python-opensearch <https://github.com/edsu/opensearch>"):
        self.agent = agent
        self.description = Description(url, self.agent)

    def search(self, search_terms, page_size=25):
        """Perform a search and get back a results object
        """
        url = self.description.get_best_template()
        query = Query(url)

        # set up initial values
        query.searchTerms = search_terms
        query.count = page_size

        # run the query and return an iterable Results object
        return Results(query, agent=self.agent)
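Note: a network-dependent sketch of Client usage, pointed at the Archive.org description URL that archive_org_plugin.py uses in this commit:

    from calibre.utils.opensearch import Client

    client = Client('http://bookserver.archive.org/catalog/opensearch.xml')
    results = client.search('dickens', page_size=10)
    count = 0
    for result in results:
        print result.title   # feedparser-style entries allow attribute access
        count += 1
        if count >= 10:      # cap it, since Results keeps paging on its own
            break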

View File (calibre/utils/opensearch/description.py)

@@ -0,0 +1,127 @@
from urllib2 import urlopen, Request
from xml.dom.minidom import parse
from url import URL

class Description:

    """A class for representing OpenSearch Description files.
    """

    def __init__(self, url="", agent=""):
        """The constructor which may pass an optional url to load from.

            d = Description("http://www.example.com/description")
        """
        self.agent = agent
        if url:
            self.load(url)

    def load(self, url):
        """For loading up a description object from a url. Normally
        you'll probably just want to pass a URL into the constructor.
        """
        req = Request(url, headers={'User-Agent':self.agent})
        self.dom = parse(urlopen(req))

        # version 1.1 has repeating Url elements
        self.urls = self._get_urls()

        # this is version 1.0 specific
        self.url = self._get_element_text('Url')
        self.format = self._get_element_text('Format')

        self.shortname = self._get_element_text('ShortName')
        self.longname = self._get_element_text('LongName')
        self.description = self._get_element_text('Description')
        self.image = self._get_element_text('Image')
        self.samplesearch = self._get_element_text('SampleSearch')
        self.developer = self._get_element_text('Developer')
        self.contact = self._get_element_text('Contact')
        self.attribution = self._get_element_text('Attribution')
        self.syndicationright = self._get_element_text('SyndicationRight')

        tag_text = self._get_element_text('Tags')
        if tag_text is not None:
            self.tags = tag_text.split(" ")
        else:
            self.tags = []

        if self._get_element_text('AdultContent') == 'true':
            self.adultcontent = True
        else:
            self.adultcontent = False

    def get_url_by_type(self, type):
        """Walks available urls and returns them by type. Only
        appropriate in opensearch v1.1 where there can be multiple
        query targets. Returns None if no such type is found.

            url = description.get_url_by_type('application/rss+xml')
        """
        for url in self.urls:
            if url.type == type:
                return url
        return None

    def get_best_template(self):
        """OK, best is a value judgement, but so be it. You'll get
        back either the atom, rss or first template available. This
        method handles the main difference between opensearch v1.0 and v1.1
        """
        # version 1.0
        if self.url:
            return self.url
        # atom
        if self.get_url_by_type('application/atom+xml'):
            return self.get_url_by_type('application/atom+xml').template
        # rss
        if self.get_url_by_type('application/rss+xml'):
            return self.get_url_by_type('application/rss+xml').template
        # other possible rss type
        if self.get_url_by_type('text/xml'):
            return self.get_url_by_type('text/xml').template
        # otherwise just the first one
        if len(self.urls) > 0:
            return self.urls[0].template
        # out of luck
        return None

    # these are internal methods for querying xml

    def _get_element_text(self, tag):
        elements = self._get_elements(tag)
        if not elements:
            return None
        return self._get_text(elements[0].childNodes)

    def _get_attribute_text(self, tag, attribute):
        elements = self._get_elements(tag)
        if not elements:
            return ''
        return elements[0].getAttribute(attribute)

    def _get_elements(self, tag):
        return self.dom.getElementsByTagName(tag)

    def _get_text(self, nodes):
        text = ''
        for node in nodes:
            if node.nodeType == node.TEXT_NODE:
                text += node.data
        return text.strip()

    def _get_urls(self):
        urls = []
        for element in self._get_elements('Url'):
            template = element.getAttribute('template')
            type = element.getAttribute('type')
            if template and type:
                url = URL()
                url.template = template
                url.type = type
                urls.append(url)
        return urls
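Note: a quick sketch of using Description directly (network required); the URL is the same description file the Archive.org plugin above points at:

    from calibre.utils.opensearch import Description

    d = Description('http://bookserver.archive.org/catalog/opensearch.xml')
    print d.shortname
    # prefers a v1.0 Url, then the atom template, then rss, then the first Url element
    print d.get_best_template()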

File diff suppressed because it is too large (presumably calibre/utils/opensearch/osfeedparser.py, the module imported by results.py below)

View File (calibre/utils/opensearch/query.py)

@@ -0,0 +1,66 @@
from urlparse import urlparse, urlunparse
from urllib import urlencode
from cgi import parse_qs

class Query:

    """Represents an opensearch query. Used internally by the Client to
    construct an opensearch url to request. Really this class is just a
    helper for substituting values into the macros in a format.

        format = 'http://beta.indeed.com/opensearch?q={searchTerms}&start={startIndex}&limit={count}'
        q = Query(format)
        q.searchTerms = 'zx81'
        q.startIndex = 1
        q.count = 25
        print q.url()
    """

    standard_macros = ['searchTerms','count','startIndex','startPage',
        'language', 'outputEncoding', 'inputEncoding']

    def __init__(self, format):
        """Create a query object by passing it the url format obtained
        from the opensearch Description.
        """
        self.format = format

        # unpack the url to a tuple
        self.url_parts = urlparse(format)

        # unpack the query string to a dictionary
        self.query_string = parse_qs(self.url_parts[4])

        # look for standard macros and create a mapping of the
        # opensearch names to the service specific ones,
        # so q={searchTerms} will result in a mapping between searchTerms and q
        self.macro_map = {}
        for key, values in self.query_string.items():
            # TODO eventually optional/required params should be
            # distinguished somehow (the ones with/without a trailing ?)
            macro = values[0].replace('{','').replace('}','').replace('?','')
            if macro in Query.standard_macros:
                self.macro_map[macro] = key

    def url(self):
        # copy the original query string
        query_string = dict(self.query_string)

        # iterate through macros and set the position in the querystring
        for macro, name in self.macro_map.items():
            if hasattr(self, macro):
                # set the name/value pair
                query_string[name] = [getattr(self, macro)]
            else:
                # remove the name/value pair
                del(query_string[name])

        # copy the url parts and substitute in our new query string
        url_parts = list(self.url_parts)
        url_parts[4] = urlencode(query_string, 1)

        # recompose and return url
        return urlunparse(tuple(url_parts))

    def has_macro(self, macro):
        return self.macro_map.has_key(macro)
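Note: a sketch of the macro substitution Query performs; the template here is hypothetical, and the parameter order of the generated query string is not guaranteed:

    from calibre.utils.opensearch import Query

    q = Query('http://books.example.com/search?q={searchTerms}&page={startPage?}')
    q.searchTerms = 'dickens'
    print q.url()   # e.g. http://books.example.com/search?q=dickens (page dropped while unset)
    q.startPage = 2
    print q.url()   # now also carries page=2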

View File (calibre/utils/opensearch/results.py)

@@ -0,0 +1,131 @@
import osfeedparser

class Results(object):

    def __init__(self, query, agent=None):
        self.agent = agent
        self._fetch(query)
        self._iter = 0

    def __iter__(self):
        self._iter = 0
        return self

    def __len__(self):
        return self.totalResults

    def next(self):

        # just keep going like the energizer bunny
        while True:

            # return any item we haven't returned
            if self._iter < len(self.items):
                self._iter += 1
                return self.items[self._iter-1]

            # if there appears to be more to fetch
            if \
                self.totalResults != 0 \
                and self.totalResults > self.startIndex + self.itemsPerPage - 1:

                # get the next query
                next_query = self._get_next_query()

                # if we got one, execute it and go back to the beginning
                if next_query:
                    self._fetch(next_query)
                    # very important to reset this counter
                    # or else the return will fail
                    self._iter = 0
            else:
                raise StopIteration

    def _fetch(self, query):
        feed = osfeedparser.opensearch_parse(query.url(), agent=self.agent)
        self.feed = feed

        # general channel stuff
        channel = feed['feed']
        self.title = _pick(channel,'title')
        self.link = _pick(channel,'link')
        self.description = _pick(channel,'description')
        self.language = _pick(channel,'language')
        self.copyright = _pick(channel,'copyright')

        # get back opensearch specific values
        self.totalResults = _pick(channel,'opensearch_totalresults',0)
        self.startIndex = _pick(channel,'opensearch_startindex',1)
        self.itemsPerPage = _pick(channel,'opensearch_itemsperpage',0)

        # alias items from the feed to our results object
        self.items = feed['items']

        # set default values if necessary
        if self.startIndex == 0:
            self.startIndex = 1
        if self.itemsPerPage == 0 and len(self.items) > 0:
            self.itemsPerPage = len(self.items)

        # store away query for calculating next results
        # if necessary
        self.last_query = query

    def _get_next_query(self):
        # update our query to get the next set of records
        query = self.last_query

        # use start page if the query supports it
        if query.has_macro('startPage'):
            # if the query already defined the startPage
            # we just need to increment it
            if hasattr(query, 'startPage'):
                query.startPage += 1
            # to issue the first query startPage might not have
            # been specified, so set it to 2
            else:
                query.startPage = 2
            return query

        # otherwise the query should support startIndex
        elif query.has_macro('startIndex'):
            # if startIndex was used before we just add the
            # items per page to it to get the next set
            if hasattr(query, 'startIndex'):
                query.startIndex += self.itemsPerPage
            # to issue the first query the startIndex may have
            # been left blank; in that case we assume it to be
            # the item just after the last one on this page
            else:
                query.startIndex = self.itemsPerPage + 1
            return query

        # doesn't look like there is another stage to this query
        return None

# helper for pulling values out of a dictionary if they're there
# and returning a default value if they're not
def _pick(d, key, default=None):

    # get the value out
    value = d.get(key)

    # if it wasn't there return the default
    if value is None:
        return default

    # if they want an int try to convert to an int
    # and return default if it fails
    if type(default) == int:
        try:
            return int(d[key])
        except:
            return default

    # otherwise we're good to return the value
    return value
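Note: a sketch of the transparent paging (network required); description_url is a placeholder for a real OpenSearch description file:

    from calibre.utils.opensearch import Client

    results = Client(description_url).search('computer', page_size=25)
    print len(results)     # __len__ reports the feed's opensearch totalResults
    for result in results:
        # next() silently fetches the following page via _get_next_query()
        print result.title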

View File (calibre/utils/opensearch/url.py)

@@ -0,0 +1,8 @@
class URL:

    """Class for representing a URL in an opensearch v1.1 query"""

    def __init__(self, type='', template='', method='GET'):
        self.type = type
        self.template = template
        self.method = method
        self.params = []