OpenSearch: rewrite module to use lxml and remove the modified feed parser, as it causes all available file descriptors to be used. Store: rework the OpenSearch store class to use the changes to the opensearch module.

This commit is contained in:
John Schember 2011-06-26 20:56:43 -04:00
parent 3c83b7873a
commit 38364443a7
9 changed files with 144 additions and 3130 deletions
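For context: the reworked search flow replaces the old Client/Results pair (and the feed parser it dragged in) with a Description lookup, a Query to fill in the URL template, and a single lxml parse. A minimal sketch of that flow, pieced together from the code below; the description URL is hypothetical:

    import urllib
    from contextlib import closing
    from lxml import etree
    from calibre import browser
    from calibre.utils.opensearch.description import Description
    from calibre.utils.opensearch.query import Query

    description = Description('http://example.com/opensearch.xml')  # hypothetical URL
    template = description.get_best_template()
    if template:
        oquery = Query(template)
        oquery.searchTerms = urllib.quote_plus('some book')
        oquery.count = 10
        with closing(browser().open(oquery.url(), timeout=15)) as f:
            doc = etree.fromstring(f.read())
        # each Atom <entry> is one result; local-name() sidesteps namespaces
        for entry in doc.xpath('//*[local-name() = "entry"]'):
            print ' '.join(entry.xpath('./*[local-name() = "title"]//text()'))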

View File (OpenSearchStore plugin)

@ -8,14 +8,20 @@ __docformat__ = 'restructuredtext en'
import mimetypes
import urllib
from contextlib import closing
from lxml import etree
from PyQt4.Qt import QUrl
from calibre import browser
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.opensearch import Client
#from calibre.utils.opensearch import Client
from calibre.utils.opensearch.description import Description
from calibre.utils.opensearch.query import Query
class OpenSearchStore(StorePlugin):
@ -38,38 +44,51 @@ class OpenSearchStore(StorePlugin):
if not hasattr(self, 'open_search_url'):
return
client = Client(self.open_search_url)
results = client.search(urllib.quote_plus(query), max_results)
description = Description(self.open_search_url)
url_template = description.get_best_template()
if not url_template:
return
oquery = Query(url_template)
# set up initial values
oquery.searchTerms = urllib.quote_plus(query)
oquery.count = max_results
url = oquery.url()
counter = max_results
for r in results:
if counter <= 0:
break
counter -= 1
br = browser()
with closing(br.open(url, timeout=timeout)) as f:
doc = etree.fromstring(f.read())
for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0:
break
s = SearchResult()
s.detail_item = r.get('id', '')
links = r.get('links', None)
for l in links:
if l.get('rel', None):
if l['rel'] in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
s.cover_url = l.get('href', '')
elif l['rel'] == u'http://opds-spec.org/acquisition/buy':
s.detail_item = l.get('href', s.detail_item)
elif l['rel'] == u'http://opds-spec.org/acquisition':
mime = l.get('type', '')
if mime:
ext = mimetypes.guess_extension(mime)
if ext:
ext = ext[1:].upper()
s.downloads[ext] = l.get('href', '')
counter -= 1
s = SearchResult()
s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()'))
s.formats = ', '.join(s.downloads.keys())
for link in data.xpath('./*[local-name() = "link"]'):
rel = link.get('rel')
href = link.get('href')
type = link.get('type')
if rel and href and type:
if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
s.cover_url = href
elif rel == u'http://opds-spec.org/acquisition/buy':
s.detail_item = href
elif rel == u'http://opds-spec.org/acquisition':
if type:
ext = mimetypes.guess_extension(type)
if ext:
ext = ext[1:].upper()
s.downloads[ext] = href
s.formats = ', '.join(s.downloads.keys())
s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()'))
s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()'))
s.price = ' '.join(data.xpath('.//*[local-name() = "price"]//text()'))
s.title = r.get('title', '')
s.author = r.get('author', '')
s.price = r.get('price', '')
yield s
yield s
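The acquisition branch above maps a link's MIME type to the format key stored in s.downloads; for example (a sketch, using a MIME type the stdlib mimetypes table is guaranteed to know):

    import mimetypes

    ext = mimetypes.guess_extension('application/pdf')  # '.pdf'
    if ext:
        fmt = ext[1:].upper()  # 'PDF'
        # s.downloads[fmt] = href, and s.formats joins the keys: 'PDF'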

View File (ArchiveOrgStore plugin)

@ -28,6 +28,7 @@ class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
s.price = '$0.00'
s.drm = SearchResult.DRM_UNLOCKED
yield s
'''
def get_details(self, search_result, timeout):
br = browser()

View File: calibre/utils/opensearch/__init__.py

@ -1,4 +0,0 @@
from description import Description
from query import Query
from client import Client
from results import Results

View File: calibre/utils/opensearch/client.py (deleted)

@ -1,39 +0,0 @@
from description import Description
from query import Query
from results import Results
class Client:
"""This is the class you'll probably want to be using. You simply
pass the constructor the url for the service description file and
issue a search and get back results as an iterable Results object.
The neat thing about a Results object is that it will seamlessly
handle fetching more results from the opensearch server when it can...
so you just need to iterate and can let the paging be taken care of
for you.
from opensearch import Client
client = Client(description_url)
results = client.search("computer")
for result in results:
print result.title
"""
def __init__(self, url, agent="python-opensearch <https://github.com/edsu/opensearch>"):
self.agent = agent
self.description = Description(url, self.agent)
def search(self, search_terms, page_size=25):
"""Perform a search and get back a results object
"""
url = self.description.get_best_template()
query = Query(url)
# set up initial values
query.searchTerms = search_terms
query.count = page_size
# run the results
return Results(query, agent=self.agent)

View File: calibre/utils/opensearch/description.py

@ -1,71 +1,95 @@
from urllib2 import urlopen, Request
from xml.dom.minidom import parse
from url import URL
# -*- coding: utf-8 -*-
class Description:
"""A class for representing OpenSearch Description files.
"""
from __future__ import (unicode_literals, division, absolute_import, print_function)
def __init__(self, url="", agent=""):
"""The constructor which may pass an optional url to load from.
__license__ = 'GPL 3'
__copyright__ = '''
2011, John Schember <john@nachtimwald.com>,
2006, Ed Summers <ehs@pobox.com>
'''
__docformat__ = 'restructuredtext en'
from contextlib import closing
from lxml import etree
from calibre import browser
from calibre.utils.opensearch.url import URL
class Description(object):
'''
A class for representing OpenSearch Description files.
'''
def __init__(self, url=""):
'''
The constructor which may pass an optional url to load from.
d = Description("http://www.example.com/description")
"""
self.agent = agent
'''
if url:
self.load(url)
def load(self, url):
"""For loading up a description object from a url. Normally
'''
For loading up a description object from a url. Normally
you'll probably just want to pass a URL into the constructor.
"""
req = Request(url, headers={'User-Agent':self.agent})
self.dom = parse(urlopen(req))
'''
br = browser()
with closing(br.open(url, timeout=15)) as f:
doc = etree.fromstring(f.read())
# version 1.1 has repeating Url elements
self.urls = self._get_urls()
self.urls = []
for element in doc.xpath('//*[local-name() = "Url"]'):
template = element.get('template')
type = element.get('type')
if template and type:
url = URL()
url.template = template
url.type = type
self.urls.append(url)
# this is version 1.0 specific
self.url = self._get_element_text('Url')
self.format = self._get_element_text('Format')
self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()'))
self.shortname = self._get_element_text('ShortName')
self.longname = self._get_element_text('LongName')
self.description = self._get_element_text('Description')
self.image = self._get_element_text('Image')
self.samplesearch = self._get_element_text('SampleSearch')
self.developer = self._get_element_text('Developer')
self.contact = self._get_element_text('Contact')
self.attribution = self._get_element_text('Attribution')
self.syndicationright = self._get_element_text('SyndicationRight')
self.shortname = ''.join(doc.xpath('//*[local-name() = "ShortName"][1]//text()'))
self.longname = ''.join(doc.xpath('//*[local-name() = "LongName"][1]//text()'))
self.description = ''.join(doc.xpath('//*[local-name() = "Description"][1]//text()'))
self.image = ''.join(doc.xpath('//*[local-name() = "Image"][1]//text()'))
self.samplesearch = ''.join(doc.xpath('//*[local-name() = "SampleSearch"][1]//text()'))
self.developer = ''.join(doc.xpath('//*[local-name() = "Developer"][1]//text()'))
self.contact = ''.join(doc.xpath('//*[local-name() = "Contact"][1]//text()'))
self.attribution = ''.join(doc.xpath('//*[local-name() = "Attribution"][1]//text()'))
self.syndicationright = ''.join(doc.xpath('//*[local-name() = "SyndicationRight"][1]//text()'))
tag_text = self._get_element_text('Tags')
tag_text = ' '.join(doc.xpath('//*[local-name() = "Tags"]//text()'))
if tag_text != None:
self.tags = tag_text.split(" ")
self.tags = tag_text.split(' ')
if self._get_element_text('AdultContent') == 'true':
self.adultcontent = True
else:
self.adultcontent = False
self.adultcontent = doc.xpath('boolean(//*[local-name() = "AdultContent" and contains(., "true")])')
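All of the xpath queries in load() use local-name() so the lookups work regardless of the namespace the description document declares. A small illustration of why; the document below is made up, the namespace URI is the real OpenSearch 1.1 one:

    from lxml import etree

    doc = etree.fromstring(
        '<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">'
        '<ShortName>Example</ShortName>'
        '</OpenSearchDescription>')
    doc.xpath('//ShortName')                      # [] - blocked by the default namespace
    doc.xpath('//*[local-name() = "ShortName"]')  # matches the element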
def get_url_by_type(self, type):
"""Walks available urls and returns them by type. Only
'''
Walks available urls and returns them by type. Only
appropriate in opensearch v1.1 where there can be multiple
query targets. Returns None if no such type is found.
url = description.get_url_by_type('application/rss+xml')
"""
'''
for url in self.urls:
if url.type == type:
return url
return None
def get_best_template(self):
"""OK, best is a value judgement, but so be it. You'll get
'''
OK, best is a value judgement, but so be it. You'll get
back either the atom, rss or first template available. This
method handles the main difference between opensearch v1.0 and v1.1
"""
'''
# version 1.0
if self.url:
return self.url
@ -88,40 +112,3 @@ class Description:
# out of luck
return None
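The v1.1 branch elided from this hunk follows the preference order the docstring states; reconstructed here as a sketch from that description, not from the elided code itself:

    # version 1.1: prefer an atom template, then rss, then the first url listed
    url = self.get_url_by_type('application/atom+xml') or \
          self.get_url_by_type('application/rss+xml')
    if url:
        return url.template
    if self.urls:
        return self.urls[0].template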
# these are internal methods for querying xml
def _get_element_text(self, tag):
elements = self._get_elements(tag)
if not elements:
return None
return self._get_text(elements[0].childNodes)
def _get_attribute_text(self, tag, attribute):
elements = self._get_elements(tag)
if not elements:
return ''
return elements[0].getAttribute('template')
def _get_elements(self, tag):
return self.dom.getElementsByTagName(tag)
def _get_text(self, nodes):
text = ''
for node in nodes:
if node.nodeType == node.TEXT_NODE:
text += node.data
return text.strip()
def _get_urls(self):
urls = []
for element in self._get_elements('Url'):
template = element.getAttribute('template')
type = element.getAttribute('type')
if template and type:
url = URL()
url.template = template
url.type = type
urls.append(url)
return urls

File diff suppressed because it is too large (the removed feed parser, calibre/utils/opensearch/osfeedparser.py)

View File: calibre/utils/opensearch/query.py

@ -1,10 +1,17 @@
from urlparse import urlparse, urlunparse
from urllib import urlencode
from cgi import parse_qs
# -*- coding: utf-8 -*-
class Query:
"""Represents an opensearch query. Used internally by the Client to
construct an opensearch url to request. Really this class is just a
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2006, Ed Summers <ehs@pobox.com>'
__docformat__ = 'restructuredtext en'
from urlparse import urlparse, urlunparse, parse_qs
from urllib import urlencode
class Query(object):
'''
Represents an opensearch query. Really this class is just a
helper for substituting values into the macros in a format.
format = 'http://beta.indeed.com/opensearch?q={searchTerms}&start={startIndex}&limit={count}'
@ -12,16 +19,17 @@ class Query:
q.searchTerms = 'zx81'
q.startIndex = 1
q.count = 25
print q.to_url()
"""
print q.url()
'''
standard_macros = ['searchTerms','count','startIndex','startPage',
standard_macros = ['searchTerms', 'count', 'startIndex', 'startPage',
'language', 'outputEncoding', 'inputEncoding']
def __init__(self, format):
"""Create a query object by passing it the url format obtained
'''
Create a query object by passing it the url format obtained
from the opensearch Description.
"""
'''
self.format = format
# unpack the url to a tuple
@ -37,7 +45,7 @@ class Query:
for key,values in self.query_string.items():
# TODO eventually optional/required params should be
# distinguished somehow (the ones with/without trailing ?
macro = values[0].replace('{','').replace('}','').replace('?','')
macro = values[0].replace('{', '').replace('}', '').replace('?', '')
if macro in Query.standard_macros:
self.macro_map[macro] = key
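So for the format shown in the docstring, the constructor ends up mapping each standard macro back to its query-string key, and url() substitutes whatever values have been assigned; roughly:

    q = Query('http://beta.indeed.com/opensearch'
              '?q={searchTerms}&start={startIndex}&limit={count}')
    # macro_map is now {'searchTerms': 'q', 'startIndex': 'start', 'count': 'limit'}
    q.searchTerms = 'zx81'
    q.count = 25
    print q.url()  # the template with the assigned macros filled in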

View File: calibre/utils/opensearch/results.py (deleted)

@ -1,131 +0,0 @@
import osfeedparser
class Results(object):
def __init__(self, query, agent=None):
self.agent = agent
self._fetch(query)
self._iter = 0
def __iter__(self):
self._iter = 0
return self
def __len__(self):
return self.totalResults
def next(self):
# just keep going like the energizer bunny
while True:
# return any item we haven't returned
if self._iter < len(self.items):
self._iter += 1
return self.items[self._iter-1]
# if there appears to be more to fetch
if \
self.totalResults != 0 \
and self.totalResults > self.startIndex + self.itemsPerPage - 1:
# get the next query
next_query = self._get_next_query()
# if we got one executed it and go back to the beginning
if next_query:
self._fetch(next_query)
# very important to reset this counter
# or else the return will fail
self._iter = 0
else:
raise StopIteration
def _fetch(self, query):
feed = osfeedparser.opensearch_parse(query.url(), agent=self.agent)
self.feed = feed
# general channel stuff
channel = feed['feed']
self.title = _pick(channel,'title')
self.link = _pick(channel,'link')
self.description = _pick(channel,'description')
self.language = _pick(channel,'language')
self.copyright = _pick(channel,'copyright')
# get back opensearch specific values
self.totalResults = _pick(channel,'opensearch_totalresults',0)
self.startIndex = _pick(channel,'opensearch_startindex',1)
self.itemsPerPage = _pick(channel,'opensearch_itemsperpage',0)
# alias items from the feed to our results object
self.items = feed['items']
# set default values if necessary
if self.startIndex == 0:
self.startIndex = 1
if self.itemsPerPage == 0 and len(self.items) > 0:
self.itemsPerPage = len(self.items)
# store away query for calculating next results
# if necessary
self.last_query = query
def _get_next_query(self):
# update our query to get the next set of records
query = self.last_query
# use start page if the query supports it
if query.has_macro('startPage'):
# if the query already defined the startPage
# we just need to increment it
if hasattr(query, 'startPage'):
query.startPage += 1
# to issue the first query startPage might not have
# been specified, so set it to 2
else:
query.startPage = 2
return query
# otherwise the query should support startIndex
elif query.has_macro('startIndex'):
# if startIndex was used before we just add the
# items per page to it to get the next set
if hasattr(query, 'startIndex'):
query.startIndex += self.itemsPerPage
# to issue the first query the startIndex may have
# been left blank in that case we assume it to be
# the item just after the last one on this page
else:
query.startIndex = self.itemsPerPage + 1
return query
# doesn't look like there is another stage to this query
return None
# helper for pulling values out of a dictionary if they're there
# and returning a default value if they're not
def _pick(d,key,default=None):
# get the value out
value = d.get(key)
# if it wasn't there return the default
if value == None:
return default
# if they want an int try to convert to an int
# and return default if it fails
if type(default) == int:
try:
return int(d[key])
except:
return default
# otherwise we're good to return the value
return value
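The int coercion means opensearch counters arrive as usable numbers; for instance:

    _pick({'opensearch_totalresults': '120'}, 'opensearch_totalresults', 0)  # -> 120
    _pick({}, 'language')                                                    # -> None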

View File: calibre/utils/opensearch/url.py

@ -1,5 +1,15 @@
class URL:
"""Class for representing a URL in an opensearch v1.1 query"""
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2006, Ed Summers <ehs@pobox.com>'
__docformat__ = 'restructuredtext en'
class URL(object):
'''
Class for representing a URL in an opensearch v1.1 query
'''
def __init__(self, type='', template='', method='GET'):
self.type = type
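A URL object's template is what ultimately feeds a Query; a short sketch of the two classes together, assuming d is an already loaded Description:

    url = d.get_url_by_type('application/atom+xml')
    if url is not None:
        q = Query(url.template)  # url.type and url.method are also available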