OpenSearch: rewrite the module to use lxml and remove the modified feed parser, which caused all available file descriptors to be used up. Store: rework the OpenSearch store class to use the changes to the opensearch module.
This commit is contained in:
parent 3c83b7873a
commit 38364443a7
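For context before the diff: the heart of the fix is replacing the bundled feed parser's URL handling with an lxml parse over a browser response that is always closed. A minimal sketch of the fetch pattern the rewrite standardizes on (Python 2, as the codebase was at the time; the URL is a placeholder):

    from contextlib import closing

    from lxml import etree

    from calibre import browser

    br = browser()
    # closing() releases the response handle, and with it the underlying
    # socket/file descriptor, even if etree.fromstring() raises.
    with closing(br.open('http://example.com/opensearch.xml', timeout=15)) as f:
        doc = etree.fromstring(f.read())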
@@ -8,14 +8,20 @@ __docformat__ = 'restructuredtext en'
 import mimetypes
 import urllib
+from contextlib import closing
+
+from lxml import etree
 
 from PyQt4.Qt import QUrl
 
 from calibre import browser
 from calibre.gui2 import open_url
 from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.search_result import SearchResult
 from calibre.gui2.store.web_store_dialog import WebStoreDialog
-from calibre.utils.opensearch import Client
+#from calibre.utils.opensearch import Client
+from calibre.utils.opensearch.description import Description
+from calibre.utils.opensearch.query import Query
 
 class OpenSearchStore(StorePlugin):
@@ -38,38 +44,51 @@ class OpenSearchStore(StorePlugin):
         if not hasattr(self, 'open_search_url'):
             return
 
-        client = Client(self.open_search_url)
-        results = client.search(urllib.quote_plus(query), max_results)
+        description = Description(self.open_search_url)
+        url_template = description.get_best_template()
+        if not url_template:
+            return
+        oquery = Query(url_template)
+
+        # set up initial values
+        oquery.searchTerms = urllib.quote_plus(query)
+        oquery.count = max_results
+        url = oquery.url()
 
         counter = max_results
-        for r in results:
-            if counter <= 0:
-                break
-            counter -= 1
-
-            s = SearchResult()
-
-            s.detail_item = r.get('id', '')
-
-            links = r.get('links', None)
-            for l in links:
-                if l.get('rel', None):
-                    if l['rel'] in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
-                        s.cover_url = l.get('href', '')
-                    elif l['rel'] == u'http://opds-spec.org/acquisition/buy':
-                        s.detail_item = l.get('href', s.detail_item)
-                    elif l['rel'] == u'http://opds-spec.org/acquisition':
-                        mime = l.get('type', '')
-                        if mime:
-                            ext = mimetypes.guess_extension(mime)
-                            if ext:
-                                ext = ext[1:].upper()
-                                s.downloads[ext] = l.get('href', '')
-
-            s.formats = ', '.join(s.downloads.keys())
-
-            s.title = r.get('title', '')
-            s.author = r.get('author', '')
-            s.price = r.get('price', '')
-
-            yield s
+        br = browser()
+        with closing(br.open(url, timeout=timeout)) as f:
+            doc = etree.fromstring(f.read())
+            for data in doc.xpath('//*[local-name() = "entry"]'):
+                if counter <= 0:
+                    break
+                counter -= 1
+
+                s = SearchResult()
+
+                s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()'))
+
+                for link in data.xpath('./*[local-name() = "link"]'):
+                    rel = link.get('rel')
+                    href = link.get('href')
+                    type = link.get('type')
+
+                    if rel and href and type:
+                        if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
+                            s.cover_url = href
+                        elif rel == u'http://opds-spec.org/acquisition/buy':
+                            s.detail_item = href
+                        elif rel == u'http://opds-spec.org/acquisition':
+                            if type:
+                                ext = mimetypes.guess_extension(type)
+                                if ext:
+                                    ext = ext[1:].upper()
+                                    s.downloads[ext] = href
+                s.formats = ', '.join(s.downloads.keys())
+
+                s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()'))
+                s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()'))
+                s.price = ' '.join(data.xpath('.//*[local-name() = "price"]//text()'))
+
+                yield s
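A note on the XPath idiom used throughout the new search(): matching on local-name() makes the expressions indifferent to whatever namespace prefix an OPDS (Atom) feed declares. A self-contained sketch (Python 2; the feed snippet is made up):

    from lxml import etree

    feed = etree.fromstring(
        '<feed xmlns="http://www.w3.org/2005/Atom">'
        '<entry><title>Example Book</title></entry>'
        '</feed>')
    # No namespace map needed: local-name() compares only the tag name,
    # so the same expression works with or without Atom/OPDS namespaces.
    print feed.xpath('//*[local-name() = "entry"]/*[local-name() = "title"]/text()')
    # prints: ['Example Book']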
@@ -28,6 +28,7 @@ class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
             s.price = '$0.00'
             s.drm = SearchResult.DRM_UNLOCKED
             yield s
 
+    '''
     def get_details(self, search_result, timeout):
         br = browser()
@@ -1,4 +0,0 @@
-from description import Description
-from query import Query
-from client import Client
-from results import Results
@@ -1,39 +0,0 @@
-from description import Description
-from query import Query
-from results import Results
-
-class Client:
-
-    """This is the class you'll probably want to be using. You simply
-    pass the constructor the url for the service description file and
-    issue a search and get back results as an iterable Results object.
-
-    The neat thing about a Results object is that it will seamlessly
-    handle fetching more results from the opensearch server when it can...
-    so you just need to iterate and can let the paging be taken care of
-    for you.
-
-        from opensearch import Client
-        client = Client(description_url)
-        results = client.search("computer")
-        for result in results:
-            print result.title
-    """
-
-    def __init__(self, url, agent="python-opensearch <https://github.com/edsu/opensearch>"):
-        self.agent = agent
-        self.description = Description(url, self.agent)
-
-    def search(self, search_terms, page_size=25):
-        """Perform a search and get back a results object
-        """
-        url = self.description.get_best_template()
-        query = Query(url)
-
-        # set up initial values
-        query.searchTerms = search_terms
-        query.count = page_size
-
-        # run the results
-        return Results(query, agent=self.agent)
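With Client (and Results, further below) gone, callers are expected to compose Description and Query themselves and fetch a single page explicitly, as the reworked OpenSearchStore.search() above does. A minimal sketch of the replacement pattern (description_url is a placeholder):

    from calibre.utils.opensearch.description import Description
    from calibre.utils.opensearch.query import Query

    description = Description(description_url)
    oquery = Query(description.get_best_template())
    oquery.searchTerms = 'computer'
    oquery.count = 25
    url = oquery.url()
    # fetch url yourself (e.g. with calibre's browser()); there is no
    # implicit paging or hidden fetching any more.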
@@ -1,71 +1,95 @@
-from urllib2 import urlopen, Request
-from xml.dom.minidom import parse
-from url import URL
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '''
+2011, John Schember <john@nachtimwald.com>,
+2006, Ed Summers <ehs@pobox.com>
+'''
+__docformat__ = 'restructuredtext en'
+
+from contextlib import closing
+
+from lxml import etree
+
+from calibre import browser
+from calibre.utils.opensearch.url import URL
 
-class Description:
-    """A class for representing OpenSearch Description files.
-    """
+class Description(object):
+    '''
+    A class for representing OpenSearch Description files.
+    '''
 
-    def __init__(self, url="", agent=""):
-        """The constructor which may pass an optional url to load from.
+    def __init__(self, url=""):
+        '''
+        The constructor which may pass an optional url to load from.
 
             d = Description("http://www.example.com/description")
-        """
-        self.agent = agent
+        '''
         if url:
             self.load(url)
 
     def load(self, url):
-        """For loading up a description object from a url. Normally
+        '''
+        For loading up a description object from a url. Normally
         you'll probably just want to pass a URL into the constructor.
-        """
-        req = Request(url, headers={'User-Agent':self.agent})
-        self.dom = parse(urlopen(req))
+        '''
+        br = browser()
+        with closing(br.open(url, timeout=15)) as f:
+            doc = etree.fromstring(f.read())
 
         # version 1.1 has repeating Url elements
-        self.urls = self._get_urls()
+        self.urls = []
+        for element in doc.xpath('//*[local-name() = "Url"]'):
+            template = element.get('template')
+            type = element.get('type')
+            if template and type:
+                url = URL()
+                url.template = template
+                url.type = type
+                self.urls.append(url)
 
         # this is version 1.0 specific
-        self.url = self._get_element_text('Url')
-        self.format = self._get_element_text('Format')
+        self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
+        self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()'))
 
-        self.shortname = self._get_element_text('ShortName')
-        self.longname = self._get_element_text('LongName')
-        self.description = self._get_element_text('Description')
-        self.image = self._get_element_text('Image')
-        self.samplesearch = self._get_element_text('SampleSearch')
-        self.developer = self._get_element_text('Developer')
-        self.contact = self._get_element_text('Contact')
-        self.attribution = self._get_element_text('Attribution')
-        self.syndicationright = self._get_element_text('SyndicationRight')
+        self.shortname = ''.join(doc.xpath('//*[local-name() = "ShortName"][1]//text()'))
+        self.longname = ''.join(doc.xpath('//*[local-name() = "LongName"][1]//text()'))
+        self.description = ''.join(doc.xpath('//*[local-name() = "Description"][1]//text()'))
+        self.image = ''.join(doc.xpath('//*[local-name() = "Image"][1]//text()'))
+        self.samplesearch = ''.join(doc.xpath('//*[local-name() = "SampleSearch"][1]//text()'))
+        self.developer = ''.join(doc.xpath('//*[local-name() = "Developer"][1]//text()'))
+        self.contact = ''.join(doc.xpath('//*[local-name() = "Contact"][1]//text()'))
+        self.attribution = ''.join(doc.xpath('//*[local-name() = "Attribution"][1]//text()'))
+        self.syndicationright = ''.join(doc.xpath('//*[local-name() = "SyndicationRight"][1]//text()'))
 
-        tag_text = self._get_element_text('Tags')
+        tag_text = ' '.join(doc.xpath('//*[local-name() = "Tags"]//text()'))
         if tag_text != None:
-            self.tags = tag_text.split(" ")
+            self.tags = tag_text.split(' ')
 
-        if self._get_element_text('AdultContent') == 'true':
-            self.adultcontent = True
-        else:
-            self.adultcontent = False
+        self.adultcontent = doc.xpath('boolean(//*[local-name() = "AdultContent" and contains(., "true")])')
 
     def get_url_by_type(self, type):
-        """Walks available urls and returns them by type. Only
+        '''
+        Walks available urls and returns them by type. Only
         appropriate in opensearch v1.1 where there can be multiple
         query targets. Returns none if no such type is found.
 
             url = description.get_url_by_type('application/rss+xml')
-        """
+        '''
         for url in self.urls:
             if url.type == type:
                 return url
         return None
 
     def get_best_template(self):
-        """OK, best is a value judgement, but so be it. You'll get
+        '''
+        OK, best is a value judgement, but so be it. You'll get
         back either the atom, rss or first template available. This
         method handles the main difference between opensearch v1.0 and v1.1
-        """
+        '''
         # version 1.0
         if self.url:
             return self.url
@@ -88,40 +112,3 @@ class Description:
 
         # out of luck
         return None
-
-
-    # these are internal methods for querying xml
-
-    def _get_element_text(self, tag):
-        elements = self._get_elements(tag)
-        if not elements:
-            return None
-        return self._get_text(elements[0].childNodes)
-
-    def _get_attribute_text(self, tag, attribute):
-        elements = self._get_elements(tag)
-        if not elements:
-            return ''
-        return elements[0].getAttribute('template')
-
-    def _get_elements(self, tag):
-        return self.dom.getElementsByTagName(tag)
-
-    def _get_text(self, nodes):
-        text = ''
-        for node in nodes:
-            if node.nodeType == node.TEXT_NODE:
-                text += node.data
-        return text.strip()
-
-    def _get_urls(self):
-        urls = []
-        for element in self._get_elements('Url'):
-            template = element.getAttribute('template')
-            type = element.getAttribute('type')
-            if template and type:
-                url = URL()
-                url.template = template
-                url.type = type
-                urls.append(url)
-        return urls
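One detail of the rewritten load() worth calling out: wrapping an XPath expression in boolean() makes lxml evaluate it to a Python bool, which is what replaces the old string comparison for AdultContent. A small sketch (Python 2):

    from lxml import etree

    doc = etree.fromstring(
        '<OpenSearchDescription>'
        '<AdultContent>true</AdultContent>'
        '</OpenSearchDescription>')
    # boolean(...) is evaluated by libxml2 and comes back as True/False.
    print doc.xpath('boolean(//*[local-name() = "AdultContent" and contains(., "true")])')
    # prints: True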
File diff suppressed because it is too large (the deleted osfeedparser.py, the modified feed parser this commit removes).
@@ -1,10 +1,17 @@
-from urlparse import urlparse, urlunparse
-from urllib import urlencode
-from cgi import parse_qs
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '2006, Ed Summers <ehs@pobox.com>'
+__docformat__ = 'restructuredtext en'
+
+from urlparse import urlparse, urlunparse, parse_qs
+from urllib import urlencode
 
-class Query:
-    """Represents an opensearch query. Used internally by the Client to
-    construct an opensearch url to request. Really this class is just a
+class Query(object):
+    '''
+    Represents an opensearch query. Really this class is just a
     helper for substituting values into the macros in a format.
 
         format = 'http://beta.indeed.com/opensearch?q={searchTerms}&start={startIndex}&limit={count}'
@@ -12,16 +19,17 @@ class Query:
         q.searchTerms = 'zx81'
         q.startIndex = 1
         q.count = 25
-        print q.to_url()
-    """
+        print q.url()
+    '''
 
-    standard_macros = ['searchTerms','count','startIndex','startPage',
+    standard_macros = ['searchTerms', 'count', 'startIndex', 'startPage',
         'language', 'outputEncoding', 'inputEncoding']
 
     def __init__(self, format):
-        """Create a query object by passing it the url format obtained
+        '''
+        Create a query object by passing it the url format obtained
         from the opensearch Description.
-        """
+        '''
        self.format = format
 
         # unpack the url to a tuple
@@ -37,7 +45,7 @@ class Query:
         for key,values in self.query_string.items():
             # TODO eventually optional/required params should be
             # distinguished somehow (the ones with/without trailing ?
-            macro = values[0].replace('{','').replace('}','').replace('?','')
+            macro = values[0].replace('{', '').replace('}', '').replace('?', '')
             if macro in Query.standard_macros:
                 self.macro_map[macro] = key
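To make the macro machinery concrete: __init__() records which query parameter carries each standard macro, and url() later substitutes the assigned values and re-encodes the query string. A sketch of the mapping step (Python 2), using the same stdlib calls the module imports and the template from its docstring:

    from urlparse import urlparse, parse_qs

    format = 'http://beta.indeed.com/opensearch?q={searchTerms}&start={startIndex}&limit={count}'
    parts = urlparse(format)
    # map macro name -> query parameter, e.g. 'searchTerms' -> 'q'
    macro_map = {}
    for key, values in parse_qs(parts.query).items():
        macro = values[0].replace('{', '').replace('}', '').replace('?', '')
        macro_map[macro] = key
    print macro_map
    # prints (in some order): {'searchTerms': 'q', 'startIndex': 'start', 'count': 'limit'}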
@@ -1,131 +0,0 @@
-import osfeedparser
-
-class Results(object):
-
-    def __init__(self, query, agent=None):
-        self.agent = agent
-        self._fetch(query)
-        self._iter = 0
-
-    def __iter__(self):
-        self._iter = 0
-        return self
-
-    def __len__(self):
-        return self.totalResults
-
-    def next(self):
-
-        # just keep going like the energizer bunny
-        while True:
-
-            # return any item we haven't returned
-            if self._iter < len(self.items):
-                self._iter += 1
-                return self.items[self._iter-1]
-
-            # if there appears to be more to fetch
-            if \
-                self.totalResults != 0 \
-                and self.totalResults > self.startIndex + self.itemsPerPage - 1:
-
-                # get the next query
-                next_query = self._get_next_query()
-
-                # if we got one executed it and go back to the beginning
-                if next_query:
-                    self._fetch(next_query)
-                    # very important to reset this counter
-                    # or else the return will fail
-                    self._iter = 0
-
-            else:
-                raise StopIteration
-
-    def _fetch(self, query):
-        feed = osfeedparser.opensearch_parse(query.url(), agent=self.agent)
-        self.feed = feed
-
-        # general channel stuff
-        channel = feed['feed']
-        self.title = _pick(channel,'title')
-        self.link = _pick(channel,'link')
-        self.description = _pick(channel,'description')
-        self.language = _pick(channel,'language')
-        self.copyright = _pick(channel,'copyright')
-
-        # get back opensearch specific values
-        self.totalResults = _pick(channel,'opensearch_totalresults',0)
-        self.startIndex = _pick(channel,'opensearch_startindex',1)
-        self.itemsPerPage = _pick(channel,'opensearch_itemsperpage',0)
-
-        # alias items from the feed to our results object
-        self.items = feed['items']
-
-        # set default values if necessary
-        if self.startIndex == 0:
-            self.startIndex = 1
-        if self.itemsPerPage == 0 and len(self.items) > 0:
-            self.itemsPerPage = len(self.items)
-
-        # store away query for calculating next results
-        # if necessary
-        self.last_query = query
-
-    def _get_next_query(self):
-        # update our query to get the next set of records
-        query = self.last_query
-
-        # use start page if the query supports it
-        if query.has_macro('startPage'):
-            # if the query already defined the startPage
-            # we just need to increment it
-            if hasattr(query, 'startPage'):
-                query.startPage += 1
-            # to issue the first query startPage might not have
-            # been specified, so set it to 2
-            else:
-                query.startPage = 2
-            return query
-
-        # otherwise the query should support startIndex
-        elif query.has_macro('startIndex'):
-            # if startIndex was used before we just add the
-            # items per page to it to get the next set
-            if hasattr(query, 'startIndex'):
-                query.startIndex += self.itemsPerPage
-            # to issue the first query the startIndex may have
-            # been left blank in that case we assume it to be
-            # the item just after the last one on this page
-            else:
-                query.startIndex = self.itemsPerPage + 1
-            return query
-
-        # doesn't look like there is another stage to this query
-        return None
-
-
-# helper for pulling values out of a dictionary if they're there
-# and returning a default value if they're not
-def _pick(d,key,default=None):
-
-    # get the value out
-    value = d.get(key)
-
-    # if it wasn't there return the default
-    if value == None:
-        return default
-
-    # if they want an int try to convert to an int
-    # and return default if it fails
-    if type(default) == int:
-        try:
-            return int(d[key])
-        except:
-            return default
-
-    # otherwise we're good to return the value
-    return value
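The deleted Results iterator is where the OpenSearch paging contract lived: totalResults, startIndex and itemsPerPage come back in the feed, and the next page begins at startIndex + itemsPerPage. The arithmetic it implemented, reduced to a sketch (Python 2; the numbers are hypothetical):

    totalResults, startIndex, itemsPerPage = 100, 1, 25
    # same continuation test the removed next() used
    while totalResults > startIndex + itemsPerPage - 1:
        startIndex += itemsPerPage
        print 'next fetch starts at index', startIndex
    # prints 26, 51, 76: four pages of 25 cover all 100 results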
@@ -1,5 +1,15 @@
-class URL:
-    """Class for representing a URL in an opensearch v1.1 query"""
+# -*- coding: utf-8 -*-
+
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+
+__license__ = 'GPL 3'
+__copyright__ = '2006, Ed Summers <ehs@pobox.com>'
+__docformat__ = 'restructuredtext en'
+
+class URL(object):
+    '''
+    Class for representing a URL in an opensearch v1.1 query
+    '''
 
     def __init__(self, type='', template='', method='GET'):
         self.type = type