OpenSearch: rewrite module to use lxml and remove the modified feed parser, as it causes all available file descriptors to be used. Store: rework the OpenSearch store class to use the changes to the opensearch module.

This commit is contained in:
John Schember 2011-06-26 20:56:43 -04:00
parent 3c83b7873a
commit 38364443a7
9 changed files with 144 additions and 3130 deletions
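For context: the reworked search flow replaces the old Client/Results pair (and the feed parser it dragged in) with a Description lookup, a Query to fill in the URL template, and a single lxml parse. A minimal sketch of that flow, pieced together from the code below; the description URL is hypothetical:

    import urllib
    from contextlib import closing
    from lxml import etree
    from calibre import browser
    from calibre.utils.opensearch.description import Description
    from calibre.utils.opensearch.query import Query

    description = Description('http://example.com/opensearch.xml')  # hypothetical URL
    template = description.get_best_template()
    if template:
        oquery = Query(template)
        oquery.searchTerms = urllib.quote_plus('some book')
        oquery.count = 10
        with closing(browser().open(oquery.url(), timeout=15)) as f:
            doc = etree.fromstring(f.read())
        # each Atom <entry> is one result; local-name() sidesteps namespaces
        for entry in doc.xpath('//*[local-name() = "entry"]'):
            print ' '.join(entry.xpath('./*[local-name() = "title"]//text()'))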

View File (OpenSearchStore plugin)

@ -8,14 +8,20 @@ __docformat__ = 'restructuredtext en'
import mimetypes
import urllib
from contextlib import closing
from lxml import etree
from PyQt4.Qt import QUrl
from calibre import browser
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.opensearch import Client
#from calibre.utils.opensearch import Client
from calibre.utils.opensearch.description import Description
from calibre.utils.opensearch.query import Query
class OpenSearchStore(StorePlugin):
@ -38,38 +44,51 @@ class OpenSearchStore(StorePlugin):
if not hasattr(self, 'open_search_url'):
return
client = Client(self.open_search_url)
results = client.search(urllib.quote_plus(query), max_results)
description = Description(self.open_search_url)
url_template = description.get_best_template()
if not url_template:
return
oquery = Query(url_template)
# set up initial values
oquery.searchTerms = urllib.quote_plus(query)
oquery.count = max_results
url = oquery.url()
counter = max_results
for r in results:
if counter <= 0:
break
counter -= 1
br = browser()
with closing(br.open(url, timeout=timeout)) as f:
doc = etree.fromstring(f.read())
for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0:
break
s = SearchResult()
s.detail_item = r.get('id', '')
links = r.get('links', None)
for l in links:
if l.get('rel', None):
if l['rel'] in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
s.cover_url = l.get('href', '')
elif l['rel'] == u'http://opds-spec.org/acquisition/buy':
s.detail_item = l.get('href', s.detail_item)
elif l['rel'] == u'http://opds-spec.org/acquisition':
mime = l.get('type', '')
if mime:
ext = mimetypes.guess_extension(mime)
if ext:
ext = ext[1:].upper()
s.downloads[ext] = l.get('href', '')
counter -= 1
s = SearchResult()
s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()'))
s.formats = ', '.join(s.downloads.keys())
for link in data.xpath('./*[local-name() = "link"]'):
rel = link.get('rel')
href = link.get('href')
type = link.get('type')
if rel and href and type:
if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
s.cover_url = href
elif rel == u'http://opds-spec.org/acquisition/buy':
s.detail_item = href
elif rel == u'http://opds-spec.org/acquisition':
if type:
ext = mimetypes.guess_extension(type)
if ext:
ext = ext[1:].upper()
s.downloads[ext] = href
s.formats = ', '.join(s.downloads.keys())
s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()'))
s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()'))
s.price = ' '.join(data.xpath('.//*[local-name() = "price"]//text()'))
s.title = r.get('title', '')
s.author = r.get('author', '')
s.price = r.get('price', '')
yield s
yield s
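The acquisition branch above maps a link's MIME type to the format key stored in s.downloads; for example (a sketch, using a MIME type the stdlib mimetypes table is guaranteed to know):

    import mimetypes

    ext = mimetypes.guess_extension('application/pdf')  # '.pdf'
    if ext:
        fmt = ext[1:].upper()  # 'PDF'
        # s.downloads[fmt] = href, and s.formats joins the keys: 'PDF'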

View File (ArchiveOrgStore plugin)

@ -28,6 +28,7 @@ class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
s.price = '$0.00'
s.drm = SearchResult.DRM_UNLOCKED
yield s
'''
def get_details(self, search_result, timeout):
br = browser()

View File: calibre/utils/opensearch/__init__.py

@ -1,4 +0,0 @@
from description import Description
from query import Query
from client import Client
from results import Results

View File: calibre/utils/opensearch/client.py (deleted)

@ -1,39 +0,0 @@
from description import Description
from query import Query
from results import Results
class Client:
"""This is the class you'll probably want to be using. You simply
pass the constructor the url for the service description file and
issue a search and get back results as an iterable Results object.
The neat thing about a Results object is that it will seamlessly
handle fetching more results from the opensearch server when it can...
so you just need to iterate and can let the paging be taken care of
for you.
from opensearch import Client
client = Client(description_url)
results = client.search("computer")
for result in results:
print result.title
"""
def __init__(self, url, agent="python-opensearch <https://github.com/edsu/opensearch>"):
self.agent = agent
self.description = Description(url, self.agent)
def search(self, search_terms, page_size=25):
"""Perform a search and get back a results object
"""
url = self.description.get_best_template()
query = Query(url)
# set up initial values
query.searchTerms = search_terms
query.count = page_size
# run the results
return Results(query, agent=self.agent)

View File: calibre/utils/opensearch/description.py

@ -1,71 +1,95 @@
from urllib2 import urlopen, Request
from xml.dom.minidom import parse
from url import URL
# -*- coding: utf-8 -*-
class Description:
"""A class for representing OpenSearch Description files.
"""
from __future__ import (unicode_literals, division, absolute_import, print_function)
def __init__(self, url="", agent=""):
"""The constructor which may pass an optional url to load from.
__license__ = 'GPL 3'
__copyright__ = '''
2011, John Schember <john@nachtimwald.com>,
2006, Ed Summers <ehs@pobox.com>
'''
__docformat__ = 'restructuredtext en'
from contextlib import closing
from lxml import etree
from calibre import browser
from calibre.utils.opensearch.url import URL
class Description(object):
'''
A class for representing OpenSearch Description files.
'''
def __init__(self, url=""):
'''
The constructor which may pass an optional url to load from.
d = Description("http://www.example.com/description")
"""
self.agent = agent
'''
if url:
self.load(url)
def load(self, url):
"""For loading up a description object from a url. Normally
'''
For loading up a description object from a url. Normally
you'll probably just want to pass a URL into the constructor.
"""
req = Request(url, headers={'User-Agent':self.agent})
self.dom = parse(urlopen(req))
'''
br = browser()
with closing(br.open(url, timeout=15)) as f:
doc = etree.fromstring(f.read())
# version 1.1 has repeating Url elements
self.urls = self._get_urls()
self.urls = []
for element in doc.xpath('//*[local-name() = "Url"]'):
template = element.get('template')
type = element.get('type')
if template and type:
url = URL()
url.template = template
url.type = type
self.urls.append(url)
# this is version 1.0 specific
self.url = self._get_element_text('Url')
self.format = self._get_element_text('Format')
self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()'))
self.shortname = self._get_element_text('ShortName')
self.longname = self._get_element_text('LongName')
self.description = self._get_element_text('Description')
self.image = self._get_element_text('Image')
self.samplesearch = self._get_element_text('SampleSearch')
self.developer = self._get_element_text('Developer')
self.contact = self._get_element_text('Contact')
self.attribution = self._get_element_text('Attribution')
self.syndicationright = self._get_element_text('SyndicationRight')
self.shortname = ''.join(doc.xpath('//*[local-name() = "ShortName"][1]//text()'))
self.longname = ''.join(doc.xpath('//*[local-name() = "LongName"][1]//text()'))
self.description = ''.join(doc.xpath('//*[local-name() = "Description"][1]//text()'))
self.image = ''.join(doc.xpath('//*[local-name() = "Image"][1]//text()'))
self.samplesearch = ''.join(doc.xpath('//*[local-name() = "SampleSearch"][1]//text()'))
self.developer = ''.join(doc.xpath('//*[local-name() = "Developer"][1]//text()'))
self.contact = ''.join(doc.xpath('//*[local-name() = "Contact"][1]//text()'))
self.attribution = ''.join(doc.xpath('//*[local-name() = "Attribution"][1]//text()'))
self.syndicationright = ''.join(doc.xpath('//*[local-name() = "SyndicationRight"][1]//text()'))
tag_text = self._get_element_text('Tags')
tag_text = ' '.join(doc.xpath('//*[local-name() = "Tags"]//text()'))
if tag_text != None:
self.tags = tag_text.split(" ")
self.tags = tag_text.split(' ')
if self._get_element_text('AdultContent') == 'true':
self.adultcontent = True
else:
self.adultcontent = False
self.adultcontent = doc.xpath('boolean(//*[local-name() = "AdultContent" and contains(., "true")])')
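All of the xpath queries in load() use local-name() so the lookups work regardless of the namespace the description document declares. A small illustration of why; the document below is made up, the namespace URI is the real OpenSearch 1.1 one:

    from lxml import etree

    doc = etree.fromstring(
        '<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">'
        '<ShortName>Example</ShortName>'
        '</OpenSearchDescription>')
    doc.xpath('//ShortName')                      # [] - blocked by the default namespace
    doc.xpath('//*[local-name() = "ShortName"]')  # matches the element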
def get_url_by_type(self, type):
"""Walks available urls and returns them by type. Only
'''
Walks available urls and returns them by type. Only
appropriate in opensearch v1.1 where there can be multiple
query targets. Returns None if no such type is found.
url = description.get_url_by_type('application/rss+xml')
"""
'''
for url in self.urls:
if url.type == type:
return url
return None
def get_best_template(self):
"""OK, best is a value judgement, but so be it. You'll get
'''
OK, best is a value judgement, but so be it. You'll get
back either the atom, rss or first template available. This
method handles the main difference between opensearch v1.0 and v1.1
"""
'''
# version 1.0
if self.url:
return self.url
@ -88,40 +112,3 @@ class Description:
# out of luck
return None
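The v1.1 branch elided from this hunk follows the preference order the docstring states; reconstructed here as a sketch from that description, not from the elided code itself:

    # version 1.1: prefer an atom template, then rss, then the first url listed
    url = self.get_url_by_type('application/atom+xml') or \
          self.get_url_by_type('application/rss+xml')
    if url:
        return url.template
    if self.urls:
        return self.urls[0].template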
# these are internal methods for querying xml
def _get_element_text(self, tag):
elements = self._get_elements(tag)
if not elements:
return None
return self._get_text(elements[0].childNodes)
def _get_attribute_text(self, tag, attribute):
elements = self._get_elements(tag)
if not elements:
return ''
return elements[0].getAttribute('template')
def _get_elements(self, tag):
return self.dom.getElementsByTagName(tag)
def _get_text(self, nodes):
text = ''
for node in nodes:
if node.nodeType == node.TEXT_NODE:
text += node.data
return text.strip()
def _get_urls(self):
urls = []
for element in self._get_elements('Url'):
template = element.getAttribute('template')
type = element.getAttribute('type')
if template and type:
url = URL()
url.template = template
url.type = type
urls.append(url)
return urls

File diff suppressed because it is too large (the removed feed parser, calibre/utils/opensearch/osfeedparser.py)

View File: calibre/utils/opensearch/query.py

@ -1,10 +1,17 @@
from urlparse import urlparse, urlunparse
from urllib import urlencode
from cgi import parse_qs
# -*- coding: utf-8 -*-
class Query:
"""Represents an opensearch query. Used internally by the Client to
construct an opensearch url to request. Really this class is just a
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2006, Ed Summers <ehs@pobox.com>'
__docformat__ = 'restructuredtext en'
from urlparse import urlparse, urlunparse, parse_qs
from urllib import urlencode
class Query(object):
'''
Represents an opensearch query. Really this class is just a
helper for substituting values into the macros in a format.
format = 'http://beta.indeed.com/opensearch?q={searchTerms}&start={startIndex}&limit={count}'
@ -12,16 +19,17 @@ class Query:
q.searchTerms = 'zx81'
q.startIndex = 1
q.count = 25
print q.to_url()
"""
print q.url()
'''
standard_macros = ['searchTerms','count','startIndex','startPage',
standard_macros = ['searchTerms', 'count', 'startIndex', 'startPage',
'language', 'outputEncoding', 'inputEncoding']
def __init__(self, format):
"""Create a query object by passing it the url format obtained
'''
Create a query object by passing it the url format obtained
from the opensearch Description.
"""
'''
self.format = format
# unpack the url to a tuple
@ -37,7 +45,7 @@ class Query:
for key,values in self.query_string.items():
# TODO eventually optional/required params should be
# distinguished somehow (the ones with/without trailing ?
macro = values[0].replace('{','').replace('}','').replace('?','')
macro = values[0].replace('{', '').replace('}', '').replace('?', '')
if macro in Query.standard_macros:
self.macro_map[macro] = key
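So for the format shown in the docstring, the constructor ends up mapping each standard macro back to its query-string key, and url() substitutes whatever values have been assigned; roughly:

    q = Query('http://beta.indeed.com/opensearch'
              '?q={searchTerms}&start={startIndex}&limit={count}')
    # macro_map is now {'searchTerms': 'q', 'startIndex': 'start', 'count': 'limit'}
    q.searchTerms = 'zx81'
    q.count = 25
    print q.url()  # the template with the assigned macros filled in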

View File: calibre/utils/opensearch/results.py (deleted)

@ -1,131 +0,0 @@
import osfeedparser
class Results(object):
def __init__(self, query, agent=None):
self.agent = agent
self._fetch(query)
self._iter = 0
def __iter__(self):
self._iter = 0
return self
def __len__(self):
return self.totalResults
def next(self):
# just keep going like the energizer bunny
while True:
# return any item we haven't returned
if self._iter < len(self.items):
self._iter += 1
return self.items[self._iter-1]
# if there appears to be more to fetch
if \
self.totalResults != 0 \
and self.totalResults > self.startIndex + self.itemsPerPage - 1:
# get the next query
next_query = self._get_next_query()
# if we got one executed it and go back to the beginning
if next_query:
self._fetch(next_query)
# very important to reset this counter
# or else the return will fail
self._iter = 0
else:
raise StopIteration
def _fetch(self, query):
feed = osfeedparser.opensearch_parse(query.url(), agent=self.agent)
self.feed = feed
# general channel stuff
channel = feed['feed']
self.title = _pick(channel,'title')
self.link = _pick(channel,'link')
self.description = _pick(channel,'description')
self.language = _pick(channel,'language')
self.copyright = _pick(channel,'copyright')
# get back opensearch specific values
self.totalResults = _pick(channel,'opensearch_totalresults',0)
self.startIndex = _pick(channel,'opensearch_startindex',1)
self.itemsPerPage = _pick(channel,'opensearch_itemsperpage',0)
# alias items from the feed to our results object
self.items = feed['items']
# set default values if necessary
if self.startIndex == 0:
self.startIndex = 1
if self.itemsPerPage == 0 and len(self.items) > 0:
self.itemsPerPage = len(self.items)
# store away query for calculating next results
# if necessary
self.last_query = query
def _get_next_query(self):
# update our query to get the next set of records
query = self.last_query
# use start page if the query supports it
if query.has_macro('startPage'):
# if the query already defined the startPage
# we just need to increment it
if hasattr(query, 'startPage'):
query.startPage += 1
# to issue the first query startPage might not have
# been specified, so set it to 2
else:
query.startPage = 2
return query
# otherwise the query should support startIndex
elif query.has_macro('startIndex'):
# if startIndex was used before we just add the
# items per page to it to get the next set
if hasattr(query, 'startIndex'):
query.startIndex += self.itemsPerPage
# to issue the first query the startIndex may have
# been left blank in that case we assume it to be
# the item just after the last one on this page
else:
query.startIndex = self.itemsPerPage + 1
return query
# doesn't look like there is another stage to this query
return None
# helper for pulling values out of a dictionary if they're there
# and returning a default value if they're not
def _pick(d,key,default=None):
# get the value out
value = d.get(key)
# if it wasn't there return the default
if value == None:
return default
# if they want an int try to convert to an int
# and return default if it fails
if type(default) == int:
try:
return int(d[key])
except:
return default
# otherwise we're good to return the value
return value
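The int coercion means opensearch counters arrive as usable numbers; for instance:

    _pick({'opensearch_totalresults': '120'}, 'opensearch_totalresults', 0)  # -> 120
    _pick({}, 'language')                                                    # -> None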

View File: calibre/utils/opensearch/url.py

@ -1,5 +1,15 @@
class URL:
"""Class for representing a URL in an opensearch v1.1 query"""
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2006, Ed Summers <ehs@pobox.com>'
__docformat__ = 'restructuredtext en'
class URL(object):
'''
Class for representing a URL in an opensearch v1.1 query
'''
def __init__(self, type='', template='', method='GET'):
self.type = type
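A URL object's template is what ultimately feeds a Query; a short sketch of the two classes together, assuming d is an already loaded Description:

    url = d.get_url_by_type('application/atom+xml')
    if url is not None:
        q = Query(url.template)  # url.type and url.method are also available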