Store: OpenSearch-based store base class. OpenSearch module added. Make some OPDS stores use the new OpenSearchStore class.

John Schember 2011-06-26 10:32:17 -04:00
parent 4c6aa0364f
commit 8ae7d310e8
11 changed files with 3311 additions and 135 deletions

View File (ArchiveOrgStore plugin)

@@ -6,84 +6,35 @@
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import urllib
 from contextlib import closing
 
 from lxml import html
 
-from PyQt4.Qt import QUrl
-
-from calibre import browser, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
+from calibre import browser
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
 
-class ArchiveOrgStore(BasicStoreConfig, StorePlugin):
+class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://www.archive.org/details/texts'
-
-        if detail_item:
-            detail_item = url_slash_cleaner('http://www.archive.org' + detail_item)
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_item)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://bookserver.archive.org/catalog/opensearch.xml'
+    web_url = 'http://www.archive.org/details/texts'
+
+    # http://bookserver.archive.org/catalog/
 
     def search(self, query, max_results=10, timeout=60):
-        query = query + ' AND mediatype:texts'
-        url = 'http://www.archive.org/search.php?query=' + urllib.quote(query)
-
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//td[@class="hitCell"]'):
-                if counter <= 0:
-                    break
-
-                id = ''.join(data.xpath('.//a[@class="titleLink"]/@href'))
-                if not id:
-                    continue
-
-                title = ''.join(data.xpath('.//a[@class="titleLink"]//text()'))
-                authors = data.xpath('.//text()')
-                if not authors:
-                    continue
-                author = None
-                for a in authors:
-                    if '-' in a:
-                        author = a.replace('-', ' ').strip()
-                        if author:
-                            break
-                if not author:
-                    continue
-
-                counter -= 1
-
-                s = SearchResult()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = '$0.00'
-                s.detail_item = id.strip()
-                s.drm = SearchResult.DRM_UNLOCKED
-
-                yield s
+        for s in OpenSearchStore.search(self, query, max_results, timeout):
+            s.detail_item = 'http://www.archive.org/details/' + s.detail_item.split(':')[-1]
+            s.price = '$0.00'
+            s.drm = SearchResult.DRM_UNLOCKED
+            yield s
 
+    '''
     def get_details(self, search_result, timeout):
-        url = url_slash_cleaner('http://www.archive.org' + search_result.detail_item)
-
         br = browser()
-        with closing(br.open(url, timeout=timeout)) as nf:
+        with closing(br.open(search_result.detail_item, timeout=timeout)) as nf:
             idata = html.fromstring(nf.read())
             formats = ', '.join(idata.xpath('//p[@id="dl" and @class="content"]//a/text()'))
             search_result.formats = formats.upper()
         return True
+    '''
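Note: the rewritten search() above assumes the OPDS entry id from the Archive.org catalog is a colon-separated urn whose last component is the item identifier; the id value in this sketch is hypothetical, not taken from a live feed:

    # hypothetical urn-style id; real values come from bookserver.archive.org
    entry_id = 'urn:x-internet-archive:item:exampleitem00'
    detail_url = 'http://www.archive.org/details/' + entry_id.split(':')[-1]
    # -> 'http://www.archive.org/details/exampleitem00'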

View File (calibre/gui2/store/opensearch_store.py)

@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-

from __future__ import (unicode_literals, division, absolute_import, print_function)

__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import mimetypes
import urllib

from PyQt4.Qt import QUrl

from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.opensearch import Client

class OpenSearchStore(StorePlugin):

    # subclasses must set both of these
    open_search_url = ''
    web_url = ''

    def open(self, parent=None, detail_item=None, external=False):
        if external or self.config.get('open_external', False):
            open_url(QUrl(detail_item if detail_item else self.web_url))
        else:
            d = WebStoreDialog(self.gui, self.web_url, parent, detail_item)
            d.setWindowTitle(self.name)
            d.set_tags(self.config.get('tags', ''))
            d.exec_()

    def search(self, query, max_results=10, timeout=60):
        if not self.open_search_url:
            return

        client = Client(self.open_search_url)
        results = client.search(urllib.quote_plus(query), max_results)

        counter = max_results
        for r in results:
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()

            s.detail_item = r.get('id', '')

            for l in r.get('links') or []:
                if l.get('rel', None):
                    if l['rel'] == u'http://opds-spec.org/image/thumbnail':
                        s.cover_url = l.get('href', '')
                    elif l['rel'] == u'http://opds-spec.org/acquisition/buy':
                        s.detail_item = l.get('href', s.detail_item)
                    elif l['rel'] == u'http://opds-spec.org/acquisition':
                        s.downloads.append((l.get('type', ''), l.get('href', '')))

            formats = []
            for mime, url in s.downloads:
                ext = mimetypes.guess_extension(mime)
                if ext:
                    formats.append(ext[1:])
            s.formats = ', '.join(formats)

            s.title = r.get('title', '')
            s.author = r.get('author', '')
            s.price = r.get('price', '')

            yield s
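Note: a minimal sketch of how a store plugin can now build on the new base class, mirroring the ArchiveOrgStore and PragmaticBookshelfStore conversions in this commit; the class name and both URLs are placeholders, not a real store:

    from calibre.gui2.store.basic_config import BasicStoreConfig
    from calibre.gui2.store.opensearch_store import OpenSearchStore
    from calibre.gui2.store.search_result import SearchResult

    class ExampleOPDSStore(BasicStoreConfig, OpenSearchStore):

        # placeholder URLs: point these at a real OpenSearch
        # description file and storefront
        open_search_url = 'http://books.example.com/opensearch.xml'
        web_url = 'http://books.example.com/'

        def search(self, query, max_results=10, timeout=60):
            # post-process the generic results with store-specific
            # knowledge, as the converted plugins do
            for s in OpenSearchStore.search(self, query, max_results, timeout):
                s.drm = SearchResult.DRM_UNLOCKED
                yield s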

View File (PragmaticBookshelfStore plugin)

@@ -6,79 +6,19 @@
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import urllib
-from contextlib import closing
-
-from lxml import html
-
-from PyQt4.Qt import QUrl
-
-from calibre import browser, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
 
-class PragmaticBookshelfStore(BasicStoreConfig, StorePlugin):
+class PragmaticBookshelfStore(BasicStoreConfig, OpenSearchStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://pragprog.com/'
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url)))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_item)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://pragprog.com/catalog/search-description'
+    web_url = 'http://pragprog.com/'
+
+    # http://pragprog.com/catalog.opds
 
     def search(self, query, max_results=10, timeout=60):
-        '''
-        OPDS based search.
-
-        We really should get the catelog from http://pragprog.com/catalog.opds
-        and look for the application/opensearchdescription+xml entry.
-        Then get the opensearch description to get the search url and
-        format. However, we are going to be lazy and hard code it.
-        '''
-        url = 'http://pragprog.com/catalog/search?q=' + urllib.quote_plus(query)
-
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            # Use html instead of etree as html allows us
-            # to ignore the namespace easily.
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//entry'):
-                if counter <= 0:
-                    break
-
-                id = ''.join(data.xpath('.//link[@rel="http://opds-spec.org/acquisition/buy"]/@href'))
-                if not id:
-                    continue
-
-                price = ''.join(data.xpath('.//price/@currencycode')).strip()
-                price += ' '
-                price += ''.join(data.xpath('.//price/text()')).strip()
-                if not price.strip():
-                    continue
-
-                cover_url = ''.join(data.xpath('.//link[@rel="http://opds-spec.org/cover"]/@href'))
-
-                title = ''.join(data.xpath('.//title/text()'))
-                author = ''.join(data.xpath('.//author//text()'))
-
-                counter -= 1
-
-                s = SearchResult()
-                s.cover_url = cover_url
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = id.strip()
-                s.drm = SearchResult.DRM_UNLOCKED
-                s.formats = 'EPUB, PDF, MOBI'
-
-                yield s
+        for s in OpenSearchStore.search(self, query, max_results, timeout):
+            s.drm = SearchResult.DRM_UNLOCKED
+            s.formats = 'EPUB, PDF, MOBI'
+            yield s

View File (calibre/gui2/store/search_result.py)

@@ -22,6 +22,7 @@ class SearchResult(object):
         self.detail_item = ''
         self.drm = None
         self.formats = ''
+        self.downloads = []
         self.affiliate = False
         self.plugin_author = ''
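Note: the new downloads list holds (MIME type, URL) pairs, appended by OpenSearchStore.search() above from OPDS acquisition links; a small illustration of how formats are then derived (all values here are made up):

    import mimetypes

    downloads = [('application/pdf', 'http://example.com/book.pdf'),
                 ('application/epub+zip', 'http://example.com/book.epub')]
    formats = []
    for mime, url in downloads:
        ext = mimetypes.guess_extension(mime)  # None for types the platform doesn't know
        if ext:
            formats.append(ext[1:])
    print ', '.join(formats)  # e.g. 'pdf' (epub may be unknown to stock mimetypes)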

View File (calibre/utils/opensearch/__init__.py)

@@ -0,0 +1,4 @@
from description import Description
from query import Query
from client import Client
from results import Results

View File (calibre/utils/opensearch/client.py)

@@ -0,0 +1,39 @@
from description import Description
from query import Query
from results import Results

class Client:

    """This is the class you'll probably want to be using. You simply
    pass the constructor the url for the service description file and
    issue a search, and you get back results as an iterable Results object.

    The neat thing about a Results object is that it seamlessly handles
    fetching more results from the opensearch server when it can, so you
    just need to iterate and let the paging be taken care of for you.

        from opensearch import Client
        client = Client(description_url)
        results = client.search("computer")
        for result in results:
            print result.title
    """

    def __init__(self, url, agent="python-opensearch <https://github.com/edsu/opensearch>"):
        self.agent = agent
        self.description = Description(url, self.agent)

    def search(self, search_terms, page_size=25):
        """Perform a search and get back a results object
        """
        url = self.description.get_best_template()
        query = Query(url)

        # set up initial values
        query.searchTerms = search_terms
        query.count = page_size

        # run the query and return an iterable Results object
        return Results(query, agent=self.agent)
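Note: a network-dependent sketch of Client usage, pointed at the Archive.org description URL that archive_org_plugin.py uses in this commit:

    from calibre.utils.opensearch import Client

    client = Client('http://bookserver.archive.org/catalog/opensearch.xml')
    results = client.search('dickens', page_size=10)
    count = 0
    for result in results:
        print result.title   # feedparser-style entries allow attribute access
        count += 1
        if count >= 10:      # cap it, since Results keeps paging on its own
            break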

View File (calibre/utils/opensearch/description.py)

@@ -0,0 +1,127 @@
from urllib2 import urlopen, Request
from xml.dom.minidom import parse
from url import URL

class Description:

    """A class for representing OpenSearch Description files.
    """

    def __init__(self, url="", agent=""):
        """The constructor which may pass an optional url to load from.

            d = Description("http://www.example.com/description")
        """
        self.agent = agent
        if url:
            self.load(url)

    def load(self, url):
        """For loading up a description object from a url. Normally
        you'll probably just want to pass a URL into the constructor.
        """
        req = Request(url, headers={'User-Agent':self.agent})
        self.dom = parse(urlopen(req))

        # version 1.1 has repeating Url elements
        self.urls = self._get_urls()

        # this is version 1.0 specific
        self.url = self._get_element_text('Url')
        self.format = self._get_element_text('Format')

        self.shortname = self._get_element_text('ShortName')
        self.longname = self._get_element_text('LongName')
        self.description = self._get_element_text('Description')
        self.image = self._get_element_text('Image')
        self.samplesearch = self._get_element_text('SampleSearch')
        self.developer = self._get_element_text('Developer')
        self.contact = self._get_element_text('Contact')
        self.attribution = self._get_element_text('Attribution')
        self.syndicationright = self._get_element_text('SyndicationRight')

        tag_text = self._get_element_text('Tags')
        if tag_text is not None:
            self.tags = tag_text.split(" ")
        else:
            self.tags = []

        if self._get_element_text('AdultContent') == 'true':
            self.adultcontent = True
        else:
            self.adultcontent = False

    def get_url_by_type(self, type):
        """Walks available urls and returns them by type. Only
        appropriate in opensearch v1.1 where there can be multiple
        query targets. Returns None if no such type is found.

            url = description.get_url_by_type('application/rss+xml')
        """
        for url in self.urls:
            if url.type == type:
                return url
        return None

    def get_best_template(self):
        """OK, best is a value judgement, but so be it. You'll get
        back either the atom, rss or first template available. This
        method handles the main difference between opensearch v1.0 and v1.1
        """
        # version 1.0
        if self.url:
            return self.url
        # atom
        if self.get_url_by_type('application/atom+xml'):
            return self.get_url_by_type('application/atom+xml').template
        # rss
        if self.get_url_by_type('application/rss+xml'):
            return self.get_url_by_type('application/rss+xml').template
        # other possible rss type
        if self.get_url_by_type('text/xml'):
            return self.get_url_by_type('text/xml').template
        # otherwise just the first one
        if len(self.urls) > 0:
            return self.urls[0].template
        # out of luck
        return None

    # these are internal methods for querying xml

    def _get_element_text(self, tag):
        elements = self._get_elements(tag)
        if not elements:
            return None
        return self._get_text(elements[0].childNodes)

    def _get_attribute_text(self, tag, attribute):
        elements = self._get_elements(tag)
        if not elements:
            return ''
        return elements[0].getAttribute(attribute)

    def _get_elements(self, tag):
        return self.dom.getElementsByTagName(tag)

    def _get_text(self, nodes):
        text = ''
        for node in nodes:
            if node.nodeType == node.TEXT_NODE:
                text += node.data
        return text.strip()

    def _get_urls(self):
        urls = []
        for element in self._get_elements('Url'):
            template = element.getAttribute('template')
            type = element.getAttribute('type')
            if template and type:
                url = URL()
                url.template = template
                url.type = type
                urls.append(url)
        return urls
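Note: a quick sketch of using Description directly (network required); the URL is the same description file the Archive.org plugin above points at:

    from calibre.utils.opensearch import Description

    d = Description('http://bookserver.archive.org/catalog/opensearch.xml')
    print d.shortname
    # prefers a v1.0 Url, then the atom template, then rss, then the first Url element
    print d.get_best_template()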

File diff suppressed because it is too large (presumably calibre/utils/opensearch/osfeedparser.py, the module imported by results.py below)

View File (calibre/utils/opensearch/query.py)

@@ -0,0 +1,66 @@
from urlparse import urlparse, urlunparse
from urllib import urlencode
from cgi import parse_qs

class Query:

    """Represents an opensearch query. Used internally by the Client to
    construct an opensearch url to request. Really this class is just a
    helper for substituting values into the macros in a format.

        format = 'http://beta.indeed.com/opensearch?q={searchTerms}&start={startIndex}&limit={count}'
        q = Query(format)
        q.searchTerms = 'zx81'
        q.startIndex = 1
        q.count = 25
        print q.url()
    """

    standard_macros = ['searchTerms','count','startIndex','startPage',
        'language', 'outputEncoding', 'inputEncoding']

    def __init__(self, format):
        """Create a query object by passing it the url format obtained
        from the opensearch Description.
        """
        self.format = format

        # unpack the url to a tuple
        self.url_parts = urlparse(format)

        # unpack the query string to a dictionary
        self.query_string = parse_qs(self.url_parts[4])

        # look for standard macros and create a mapping of the
        # opensearch names to the service specific ones,
        # so q={searchTerms} will result in a mapping between searchTerms and q
        self.macro_map = {}
        for key, values in self.query_string.items():
            # TODO eventually optional/required params should be
            # distinguished somehow (the ones with/without a trailing ?)
            macro = values[0].replace('{','').replace('}','').replace('?','')
            if macro in Query.standard_macros:
                self.macro_map[macro] = key

    def url(self):
        # copy the original query string
        query_string = dict(self.query_string)

        # iterate through macros and set the position in the querystring
        for macro, name in self.macro_map.items():
            if hasattr(self, macro):
                # set the name/value pair
                query_string[name] = [getattr(self, macro)]
            else:
                # remove the name/value pair
                del(query_string[name])

        # copy the url parts and substitute in our new query string
        url_parts = list(self.url_parts)
        url_parts[4] = urlencode(query_string, 1)

        # recompose and return url
        return urlunparse(tuple(url_parts))

    def has_macro(self, macro):
        return self.macro_map.has_key(macro)
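Note: a sketch of the macro substitution Query performs; the template here is hypothetical, and the parameter order of the generated query string is not guaranteed:

    from calibre.utils.opensearch import Query

    q = Query('http://books.example.com/search?q={searchTerms}&page={startPage?}')
    q.searchTerms = 'dickens'
    print q.url()   # e.g. http://books.example.com/search?q=dickens (page dropped while unset)
    q.startPage = 2
    print q.url()   # now also carries page=2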

View File (calibre/utils/opensearch/results.py)

@@ -0,0 +1,131 @@
import osfeedparser

class Results(object):

    def __init__(self, query, agent=None):
        self.agent = agent
        self._fetch(query)
        self._iter = 0

    def __iter__(self):
        self._iter = 0
        return self

    def __len__(self):
        return self.totalResults

    def next(self):

        # just keep going like the energizer bunny
        while True:

            # return any item we haven't returned
            if self._iter < len(self.items):
                self._iter += 1
                return self.items[self._iter-1]

            # if there appears to be more to fetch
            if \
                self.totalResults != 0 \
                and self.totalResults > self.startIndex + self.itemsPerPage - 1:

                # get the next query
                next_query = self._get_next_query()

                # if we got one, execute it and go back to the beginning
                if next_query:
                    self._fetch(next_query)
                    # very important to reset this counter
                    # or else the return will fail
                    self._iter = 0
            else:
                raise StopIteration

    def _fetch(self, query):
        feed = osfeedparser.opensearch_parse(query.url(), agent=self.agent)
        self.feed = feed

        # general channel stuff
        channel = feed['feed']
        self.title = _pick(channel,'title')
        self.link = _pick(channel,'link')
        self.description = _pick(channel,'description')
        self.language = _pick(channel,'language')
        self.copyright = _pick(channel,'copyright')

        # get back opensearch specific values
        self.totalResults = _pick(channel,'opensearch_totalresults',0)
        self.startIndex = _pick(channel,'opensearch_startindex',1)
        self.itemsPerPage = _pick(channel,'opensearch_itemsperpage',0)

        # alias items from the feed to our results object
        self.items = feed['items']

        # set default values if necessary
        if self.startIndex == 0:
            self.startIndex = 1
        if self.itemsPerPage == 0 and len(self.items) > 0:
            self.itemsPerPage = len(self.items)

        # store away query for calculating next results
        # if necessary
        self.last_query = query

    def _get_next_query(self):
        # update our query to get the next set of records
        query = self.last_query

        # use start page if the query supports it
        if query.has_macro('startPage'):
            # if the query already defined the startPage
            # we just need to increment it
            if hasattr(query, 'startPage'):
                query.startPage += 1
            # to issue the first query startPage might not have
            # been specified, so set it to 2
            else:
                query.startPage = 2
            return query

        # otherwise the query should support startIndex
        elif query.has_macro('startIndex'):
            # if startIndex was used before we just add the
            # items per page to it to get the next set
            if hasattr(query, 'startIndex'):
                query.startIndex += self.itemsPerPage
            # to issue the first query the startIndex may have
            # been left blank; in that case we assume it to be
            # the item just after the last one on this page
            else:
                query.startIndex = self.itemsPerPage + 1
            return query

        # doesn't look like there is another stage to this query
        return None

# helper for pulling values out of a dictionary if they're there
# and returning a default value if they're not
def _pick(d, key, default=None):

    # get the value out
    value = d.get(key)

    # if it wasn't there return the default
    if value is None:
        return default

    # if they want an int try to convert to an int
    # and return default if it fails
    if type(default) == int:
        try:
            return int(d[key])
        except:
            return default

    # otherwise we're good to return the value
    return value
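Note: a sketch of the transparent paging (network required); description_url is a placeholder for a real OpenSearch description file:

    from calibre.utils.opensearch import Client

    results = Client(description_url).search('computer', page_size=25)
    print len(results)     # __len__ reports the feed's opensearch totalResults
    for result in results:
        # next() silently fetches the following page via _get_next_query()
        print result.title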

View File (calibre/utils/opensearch/url.py)

@@ -0,0 +1,8 @@
class URL:

    """Class for representing a URL in an opensearch v1.1 query"""

    def __init__(self, type='', template='', method='GET'):
        self.type = type
        self.template = template
        self.method = method
        self.params = []