mirror of https://github.com/kovidgoyal/calibre.git
Store: Manybooks now uses the OPDS feed (faster, more accurate, fixes covers not showing in many cases, and fixes the formats list). Opensearch: support creating search URLs from Stanza catalogs. Store: opensearch-based classes don't need to quote the search terms, as the opensearch module does this already.
commit 3e0797872c
parent 58ca9bc7d0
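The quoting change is easy to demonstrate. Per the commit message, the opensearch Query class quotes searchTerms itself when building the URL, so a store plugin that pre-quotes the terms ends up double-encoding them. A minimal sketch of the failure mode, using only the stdlib:

    import urllib

    query = 'war & peace'
    once = urllib.quote_plus(query)   # 'war+%26+peace'  -- what the feed should receive
    twice = urllib.quote_plus(once)   # 'war%2B%2526%2Bpeace'  -- double-encoded garbage

    # Before this commit the store plugins applied the first quote_plus() and
    # the opensearch module then applied its own quoting, producing `twice`.
    # Assigning the raw string, as the diffs below do, quotes exactly once.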
@@ -7,7 +7,6 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
 import mimetypes
-import urllib
 from contextlib import closing
 
 from lxml import etree
@@ -50,7 +49,7 @@ class OpenSearchOPDSStore(StorePlugin):
         oquery = Query(url_template)
 
         # set up initial values
-        oquery.searchTerms = urllib.quote_plus(query)
+        oquery.searchTerms = query
         oquery.count = max_results
         url = oquery.url()
 
@@ -99,7 +98,3 @@ class OpenSearchOPDSStore(StorePlugin):
 
                 yield s
 
-
-class OpenSearchOPDSDetailStore(OpenSearchOPDSStore):
-
-    pass
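With the quoting handled centrally and OpenSearchOPDSDetailStore removed, a plugin for a well-behaved OPDS feed can reduce to a bare subclass carrying the two class attributes the ManyBooks diff below introduces. A hypothetical example (example.com stands in for a real store; the attributes are presumably what the inherited search() and open() consume):

    from calibre.gui2.store.basic_config import BasicStoreConfig
    from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore

    class ExampleOPDSStore(BasicStoreConfig, OpenSearchOPDSStore):

        open_search_url = 'http://example.com/opds/'  # hypothetical feed location
        web_url = 'http://example.com'

ManyBooks itself cannot use the inherited search() because its feed is malformed, so it keeps its own override below.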
@@ -6,89 +6,101 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import re
 import urllib
+import mimetypes
 from contextlib import closing
 
-from lxml import html
+from lxml import etree
 
-from PyQt4.Qt import QUrl
-
-from calibre import browser, url_slash_cleaner
-from calibre.gui2 import open_url
-from calibre.gui2.store import StorePlugin
+from calibre import browser
 from calibre.gui2.store.basic_config import BasicStoreConfig
+from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
 from calibre.gui2.store.search_result import SearchResult
-from calibre.gui2.store.web_store_dialog import WebStoreDialog
+from calibre.utils.opensearch.description import Description
+from calibre.utils.opensearch.query import Query
 
-class ManyBooksStore(BasicStoreConfig, StorePlugin):
+class ManyBooksStore(BasicStoreConfig, OpenSearchOPDSStore):
 
-    def open(self, parent=None, detail_item=None, external=False):
-        url = 'http://manybooks.net/'
-
-        detail_url = None
-        if detail_item:
-            detail_url = url + detail_item
-
-        if external or self.config.get('open_external', False):
-            open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url)))
-        else:
-            d = WebStoreDialog(self.gui, url, parent, detail_url)
-            d.setWindowTitle(self.name)
-            d.set_tags(self.config.get('tags', ''))
-            d.exec_()
+    open_search_url = 'http://www.manybooks.net/opds/'
+    web_url = 'http://manybooks.net'
 
     def search(self, query, max_results=10, timeout=60):
-        # ManyBooks website separates results for title and author.
-        # It also doesn't do a clear job of references authors and
-        # secondary titles. Google is also faster.
-        # Using a google search so we can search on both fields at once.
-        url = 'http://www.google.com/xhtml?q=site:manybooks.net+' + urllib.quote_plus(query)
-
-        br = browser()
+        '''
+        Manybooks uses a very strange opds feed. The opds
+        main feed is structured like a stanza feed. The
+        search result entries give very little information
+        and requires you to go to a detail link. The detail
+        link has the wrong type specified (text/html instead
+        of application/atom+xml).
+        '''
+        if not hasattr(self, 'open_search_url'):
+            return
+
+        description = Description(self.open_search_url)
+        url_template = description.get_best_template()
+        if not url_template:
+            return
+        oquery = Query(url_template)
+
+        # set up initial values
+        oquery.searchTerms = query
+        oquery.count = max_results
+        url = oquery.url()
 
         counter = max_results
+        br = browser()
         with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
-            for data in doc.xpath('//div[@class="edewpi"]//div[@class="r ld"]'):
+            doc = etree.fromstring(f.read())
+            for data in doc.xpath('//*[local-name() = "entry"]'):
                 if counter <= 0:
                     break
-
-                url = ''
-                url_a = data.xpath('div[@class="jd"]/a')
-                if url_a:
-                    url_a = url_a[0]
-                    url = url_a.get('href', None)
-                if url:
-                    url = url.split('u=')[-1][:-2]
-                    if '/titles/' not in url:
-                        continue
-                    id = url.split('/')[-1]
-                    id = id.strip()
-
-                    url_a = html.fromstring(html.tostring(url_a))
-                    heading = ''.join(url_a.xpath('//text()'))
-                    title, _, author = heading.rpartition('by ')
-                    author = author.split('-')[0]
-                    price = '$0.00'
-
-                    cover_url = ''
-                    mo = re.match('^\D+', id)
-                    if mo:
-                        cover_name = mo.group()
-                        cover_name = cover_name.replace('etext', '')
-                        cover_id = id.split('.')[0]
-                        cover_url = 'http://www.manybooks.net/images/' + id[0] + '/' + cover_name + '/' + cover_id + '-thumb.jpg'
-
-                    counter -= 1
-
-                    s = SearchResult()
-                    s.cover_url = cover_url
-                    s.title = title.strip()
-                    s.author = author.strip()
-                    s.price = price.strip()
-                    s.detail_item = '/titles/' + id
-                    s.drm = SearchResult.DRM_UNLOCKED
-                    s.formts = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
+                counter -= 1
+
+                s = SearchResult()
+
+                detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]')
+                if not detail_links:
+                    continue
+                detail_link = detail_links[0]
+                detail_href = detail_link.get('href')
+                if not detail_href:
+                    continue
+
+                s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html'
+                # These can have HTML inside of them. We are going to get them again later
+                # just in case.
+                s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
+                s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip()
+
+                # Follow the detail link to get the rest of the info.
+                with closing(br.open(detail_href, timeout=timeout/4)) as df:
+                    ddoc = etree.fromstring(df.read())
+                    ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
+                    if ddata:
+                        ddata = ddata[0]
+
+                        # This is the real title and author info we want. We got
+                        # it previously just in case it's not specified here for some reason.
+                        s.title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip()
+                        s.author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip()
+                        if s.author.startswith(','):
+                            s.author = s.author[1:]
+                        if s.author.endswith(','):
+                            s.author = s.author[:-1]
+
+                        s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip()
+
+                        for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
+                            type = link.get('type')
+                            href = link.get('href')
+                            if type:
+                                ext = mimetypes.guess_extension(type)
+                                if ext:
+                                    ext = ext[1:].upper().strip()
+                                    s.downloads[ext] = href
+
+                s.price = '$0.00'
+                s.drm = SearchResult.DRM_UNLOCKED
+                s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
 
-                    yield s
+                yield s
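A note on the XPath style used throughout this plugin: OPDS feeds put every element in the Atom namespace, so an unqualified lxml path like //entry matches nothing. Matching on local-name() sidesteps namespace handling entirely. A self-contained check:

    from lxml import etree

    feed = ('<feed xmlns="http://www.w3.org/2005/Atom">'
            '<entry><title>Example</title></entry>'
            '</feed>')
    doc = etree.fromstring(feed)

    doc.xpath('//entry')                      # [] -- the tag is namespace-qualified
    doc.xpath('//*[local-name() = "entry"]')  # [<Element {http://www.w3.org/2005/Atom}entry>]

The trade-off is that the predicate matches an element named "entry" in any namespace, which is acceptable for feeds as loosely specified as this one.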
@@ -28,7 +28,7 @@ if not url_template:
         query = Query(url_template)
 
         # set up initial values.
-        query.searchTerms = urllib.quote_plus(search_terms)
+        query.searchTerms = search_terms
         # Note the count is ignored by some feeds.
         query.count = max_results
 
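For reference, the full URL-building sequence the store plugins now rely on, as a sketch assembled from the calls in the diffs above (the feed address is ManyBooks'; any other OpenSearch description or Stanza catalog URL should work the same way):

    from calibre.utils.opensearch.description import Description
    from calibre.utils.opensearch.query import Query

    description = Description('http://www.manybooks.net/opds/')
    url_template = description.get_best_template()

    query = Query(url_template)
    query.searchTerms = 'dickens'  # raw text; the opensearch module handles the quoting
    query.count = 10               # ignored by some feeds, as noted above
    url = query.url()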
@@ -40,7 +40,7 @@ class Description(object):
         with closing(br.open(url, timeout=15)) as f:
             doc = etree.fromstring(f.read())
 
-        # version 1.1 has repeating Url elements
+        # version 1.1 has repeating Url elements.
         self.urls = []
         for element in doc.xpath('//*[local-name() = "Url"]'):
             template = element.get('template')
@@ -50,9 +50,22 @@ class Description(object):
                 url.template = template
                 url.type = type
                 self.urls.append(url)
+        # Stanza catalogs.
+        for element in doc.xpath('//*[local-name() = "link"]'):
+            if element.get('rel') != 'search':
+                continue
+            href = element.get('href')
+            type = element.get('type')
+            if href and type:
+                url = URL()
+                url.template = href
+                url.type = type
+                self.urls.append(url)
 
-        # this is version 1.0 specific
-        self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
+        # this is version 1.0 specific.
+        self.url = ''
+        if not self.urls:
+            self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
         self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()'))
 
         self.shortname = ''.join(doc.xpath('//*[local-name() = "ShortName"][1]//text()'))
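The new loop exists because a Stanza catalog is not an OpenSearch description document at all: it is an Atom feed whose root carries a link element with rel="search" pointing at the search template. A reduced sketch of what the loop extracts (the catalog content is illustrative, not a real Stanza feed):

    from lxml import etree

    catalog = ('<feed xmlns="http://www.w3.org/2005/Atom">'
               '<link rel="search" type="application/atom+xml" '
               'href="http://example.com/stanza/search?q={searchTerms}"/>'
               '</feed>')
    doc = etree.fromstring(catalog)

    # Mirrors the new Description branch: keep only rel="search" links.
    for element in doc.xpath('//*[local-name() = "link"]'):
        if element.get('rel') == 'search':
            print element.get('href'), element.get('type')
    # http://example.com/stanza/search?q={searchTerms} application/atom+xml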