Get Books: Remove OpenLibrary since it has the same files as archive.org. Allow direct downloading from Project Gutenberg.

This commit is contained in:
Kovid Goyal 2011-07-03 10:57:37 -06:00
commit 299d477f75
13 changed files with 182 additions and 216 deletions

Binary file not shown.

View File

@ -1387,15 +1387,6 @@ class StoreOpenBooksStore(StoreBase):
drm_free_only = True
headquarters = 'US'
class StoreOpenLibraryStore(StoreBase):
name = 'Open Library'
description = u'One web page for every book ever published. The goal is to be a true online library. Over 20 million records from a variety of large catalogs as well as single contributions, with more on the way.'
actual_plugin = 'calibre.gui2.store.stores.open_library_plugin:OpenLibraryStore'
drm_free_only = True
headquarters = 'US'
formats = ['DAISY', 'DJVU', 'EPUB', 'MOBI', 'PDF', 'TXT']
class StoreOReillyStore(StoreBase):
name = 'OReilly'
description = u'Programming and tech ebooks from OReilly.'
@ -1514,7 +1505,6 @@ plugins += [
StoreMobileReadStore,
StoreNextoStore,
StoreOpenBooksStore,
StoreOpenLibraryStore,
StoreOReillyStore,
StorePragmaticBookshelfStore,
StoreSmashwordsStore,

View File

@ -7,7 +7,6 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import mimetypes
import urllib
from contextlib import closing
from lxml import etree
@ -22,7 +21,7 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.opensearch.description import Description
from calibre.utils.opensearch.query import Query
class OpenSearchStore(StorePlugin):
class OpenSearchOPDSStore(StorePlugin):
open_search_url = ''
web_url = ''
@ -50,7 +49,7 @@ class OpenSearchStore(StorePlugin):
oquery = Query(url_template)
# set up initial values
oquery.searchTerms = urllib.quote_plus(query)
oquery.searchTerms = query
oquery.count = max_results
url = oquery.url()

View File

@ -349,7 +349,8 @@ class SearchDialog(QDialog, Ui_Dialog):
d = ChooseFormatDialog(self, _('Choose format to download to your library.'), result.downloads.keys())
if d.exec_() == d.Accepted:
ext = d.format()
self.gui.download_ebook(result.downloads[ext])
fname = result.title + '.' + ext.lower()
self.gui.download_ebook(result.downloads[ext], filename=fname)
def open_store(self, result):
self.gui.istores[result.store_name].open(self, result.detail_item, self.open_external.isChecked())

View File

@ -6,12 +6,11 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchStore
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
class ArchiveOrgStore(BasicStoreConfig, OpenSearchOPDSStore):
open_search_url = 'http://bookserver.archive.org/catalog/opensearch.xml'
web_url = 'http://www.archive.org/details/texts'
@ -19,7 +18,7 @@ class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
# http://bookserver.archive.org/catalog/
def search(self, query, max_results=10, timeout=60):
for s in OpenSearchStore.search(self, query, max_results, timeout):
for s in OpenSearchOPDSStore.search(self, query, max_results, timeout):
s.detail_item = 'http://www.archive.org/details/' + s.detail_item.split(':')[-1]
s.price = '$0.00'
s.drm = SearchResult.DRM_UNLOCKED
@ -33,6 +32,7 @@ class ArchiveOrgStore(BasicStoreConfig, OpenSearchStore):
from calibre import browser
from contextlib import closing
from lxml import html
br = browser()
with closing(br.open(search_result.detail_item, timeout=timeout)) as nf:
idata = html.fromstring(nf.read())

View File

@ -7,10 +7,10 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchStore
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
class EpubBudStore(BasicStoreConfig, OpenSearchStore):
class EpubBudStore(BasicStoreConfig, OpenSearchOPDSStore):
open_search_url = 'http://www.epubbud.com/feeds/opensearch.xml'
web_url = 'http://www.epubbud.com/'
@ -18,7 +18,7 @@ class EpubBudStore(BasicStoreConfig, OpenSearchStore):
# http://www.epubbud.com/feeds/catalog.atom
def search(self, query, max_results=10, timeout=60):
for s in OpenSearchStore.search(self, query, max_results, timeout):
for s in OpenSearchOPDSStore.search(self, query, max_results, timeout):
s.price = '$0.00'
s.drm = SearchResult.DRM_UNLOCKED
s.formats = 'EPUB'

View File

@ -7,10 +7,10 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchStore
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
class FeedbooksStore(BasicStoreConfig, OpenSearchStore):
class FeedbooksStore(BasicStoreConfig, OpenSearchOPDSStore):
open_search_url = 'http://assets0.feedbooks.net/opensearch.xml?t=1253087147'
web_url = 'http://feedbooks.com/'
@ -18,7 +18,7 @@ class FeedbooksStore(BasicStoreConfig, OpenSearchStore):
# http://www.feedbooks.com/catalog
def search(self, query, max_results=10, timeout=60):
for s in OpenSearchStore.search(self, query, max_results, timeout):
for s in OpenSearchOPDSStore.search(self, query, max_results, timeout):
if s.downloads:
s.drm = SearchResult.DRM_UNLOCKED
s.price = '$0.00'

View File

@ -6,6 +6,7 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import mimetypes
import urllib
from contextlib import closing
@ -23,70 +24,67 @@ from calibre.gui2.store.web_store_dialog import WebStoreDialog
class GutenbergStore(BasicStoreConfig, StorePlugin):
def open(self, parent=None, detail_item=None, external=False):
url = 'http://m.gutenberg.org/'
ext_url = 'http://gutenberg.org/'
url = 'http://gutenberg.org/'
if detail_item:
detail_item = url_slash_cleaner(url + detail_item)
if external or self.config.get('open_external', False):
if detail_item:
ext_url = ext_url + detail_item
open_url(QUrl(url_slash_cleaner(ext_url)))
open_url(QUrl(detail_item if detail_item else url))
else:
detail_url = None
if detail_item:
detail_url = url + detail_item
d = WebStoreDialog(self.gui, url, parent, detail_url)
d = WebStoreDialog(self.gui, url, parent, detail_item)
d.setWindowTitle(self.name)
d.set_tags(self.config.get('tags', ''))
d.exec_()
def search(self, query, max_results=10, timeout=60):
# Gutenberg's website does not allow searching both author and title.
# Using a google search so we can search on both fields at once.
url = 'http://www.google.com/xhtml?q=site:gutenberg.org+' + urllib.quote_plus(query)
url = 'http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all&sort_order=title&query=' + urllib.quote_plus(query)
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
for data in doc.xpath('//div[@class="edewpi"]//div[@class="r ld"]'):
for data in doc.xpath('//ol[@class="results"]//li[contains(@class, "icon_title")]'):
if counter <= 0:
break
id = ''.join(data.xpath('./a/@href'))
id = id.split('.mobile')[0]
url = ''
url_a = data.xpath('div[@class="jd"]/a')
if url_a:
url_a = url_a[0]
url = url_a.get('href', None)
if url:
url = url.split('u=')[-1].split('&')[0]
if '/ebooks/' not in url:
continue
id = url.split('/')[-1]
url_a = html.fromstring(html.tostring(url_a))
heading = ''.join(url_a.xpath('//text()'))
title, _, author = heading.rpartition('by ')
author = author.split('-')[0]
price = '$0.00'
title = ''.join(data.xpath('.//span[@class="title"]/text()'))
author = ''.join(data.xpath('.//span[@class="subtitle"]/text()'))
counter -= 1
s = SearchResult()
s.cover_url = ''
s.detail_item = id.strip()
s.title = title.strip()
s.author = author.strip()
s.price = price.strip()
s.detail_item = '/ebooks/' + id.strip()
s.price = '$0.00'
s.drm = SearchResult.DRM_UNLOCKED
yield s
def get_details(self, search_result, timeout):
url = 'http://m.gutenberg.org/'
url = url_slash_cleaner('http://m.gutenberg.org/' + search_result.detail_item + '.mobile')
br = browser()
with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf:
idata = html.fromstring(nf.read())
search_result.formats = ', '.join(idata.xpath('//a[@type!="application/atom+xml"]//span[@class="title"]/text()'))
return True
with closing(br.open(url, timeout=timeout)) as nf:
doc = html.fromstring(nf.read())
for save_item in doc.xpath('//li[contains(@class, "icon_save")]/a'):
type = save_item.get('type')
href = save_item.get('href')
if type:
ext = mimetypes.guess_extension(type)
if ext:
ext = ext[1:].upper().strip()
search_result.downloads[ext] = href
search_result.formats = ', '.join(search_result.downloads.keys())
return True

View File

@ -6,89 +6,101 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
import urllib
import mimetypes
from contextlib import closing
from lxml import html
from lxml import etree
from PyQt4.Qt import QUrl
from calibre import browser, url_slash_cleaner
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre import browser
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog
from calibre.utils.opensearch.description import Description
from calibre.utils.opensearch.query import Query
class ManyBooksStore(BasicStoreConfig, StorePlugin):
class ManyBooksStore(BasicStoreConfig, OpenSearchOPDSStore):
def open(self, parent=None, detail_item=None, external=False):
url = 'http://manybooks.net/'
detail_url = None
if detail_item:
detail_url = url + detail_item
if external or self.config.get('open_external', False):
open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url)))
else:
d = WebStoreDialog(self.gui, url, parent, detail_url)
d.setWindowTitle(self.name)
d.set_tags(self.config.get('tags', ''))
d.exec_()
open_search_url = 'http://www.manybooks.net/opds/'
web_url = 'http://manybooks.net'
def search(self, query, max_results=10, timeout=60):
# ManyBooks website separates results for title and author.
# It also doesn't do a clear job of references authors and
# secondary titles. Google is also faster.
# Using a google search so we can search on both fields at once.
url = 'http://www.google.com/xhtml?q=site:manybooks.net+' + urllib.quote_plus(query)
'''
Manybooks uses a very strange opds feed. The opds
main feed is structured like a stanza feed. The
search result entries give very little information
and requires you to go to a detail link. The detail
link has the wrong type specified (text/html instead
of application/atom+xml).
'''
if not hasattr(self, 'open_search_url'):
return
br = browser()
description = Description(self.open_search_url)
url_template = description.get_best_template()
if not url_template:
return
oquery = Query(url_template)
# set up initial values
oquery.searchTerms = query
oquery.count = max_results
url = oquery.url()
counter = max_results
br = browser()
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
for data in doc.xpath('//div[@class="edewpi"]//div[@class="r ld"]'):
doc = etree.fromstring(f.read())
for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0:
break
url = ''
url_a = data.xpath('div[@class="jd"]/a')
if url_a:
url_a = url_a[0]
url = url_a.get('href', None)
if url:
url = url.split('u=')[-1][:-2]
if '/titles/' not in url:
continue
id = url.split('/')[-1]
id = id.strip()
url_a = html.fromstring(html.tostring(url_a))
heading = ''.join(url_a.xpath('//text()'))
title, _, author = heading.rpartition('by ')
author = author.split('-')[0]
price = '$0.00'
cover_url = ''
mo = re.match('^\D+', id)
if mo:
cover_name = mo.group()
cover_name = cover_name.replace('etext', '')
cover_id = id.split('.')[0]
cover_url = 'http://www.manybooks.net/images/' + id[0] + '/' + cover_name + '/' + cover_id + '-thumb.jpg'
counter -= 1
s = SearchResult()
s.cover_url = cover_url
s.title = title.strip()
s.author = author.strip()
s.price = price.strip()
s.detail_item = '/titles/' + id
detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]')
if not detail_links:
continue
detail_link = detail_links[0]
detail_href = detail_link.get('href')
if not detail_href:
continue
s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html'
# These can have HTML inside of them. We are going to get them again later
# just in case.
s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip()
# Follow the detail link to get the rest of the info.
with closing(br.open(detail_href, timeout=timeout/4)) as df:
ddoc = etree.fromstring(df.read())
ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
if ddata:
ddata = ddata[0]
# This is the real title and author info we want. We got
# it previously just in case it's not specified here for some reason.
s.title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip()
s.author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip()
if s.author.startswith(','):
s.author = s.author[1:]
if s.author.endswith(','):
s.author = s.author[:-1]
s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip()
for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
type = link.get('type')
href = link.get('href')
if type:
ext = mimetypes.guess_extension(type)
if ext:
ext = ext[1:].upper().strip()
s.downloads[ext] = href
s.price = '$0.00'
s.drm = SearchResult.DRM_UNLOCKED
s.formts = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
yield s

View File

@ -1,84 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import urllib2
from contextlib import closing
from lxml import html
from PyQt4.Qt import QUrl
from calibre import browser, url_slash_cleaner
from calibre.gui2 import open_url
from calibre.gui2.store import StorePlugin
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog
class OpenLibraryStore(BasicStoreConfig, StorePlugin):
def open(self, parent=None, detail_item=None, external=False):
url = 'http://openlibrary.org/'
if external or self.config.get('open_external', False):
if detail_item:
url = url + detail_item
open_url(QUrl(url_slash_cleaner(url)))
else:
detail_url = None
if detail_item:
detail_url = url + detail_item
d = WebStoreDialog(self.gui, url, parent, detail_url)
d.setWindowTitle(self.name)
d.set_tags(self.config.get('tags', ''))
d.exec_()
def search(self, query, max_results=10, timeout=60):
url = 'http://openlibrary.org/search?q=' + urllib2.quote(query) + '&has_fulltext=true'
br = browser()
counter = max_results
with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())
for data in doc.xpath('//div[@id="searchResults"]/ul[@id="siteSearch"]/li'):
if counter <= 0:
break
# Don't include books that don't have downloadable files.
if not data.xpath('boolean(./span[@class="actions"]//span[@class="label" and contains(text(), "Read")])'):
continue
id = ''.join(data.xpath('./span[@class="bookcover"]/a/@href'))
if not id:
continue
cover_url = ''.join(data.xpath('./span[@class="bookcover"]/a/img/@src'))
title = ''.join(data.xpath('.//h3[@class="booktitle"]/a[@class="results"]/text()'))
author = ''.join(data.xpath('.//span[@class="bookauthor"]/a/text()'))
price = '$0.00'
counter -= 1
s = SearchResult()
s.cover_url = cover_url
s.title = title.strip()
s.author = author.strip()
s.price = price
s.detail_item = id.strip()
s.drm = SearchResult.DRM_UNLOCKED
yield s
def get_details(self, search_result, timeout):
url = 'http://openlibrary.org/'
br = browser()
with closing(br.open(url_slash_cleaner(url + search_result.detail_item), timeout=timeout)) as nf:
idata = html.fromstring(nf.read())
search_result.formats = ', '.join(list(set(idata.xpath('//a[contains(@title, "Download")]/text()'))))
return True

View File

@ -7,10 +7,10 @@ __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchStore
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
class PragmaticBookshelfStore(BasicStoreConfig, OpenSearchStore):
class PragmaticBookshelfStore(BasicStoreConfig, OpenSearchOPDSStore):
open_search_url = 'http://pragprog.com/catalog/search-description'
web_url = 'http://pragprog.com/'
@ -18,7 +18,7 @@ class PragmaticBookshelfStore(BasicStoreConfig, OpenSearchStore):
# http://pragprog.com/catalog.opds
def search(self, query, max_results=10, timeout=60):
for s in OpenSearchStore.search(self, query, max_results, timeout):
for s in OpenSearchOPDSStore.search(self, query, max_results, timeout):
s.drm = SearchResult.DRM_UNLOCKED
s.formats = 'EPUB, PDF, MOBI'
yield s

View File

@ -0,0 +1,37 @@
'''
Based on the OpenSearch Python module by Ed Summers <ehs@pobox.com> from
https://github.com/edsu/opensearch .
This module is heavily modified and does not implement all the features from
the original. The ability for the the module to perform a search and retrieve
search results has been removed. The original module used a modified version
of the Universal feed parser from http://feedparser.org/ . The use of
FeedPaser made getting search results very slow. There is also a bug in the
modified FeedParser that causes the system to run out of file descriptors.
Instead of fixing the modified feed parser it was decided to remove it and
manually parse the feeds in a set of type specific classes. This is much
faster and as we know in advance the feed format is simpler than using
FeedParser. Also, replacing the modified FeedParser with the newest version
of FeedParser caused some feeds to be parsed incorrectly and result in a loss
of data.
The module was also rewritten to use lxml instead of MiniDom.
Usage:
description = Description(open_search_url)
url_template = description.get_best_template()
if not url_template:
return
query = Query(url_template)
# set up initial values.
query.searchTerms = search_terms
# Note the count is ignored by some feeds.
query.count = max_results
search_url = oquery.url()
'''

View File

@ -40,7 +40,7 @@ class Description(object):
with closing(br.open(url, timeout=15)) as f:
doc = etree.fromstring(f.read())
# version 1.1 has repeating Url elements
# version 1.1 has repeating Url elements.
self.urls = []
for element in doc.xpath('//*[local-name() = "Url"]'):
template = element.get('template')
@ -50,9 +50,22 @@ class Description(object):
url.template = template
url.type = type
self.urls.append(url)
# Stanza catalogs.
for element in doc.xpath('//*[local-name() = "link"]'):
if element.get('rel') != 'search':
continue
href = element.get('href')
type = element.get('type')
if href and type:
url = URL()
url.template = href
url.type = type
self.urls.append(url)
# this is version 1.0 specific
self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
# this is version 1.0 specific.
self.url = ''
if not self.urls:
self.url = ''.join(doc.xpath('//*[local-name() = "Url"][1]//text()'))
self.format = ''.join(doc.xpath('//*[local-name() = "Format"][1]//text()'))
self.shortname = ''.join(doc.xpath('//*[local-name() = "ShortName"][1]//text()'))