Get Books: Add XinXii store. Metadata download plugin for ozon.ru, enabled only when user selects russian as their language in the welcome wizard.

This commit is contained in:
Kovid Goyal 2011-08-18 16:53:53 -06:00
commit 65a2931f68
8 changed files with 566 additions and 10 deletions

View File

@ -590,8 +590,9 @@ from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
from calibre.ebooks.metadata.sources.douban import Douban
from calibre.ebooks.metadata.sources.ozon import Ozon
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]
# }}}
@ -1476,6 +1477,14 @@ class StoreWoblinkStore(StoreBase):
headquarters = 'PL'
formats = ['EPUB']
class XinXiiStore(StoreBase):
    # Store registration entry for the XinXii e-book shop.
    name = 'XinXii'
    # NOTE(review): description is empty here; confirm whether a
    # user-visible description should be supplied.
    description = ''
    # "module:class" path of the implementing store class -- presumably
    # resolved lazily by StoreBase; verify against StoreBase.
    actual_plugin = 'calibre.gui2.store.stores.xinxii_plugin:XinXiiStore'

    # Two-letter country code of the store's headquarters.
    headquarters = 'DE'
    # E-book formats offered by the store.
    formats = ['EPUB', 'PDF']
class StoreZixoStore(StoreBase):
name = 'Zixo'
author = u'Tomasz Długosz'
@ -1524,6 +1533,7 @@ plugins += [
StoreWHSmithUKStore,
StoreWizardsTowerBooksStore,
StoreWoblinkStore,
XinXiiStore,
StoreZixoStore
]

View File

@ -92,7 +92,7 @@ def restore_plugin_state_to_default(plugin_or_name):
config['enabled_plugins'] = ep
default_disabled_plugins = set([
'Overdrive', 'Douban Books',
'Overdrive', 'Douban Books', 'OZON.ru',
])
def is_disabled(plugin):

View File

@ -11,7 +11,7 @@ from functools import partial
from base64 import b64decode
from lxml import etree
from calibre.utils.date import parse_date
from calibre import guess_all_extensions, prints, force_unicode
from calibre import guess_type, guess_all_extensions, prints, force_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.chardet import xml_to_unicode
@ -147,6 +147,12 @@ def _parse_cover_data(root, imgid, mi):
if elm_binary:
mimetype = elm_binary[0].get('content-type', 'image/jpeg')
mime_extensions = guess_all_extensions(mimetype)
if not mime_extensions and mimetype.startswith('image/'):
mimetype_fromid = guess_type(imgid)[0]
if mimetype_fromid and mimetype_fromid.startswith('image/'):
mime_extensions = guess_all_extensions(mimetype_fromid)
if mime_extensions:
pic_data = elm_binary[0].text
if pic_data:

View File

@ -0,0 +1,445 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
from xml.etree.ElementTree import _Element
__license__ = 'GPL 3'
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
__docformat__ = 'restructuredtext en'
import re
import urllib2
import datetime
from urllib import quote_plus
from Queue import Queue, Empty
from lxml import etree, html
from lxml.etree import ElementBase
from calibre import as_unicode
from calibre import prints
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
class Ozon(Source):
    '''
    Metadata source plugin for OZON.ru.

    Queries the OZON.ru search web service for candidate books, then scrapes
    each book's detail page for series, ISBNs, publisher, publication year,
    language and description. Can also download the cover image.
    '''

    name = 'OZON.ru'
    description = _('Downloads metadata and covers from OZON.ru')

    capabilities = frozenset(['identify', 'cover'])

    touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
                               'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
    # For testing only: the test function does not like it when some of the
    # fields above are occasionally empty, so use this reduced set instead.
    #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
    #                            'publisher', 'pubdate', 'comments'])

    supports_gzip_transfer_encoding = True
    has_html_comments = True

    ozon_url = 'http://www.ozon.ru'

    # match any ISBN10/13. From "Regular Expressions Cookbook"
    isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|'\
            '[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?'\
            '(?:[0-9]+[- ]?){2}[0-9X]'
    isbnRegex = re.compile(isbnPattern)

    def get_book_url(self, identifiers): # {{{
        '''
        Return ``('ozon', ozon_id, url)`` for the book's detail page.

        Returns None when *identifiers* has no non-empty ``ozon`` entry.
        '''
        ozon_id = identifiers.get('ozon', None)
        res = None
        if ozon_id:
            url = '{}/context/detail/id/{}?partner={}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
            res = ('ozon', ozon_id, url)
        return res
    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
        '''
        Build the search web service URL from ISBN, title and authors.

        Returns the URL, or None when there is nothing to search for.
        '''
        # div_book -> search only books, ebooks and audio books
        search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='

        isbn = _format_isbn(log, identifiers.get('isbn', None))

        # Collect the unique, non-empty search terms in a stable order
        # (isbn, title, authors). A set would make the order arbitrary.
        qItems = []
        for item in [isbn, title] + list(authors or ()):
            if item and item not in qItems:
                qItems.append(item)

        q = ' '.join(map(_quoteString, qItems)).strip()
        log.info(u'search string: ' + q)

        if isinstance(q, unicode):
            q = q.encode('utf-8')
        if not q:
            return None

        search_url += quote_plus(q)
        log.debug(u'search url: %r'%search_url)

        return search_url
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
            identifiers={}, timeout=30):
        '''
        Search OZON.ru and put every matching Metadata object on
        *result_queue*.

        Returns None on success (or when the plugin is not configured),
        otherwise a string describing the error.
        '''
        if not self.is_configured():
            return
        query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
        if not query:
            err = 'Insufficient metadata to construct query'
            log.error(err)
            return err

        try:
            # Honour the caller supplied timeout for the search request too
            raw = self.browser.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception(u'Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
            entries = feed.xpath('//*[local-name() = "SearchItems"]')
            if entries:
                metadata = self.get_metadata(log, entries, title, authors, identifiers)
                self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
    # }}}

    def get_metadata(self, log, entries, title, authors, identifiers): # {{{
        '''
        Convert the search result *entries* to Metadata objects and return
        the list of those that match the requested title, authors and ozon
        id. Matching is case-insensitive and substring based.
        '''
        title = unicode(title).upper() if title else ''
        authors = map(unicode.upper, map(unicode, authors)) if authors else None
        ozon_id = identifiers.get('ozon', None)

        unk = unicode(_('Unknown')).upper()

        # "Unknown" placeholders carry no information, do not filter on them
        if title == unk:
            title = None

        if authors == [unk]:
            authors = None

        def in_authors(authors, miauthors):
            # True if any requested author is a substring of any found author
            return any(author in miauthor
                       for author in authors for miauthor in miauthors)

        def ensure_metadata_match(mi): # {{{
            match = True
            if title:
                mititle = unicode(mi.title).upper() if mi.title else ''
                match = title in mititle
            if match and authors:
                miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
                match = in_authors(authors, miauthors)

            if match and ozon_id:
                mozon_id = mi.identifiers['ozon']
                match = ozon_id == mozon_id

            return match
        # }}}

        metadata = []
        for i, entry in enumerate(entries):
            mi = self.to_metadata(log, entry)
            # keep the order in which the search service returned the items
            mi.source_relevance = i
            if ensure_metadata_match(mi):
                metadata.append(mi)
                # log.debug(u'added metadata %s %s. '%(mi.title, mi.authors))
            else:
                log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
        return metadata
    # }}}

    def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{
        '''
        Enrich every item of *metadata* from its detail page, update the
        ISBN and cover URL caches and put the result on *result_queue*.

        Items are skipped when an ISBN was requested and the detail page
        lists ISBNs that do not include it.
        '''
        req_isbn = identifiers.get('isbn', None)

        for mi in metadata:
            if abort.is_set():
                break
            try:
                ozon_id = mi.identifiers['ozon']

                # Best effort: a failed detail page still yields the data
                # obtained from the search result.
                try:
                    self.get_book_details(log, mi, timeout)
                except Exception:
                    log.exception(u'Failed to get details for metadata: %s'%mi.title)

                all_isbns = getattr(mi, 'all_isbns', [])
                if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
                    log.debug(u'skipped, no requested ISBN %s found'%req_isbn)
                    continue

                for isbn in all_isbns:
                    self.cache_isbn_to_identifier(isbn, ozon_id)

                if mi.ozon_cover_url:
                    self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)

                self.clean_downloaded_metadata(mi)
                result_queue.put(mi)
            except Exception:
                log.exception(u'Failed to get details for metadata: %s'%mi.title)
    # }}}

    def to_metadata(self, log, entry): # {{{
        '''
        Create a Metadata object from one search result element.

        Sets title, authors, the ozon identifier, comments, rating and the
        extra attribute ``ozon_cover_url`` (None when no picture is given).
        '''
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

        title = entry.xpath(xp_template.format('Name'))
        author = entry.xpath(xp_template.format('Author'))
        # Authors are comma separated; strip the blanks around each name and
        # drop empty names (an empty Author element must not yield ['']).
        authors = [a.strip() for a in author.split(',') if a.strip()]
        mi = Metadata(title, authors)

        ozon_id = entry.xpath(xp_template.format('ID'))
        mi.identifiers = {'ozon':ozon_id}

        mi.comments = entry.xpath(xp_template.format('Annotation'))

        mi.ozon_cover_url = None
        cover = entry.xpath(xp_template.format('Picture'))
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        rating = entry.xpath(xp_template.format('ClientRatingValue'))
        if rating:
            try:
                # 'rating' is a floating point number between 0 and 10.
                # OZON rates N out of 5, calibre out of 10, but the value is
                # not rescaled because of a suspected bug in identify.
                mi.rating = float(rating)
            except (TypeError, ValueError):
                # not a number, leave the rating unset
                pass
        return mi
    # }}}

    def get_cached_cover_url(self, identifiers): # {{{
        '''
        Return the cached cover URL for the ozon id in *identifiers* (looked
        up via the cached ISBN mapping when no ozon id is given), or None.
        '''
        url = None
        ozon_id = identifiers.get('ozon', None)
        if ozon_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                ozon_id = self.cached_isbn_to_identifier(isbn)
        if ozon_id is not None:
            url = self.cached_identifier_to_cover_url(ozon_id)
        return url
    # }}}

    def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # {{{
        '''
        Download the cover and put ``(self, data)`` on *result_queue*.

        Runs identify() first when no cover URL is cached. Returns None, or
        a string describing the error when the download itself failed.
        '''
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.debug('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers, timeout=timeout)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(title=title, authors=authors, identifiers=identifiers))
            # take the cover of the best ranked result that has one
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break

        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        log.debug('Downloading cover from:', cached_url)
        try:
            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except Exception as e:
            log.exception(u'Failed to download cover from: %s'%cached_url)
            return as_unicode(e)
    # }}}

    def get_book_details(self, log, metadata, timeout): # {{{
        '''
        Fetch the detail page of *metadata* and update it in place with
        series, ISBNs (``all_isbns`` and ``isbn``), publisher, publication
        date, language and comments, as far as they are found on the page.
        '''
        url = self.get_book_url(metadata.get_identifiers())[2]

        raw = self.browser.open_novisit(url, timeout=timeout).read()
        doc = html.fromstring(raw)

        # series
        xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)'
        series = doc.xpath(xpt)
        if series:
            metadata.series = series

        # ISBNs are listed in the description meta tag after the word "ISBN"
        xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")'
        isbn_str = doc.xpath(xpt)
        if isbn_str:
            all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
            if all_isbns:
                metadata.all_isbns = all_isbns
                metadata.isbn = all_isbns[0]

        xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]'
        publishers = doc.xpath(xpt)
        if publishers:
            metadata.publisher = publishers[0].text

            # year and language are text siblings of the publisher link
            xpt = u'string(../text()[contains(., "г.")])'
            yearIn = publishers[0].xpath(xpt)
            if yearIn:
                matcher = re.search(r'\d{4}', yearIn)
                if matcher:
                    year = int(matcher.group(0))
                    # only year is available, so use 1-st of Jan
                    # NOTE(review): this is a naive datetime; the original
                    # author noted a failed comparison in identify.py.
                    metadata.pubdate = datetime.datetime(year, 1, 1)

            xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
            displLang = publishers[0].xpath(xpt)
            lang_code =_translageLanguageToCode(displLang)
            if lang_code:
                metadata.language = lang_code

        # overwrite comments from HTML if any
        # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
        xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
                u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
        comment_elem = doc.xpath(xpt)
        if comment_elem:
            # Serialize without the tail text so that the closing tag is
            # guaranteed to be the last thing in the string.
            comments = unicode(etree.tostring(comment_elem[0], with_tail=False))
            if comments:
                # cleanup root tag, TODO: remove tags like object/embeded
                comments = re.sub(r'^<td[^>]*>|</td>\s*$', u'', comments).strip()
                if comments:
                    metadata.comments = comments
        else:
            log.debug('No book description found in HTML')
    # }}}
def _quoteString(str): # {{{
    '''
    Wrap *str* in double quotes when it contains a space, so that it is
    searched as a phrase. Empty values and single words are returned as is.
    '''
    if not str or ' ' not in str:
        return str
    return '"' + str + '"'
# }}}
# TODO: make customizable
def _translateToBigCoverUrl(coverUrl): # {{{
    '''
    Turn the URL of a small cover into the URL of the big one, e.g.

    http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif
    http://www.ozon.ru/multimedia/books_covers/1002986468.jpg

    URLs that do not contain a ``small/`` path segment are returned as is.
    '''
    small = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl)
    if small is None:
        return coverUrl
    # directory without "small/", file name up to and including the dot
    folder, stem = small.group(1), small.group(2)
    return folder + stem + 'jpg'
# }}}
def _get_affiliateId(): # {{{
    '''
    Pick the affiliate id to put into OZON.ru links: Kovid's id for 30% of
    the calls, the plugin author's id otherwise.
    '''
    import random
    # a roll of 1, 2 or 3 out of 1..10 -> 30%
    use_kovid = random.randint(1, 10) <= 3
    return 'kovidgoyal' if use_kovid else 'romuk'
# }}}
# for now only RUS ISBN are supported
#http://ru.wikipedia.org/wiki/ISBN_российских_издательств
# Splits a normalized (digits only, no hyphens) Russian ISBN-10/13 into its
# parts: optional GS1 prefix, group, publisher, title and check digit.
# The GS1 prefix is restricted to 978 so that it cannot swallow the first
# digits of an ISBN-10 that starts with several fives (e.g. 5555...).
isbn_pat = re.compile(r"""
    ^
    (978)?                   # match GS1 Prefix for ISBN13
    (5)                      # group identifier for Russian-speaking countries
    (                        # begin variable length for Publisher
        [01]\d{1}|           # 2x
        [2-6]\d{2}|          # 3x
        7\d{3}|              # 4x (starting with 7)
        8[0-4]\d{2}|         # 4x (starting with 8)
        9[2567]\d{2}|        # 4x (starting with 9)
        99[26]\d{1}|         # 4x (starting with 99)
        8[5-9]\d{3}|         # 5x (starting with 8)
        9[348]\d{3}|         # 5x (starting with 9)
        900\d{2}|            # 5x (starting with 900)
        91[0-8]\d{2}|        # 5x (starting with 91)
        90[1-9]\d{3}|        # 6x (starting with 90)
        919\d{3}|            # 6x (starting with 919)
        99[013-57-9]\d{4}    # 7x (starting with 99, third digit not 2 or 6)
    )                        # end variable length for Publisher
    (\d+)                    # Title
    ([\dX])                  # Check digit
    $
""", re.VERBOSE)
def _format_isbn(log, isbn): # {{{
    '''
    Validate *isbn* with check_isbn() and insert hyphens between its parts.

    Returns the falsy result of check_isbn() for an invalid ISBN, and the
    validated but unhyphenated ISBN (after logging an error) when it does
    not match isbn_pat.
    '''
    checked = check_isbn(isbn)
    if not checked:
        return checked

    parts = isbn_pat.match(checked)
    if parts is None:
        log.error('cannot format isbn %s'%isbn)
        return checked

    # the GS1 prefix group is empty for ISBN-10, leave it out
    return '-'.join(part for part in parts.groups() if part)
# }}}
def _translageLanguageToCode(displayLang): # {{{
    '''
    Map the Russian language name shown on OZON.ru to a language code.

    A missing or empty name means Russian; an unknown name yields None.
    '''
    codes_by_name = {
        None: 'ru',
        u'Немецкий': 'de',
        u'Английский': 'en',
        u'Французский': 'fr',
        u'Итальянский': 'it',
        u'Испанский': 'es',
        u'Китайский': 'zh',
        u'Японский': 'ja',
    }
    if displayLang:
        lookup_key = unicode(displayLang).strip()
    else:
        lookup_key = None
    return codes_by_name.get(lookup_key)
# }}}
if __name__ == '__main__': # tests {{{
    # To run these tests use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
    # Comment out some touched_fields before running these tests.
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test, authors_test, isbn_test)

    # Each case is a pair: (keyword arguments for identify, list of checks
    # built with the *_test helpers).
    test_identify_plugin(Ozon.name,
        [

            (
                {'identifiers':{'isbn': '9785916572629'} },
                [title_test(u'На все четыре стороны', exact=True),
                 authors_test([u'А. А. Гилл'])]
            ),
            (
                {'identifiers':{}, 'title':u'Der Himmel Kennt Keine Gunstlinge',
                    'authors':[u'Erich Maria Remarque']},
                [title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True),
                 authors_test([u'Erich Maria Remarque'])]
            ),
            (
                {'identifiers':{ }, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=False)]
            ),
            (
                {'identifiers':{'isbn': '9785170727209'}, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=True),
                    authors_test([u'Дмитрий Глуховский']),
                    isbn_test('9785170727209')]
            ),
            (
                {'identifiers':{'isbn': '5-699-13613-4'}, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=True),
                 authors_test([u'Дмитрий Глуховский'])]
            ),
            (
                {'identifiers':{}, 'title':u'Метро',
                    'authors':[u'Глуховский']},
                [title_test(u'Метро', exact=False)]
            ),
    ])
# }}}

View File

@ -50,6 +50,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
def search(self, query, max_results=10, timeout=60):
search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
'searchText=%s&searchContext=ebook' % urllib2.quote(query)
xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
counter = max_results
br = browser()
@ -61,16 +62,13 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
break
counter -= 1
xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
s = SearchResult()
s.detail_item = data.xpath(xp_template.format('ID'))
s.title = data.xpath(xp_template.format('Name'))
s.author = data.xpath(xp_template.format('Author'))
s.price = data.xpath(xp_template.format('Price'))
s.cover_url = data.xpath(xp_template.format('Picture'))
if re.match("^\d+?\.\d+?$", s.price):
s.price = u'{:.2F} руб.'.format(float(s.price))
s.price = format_price_in_RUR(s.price)
yield s
def get_details(self, search_result, timeout=60):
@ -98,6 +96,21 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
# search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item)
return result
def format_price_in_RUR(price):
    '''
    Try to format price according ru locale: '12 212,34 руб.'

    Plain decimal numbers, with or without a fractional part ('25.99',
    '100', '.5'), are formatted with two decimals, a space as thousands
    separator and a comma as decimal mark. Anything else, including None
    and the empty string, is returned unchanged.

    @param price: price in format like 25.99
    @return: formatted price if possible otherwise original value
    @rtype: unicode
    '''
    if price and re.match(r'^(?:\d+\.?\d*|\.\d+)$', price):
        try:
            price = u'{:,.2F} руб.'.format(float(price))
            # ',' is the thousands separator and the first '.' the decimal
            # mark; the '.' that ends the currency abbreviation must stay.
            price = price.replace(',', ' ').replace('.', ',', 1)
        except ValueError:
            # not a number after all, keep the original value
            pass
    return price
def _parse_ebook_formats(formatsStr):
'''
Creates a list with displayable names of the formats

View File

@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import urllib
from contextlib import closing
from lxml import etree
from calibre import browser
from calibre.gui2.store.basic_config import BasicStoreConfig
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
from calibre.gui2.store.search_result import SearchResult
class XinXiiStore(BasicStoreConfig, OpenSearchOPDSStore):

    open_search_url = 'http://www.xinxii.com/catalog-search/'
    web_url = 'http://xinxii.com/'

    # http://www.xinxii.com/catalog/

    def search(self, query, max_results=10, timeout=60):
        '''
        Search XinXii and yield at most max_results SearchResult objects.

        The open search description of XinXii declares this url template:

        http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&amp;pw={startPage?}&amp;doc_lang={docLang}&amp;ff={docFormat},{docFormat},{docFormat}

        The template marks docLang and docFormat as required, but the
        search itself works without them, so they are left out. Because of
        these required attributes the standard OpenSearchOPDSStore search
        cannot be used.

        XinXii also does not return everything the OpenSearchOPDSStore
        search supports, so only the parts that are actually delivered are
        read here.
        '''
        thumbnail_rels = ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail')
        search_url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + urllib.quote_plus(query)

        br = browser()
        with closing(br.open(search_url, timeout=timeout)) as response:
            feed = etree.fromstring(response.read())
            for index, entry in enumerate(feed.xpath('//*[local-name() = "entry"]')):
                if index >= max_results:
                    break

                result = SearchResult()

                result.detail_item = ''.join(entry.xpath('./*[local-name() = "id"]/text()')).strip()

                for link in entry.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    mime = link.get('type')

                    # only fully described links are considered
                    if not (rel and href and mime):
                        continue
                    if rel in thumbnail_rels:
                        result.cover_url = href
                    if rel == 'alternate':
                        # the link to the web page replaces the entry id
                        result.detail_item = href

                result.formats = 'EPUB, PDF'

                result.title = ' '.join(entry.xpath('./*[local-name() = "title"]//text()')).strip()
                result.author = ', '.join(entry.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

                price_nodes = entry.xpath('.//*[local-name() = "price"][1]')
                if price_nodes:
                    price_node = price_nodes[0]
                    currency = price_node.get('currencycode', '')
                    amount = ''.join(price_node.xpath('.//text()')).strip()
                    result.price = (currency + ' ' + amount).strip()

                yield result

View File

@ -640,6 +640,7 @@ class LibraryPage(QWizardPage, LibraryUI):
metadata_plugins = {
'zh' : ('Douban Books',),
'fr' : ('Nicebooks',),
'ru' : ('OZON.ru',),
}.get(lang, [])
from calibre.customize.ui import enable_plugin
for name in metadata_plugins:

View File

@ -360,7 +360,7 @@ When you first run |app|, it will ask you for a folder in which to store your bo
Metadata about the books is stored in the file ``metadata.db`` at the top level of the library folder. This file is a sqlite database. When backing up your library make sure you copy the entire folder and all its sub-folders.
The library folder and all it's contents make up what is called a *|app| library*. You can have multiple such libraries. To manage the libraries, click the |app| icon on the toolbar. You can create new libraries, remove/rename existing ones and switch between libraries easily.
The library folder and all its contents make up what is called a |app| library. You can have multiple such libraries. To manage the libraries, click the |app| icon on the toolbar. You can create new libraries, remove/rename existing ones and switch between libraries easily.
You can copy or move books between different libraries (once you have more than one library setup) by right clicking on a book and selecting the :guilabel:`Copy to library` action.