mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update the metadata download and Get Books plugins for ozon.ru
Update the OZON.RU plugin files to follow the API and HTML layout changes on ozon.ru. Additionally, improve the search hit relevance calculation and introduce a configuration parameter that filters out less relevant hits from the search results.
This commit is contained in:
parent
f86638cf17
commit
770a5744e7
@ -2,7 +2,7 @@
|
|||||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
|
__copyright__ = '2011-2013 Roman Mukhin <ramses_ru at hotmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re
|
||||||
@ -10,7 +10,7 @@ from Queue import Queue, Empty
|
|||||||
|
|
||||||
from calibre import as_unicode
|
from calibre import as_unicode
|
||||||
from calibre.ebooks.metadata import check_isbn
|
from calibre.ebooks.metadata import check_isbn
|
||||||
from calibre.ebooks.metadata.sources.base import Source
|
from calibre.ebooks.metadata.sources.base import Source, Option
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
|
|
||||||
class Ozon(Source):
|
class Ozon(Source):
|
||||||
@ -36,6 +36,13 @@ class Ozon(Source):
|
|||||||
'(?:[0-9]+[- ]?){2}[0-9X]'
|
'(?:[0-9]+[- ]?){2}[0-9X]'
|
||||||
isbnRegex = re.compile(isbnPattern)
|
isbnRegex = re.compile(isbnPattern)
|
||||||
|
|
||||||
|
optkey_strictmatch = 'strict_result_match'
|
||||||
|
options = (
|
||||||
|
Option(optkey_strictmatch, 'bool', False,
|
||||||
|
_('Filter out less relevant hits from the search results'),
|
||||||
|
_('Improve search result by removing less relevant hits. It can be useful to refine the search when there are many matches')),
|
||||||
|
)
|
||||||
|
|
||||||
def get_book_url(self, identifiers): # {{{
|
def get_book_url(self, identifiers): # {{{
|
||||||
import urllib2
|
import urllib2
|
||||||
ozon_id = identifiers.get('ozon', None)
|
ozon_id = identifiers.get('ozon', None)
|
||||||
@ -48,34 +55,38 @@ class Ozon(Source):
|
|||||||
|
|
||||||
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
||||||
from urllib import quote_plus
|
from urllib import quote_plus
|
||||||
|
|
||||||
# div_book -> search only books, ebooks and audio books
|
# div_book -> search only books, ebooks and audio books
|
||||||
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
|
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
|
||||||
|
|
||||||
# for ozon.ru search we have to format ISBN with '-'
|
# for ozon.ru search we have to format ISBN with '-'
|
||||||
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
||||||
|
if isbn and not '-' in isbn:
|
||||||
|
log.error("%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)"
|
||||||
|
%(self.name, isbn))
|
||||||
|
isbn = None
|
||||||
|
|
||||||
ozonid = identifiers.get('ozon', None)
|
ozonid = identifiers.get('ozon', None)
|
||||||
|
|
||||||
|
qItems = set([ozonid, isbn])
|
||||||
|
|
||||||
unk = unicode(_('Unknown')).upper()
|
unk = unicode(_('Unknown')).upper()
|
||||||
if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
|
|
||||||
qItems = set([isbn, title])
|
if title and title != unk:
|
||||||
if authors:
|
qItems.add(title)
|
||||||
|
if authors and authors != [unk]:
|
||||||
qItems |= frozenset(authors)
|
qItems |= frozenset(authors)
|
||||||
|
|
||||||
qItems.discard(None)
|
qItems.discard(None)
|
||||||
qItems.discard('')
|
qItems.discard('')
|
||||||
qItems = map(_quoteString, qItems)
|
qItems = map(_quoteString, qItems)
|
||||||
|
searchText = u' '.join(qItems).strip()
|
||||||
q = u' '.join(qItems).strip()
|
if isinstance(searchText, unicode):
|
||||||
log.info(u'search string: ' + q)
|
searchText = searchText.encode('utf-8')
|
||||||
|
if not searchText:
|
||||||
if isinstance(q, unicode):
|
|
||||||
q = q.encode('utf-8')
|
|
||||||
if not q:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
search_url += quote_plus(q)
|
search_url += quote_plus(searchText)
|
||||||
else:
|
|
||||||
search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid
|
|
||||||
|
|
||||||
log.debug(u'search url: %r'%search_url)
|
log.debug(u'search url: %r'%search_url)
|
||||||
return search_url
|
return search_url
|
||||||
# }}}
|
# }}}
|
||||||
@ -125,13 +136,14 @@ class Ozon(Source):
|
|||||||
authors = map(_normalizeAuthorNameWithInitials,
|
authors = map(_normalizeAuthorNameWithInitials,
|
||||||
map(unicode.upper, map(unicode, authors))) if authors else None
|
map(unicode.upper, map(unicode, authors))) if authors else None
|
||||||
ozon_id = identifiers.get('ozon', None)
|
ozon_id = identifiers.get('ozon', None)
|
||||||
|
#log.debug(u'ozonid: ', ozon_id)
|
||||||
|
|
||||||
unk = unicode(_('Unknown')).upper()
|
unk = unicode(_('Unknown')).upper()
|
||||||
|
|
||||||
if title == unk:
|
if title == unk:
|
||||||
title = None
|
title = None
|
||||||
|
|
||||||
if authors == [unk]:
|
if authors == [unk] or authors == []:
|
||||||
authors = None
|
authors = None
|
||||||
|
|
||||||
def in_authors(authors, miauthors):
|
def in_authors(authors, miauthors):
|
||||||
@ -142,33 +154,55 @@ class Ozon(Source):
|
|||||||
return True
|
return True
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def ensure_metadata_match(mi): # {{{
|
def calc_source_relevance(mi): # {{{
|
||||||
match = True
|
relevance = 0
|
||||||
if title:
|
if title:
|
||||||
mititle = unicode(mi.title).upper() if mi.title else ''
|
mititle = unicode(mi.title).upper() if mi.title else ''
|
||||||
if reRemoveFromTitle:
|
if reRemoveFromTitle:
|
||||||
mititle = reRemoveFromTitle.sub('', mititle)
|
mititle = reRemoveFromTitle.sub('', mititle)
|
||||||
match = title in mititle
|
if title in mititle:
|
||||||
#log.debug(u't=> %s <> %s'%(title, mititle))
|
relevance += 3
|
||||||
if match and authors:
|
elif mititle:
|
||||||
|
# log.debug(u'!!%s!'%mititle)
|
||||||
|
relevance -= 3
|
||||||
|
else:
|
||||||
|
relevance += 1
|
||||||
|
|
||||||
|
if authors:
|
||||||
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
|
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
|
||||||
match = in_authors(authors, miauthors)
|
if (in_authors(authors, miauthors)):
|
||||||
|
relevance += 3
|
||||||
|
elif u''.join(miauthors):
|
||||||
|
# log.debug(u'!%s!'%u'|'.join(miauthors))
|
||||||
|
relevance -= 3
|
||||||
|
else:
|
||||||
|
relevance += 1
|
||||||
|
|
||||||
if match and ozon_id:
|
if ozon_id:
|
||||||
mozon_id = mi.identifiers['ozon']
|
mozon_id = mi.identifiers['ozon']
|
||||||
match = ozon_id == mozon_id
|
if ozon_id == mozon_id:
|
||||||
|
relevance += 100
|
||||||
|
|
||||||
return match
|
if relevance < 0:
|
||||||
|
relevance = 0
|
||||||
|
return relevance
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
strict_match = self.prefs[self.optkey_strictmatch]
|
||||||
metadata = []
|
metadata = []
|
||||||
for i, entry in enumerate(entries):
|
for entry in entries:
|
||||||
mi = self.to_metadata(log, entry)
|
mi = self.to_metadata(log, entry)
|
||||||
mi.source_relevance = i
|
relevance = calc_source_relevance(mi)
|
||||||
if ensure_metadata_match(mi):
|
# TODO findout which is really used
|
||||||
|
mi.source_relevance = relevance
|
||||||
|
mi.relevance_in_source = relevance
|
||||||
|
|
||||||
|
if not strict_match or relevance > 0:
|
||||||
metadata.append(mi)
|
metadata.append(mi)
|
||||||
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
|
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
|
||||||
else:
|
else:
|
||||||
log.debug(u'skipped metadata %s %s. (does not match the query)'%(unicode(mi.title), mi.authors))
|
log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
|
||||||
|
%(mi.title, u' '.join(mi.authors), relevance))
|
||||||
return metadata
|
return metadata
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
@ -296,47 +330,49 @@ class Ozon(Source):
|
|||||||
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
||||||
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
||||||
|
|
||||||
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
|
xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
|
||||||
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
|
xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)'
|
||||||
|
|
||||||
# series Серия/Серии
|
# series Серия/Серии
|
||||||
xpt = xpt_prod_det_at % u'Сери'
|
series = doc.xpath(xpt_tmpl_a % u'Сери')
|
||||||
# % u'Серия:'
|
|
||||||
series = doc.xpath(xpt)
|
|
||||||
if series:
|
if series:
|
||||||
metadata.series = series
|
metadata.series = series
|
||||||
|
#log.debug(u'Seria: ', metadata.series)
|
||||||
|
|
||||||
xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])'
|
xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')'
|
||||||
isbn_str = doc.xpath(xpt)
|
isbn_str = doc.xpath(xpt_isbn % u'ISBN')
|
||||||
if isbn_str:
|
if isbn_str:
|
||||||
|
#log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str))
|
||||||
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
|
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
|
||||||
if all_isbns:
|
if all_isbns:
|
||||||
metadata.all_isbns = all_isbns
|
metadata.all_isbns = all_isbns
|
||||||
metadata.isbn = all_isbns[0]
|
metadata.isbn = all_isbns[0]
|
||||||
|
#log.debug(u'ISBN: ', metadata.isbn)
|
||||||
|
|
||||||
xpt = xpt_prod_det_at % u'Издатель'
|
publishers = doc.xpath(xpt_tmpl_a % u'Издатель')
|
||||||
publishers = doc.xpath(xpt)
|
|
||||||
if publishers:
|
if publishers:
|
||||||
metadata.publisher = publishers
|
metadata.publisher = publishers
|
||||||
|
#log.debug(u'Publisher: ', metadata.publisher)
|
||||||
|
|
||||||
|
xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")'
|
||||||
displ_lang = None
|
displ_lang = None
|
||||||
xpt = xpt_prod_det_tx % u'Язык'
|
langs = doc.xpath(xpt_lang % u'Язык')
|
||||||
langs = doc.xpath(xpt)
|
|
||||||
if langs:
|
if langs:
|
||||||
lng_splt = langs.split(u',')
|
lng_splt = langs.split(u',')
|
||||||
if lng_splt:
|
if lng_splt:
|
||||||
displ_lang = lng_splt[0].strip()
|
displ_lang = lng_splt[0].strip()
|
||||||
metadata.language = _translageLanguageToCode(displ_lang)
|
metadata.language = _translageLanguageToCode(displ_lang)
|
||||||
#log.debug(u'language: %s'%displ_lang)
|
#log.debug(u'Language: ', metadata.language)
|
||||||
|
|
||||||
# can be set before from xml search responce
|
# can be set before from xml search responce
|
||||||
if not metadata.pubdate:
|
if not metadata.pubdate:
|
||||||
xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
|
xpt = u'substring-after(' + xpt_isbn + u',";")'
|
||||||
yearIn = doc.xpath(xpt)
|
yearIn = doc.xpath(xpt % u'ISBN')
|
||||||
if yearIn:
|
if yearIn:
|
||||||
matcher = re.search(r'\d{4}', yearIn)
|
matcher = re.search(r'\d{4}', yearIn)
|
||||||
if matcher:
|
if matcher:
|
||||||
metadata.pubdate = toPubdate(log, matcher.group(0))
|
metadata.pubdate = toPubdate(log, matcher.group(0))
|
||||||
|
#log.debug(u'Pubdate: ', metadata.pubdate)
|
||||||
|
|
||||||
# overwrite comments from HTML if any
|
# overwrite comments from HTML if any
|
||||||
xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa
|
xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa
|
||||||
@ -352,7 +388,7 @@ class Ozon(Source):
|
|||||||
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
|
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
|
||||||
metadata.comments = comments
|
metadata.comments = comments
|
||||||
else:
|
else:
|
||||||
log.debug('HTML book description skipped in favour of search service xml responce')
|
log.debug('HTML book description skipped in favor of search service xml response')
|
||||||
else:
|
else:
|
||||||
log.debug('No book description found in HTML')
|
log.debug('No book description found in HTML')
|
||||||
# }}}
|
# }}}
|
||||||
@ -396,7 +432,7 @@ def _format_isbn(log, isbn): # {{{
|
|||||||
isbn_pat = re.compile(r"""
|
isbn_pat = re.compile(r"""
|
||||||
^
|
^
|
||||||
(\d{3})? # match GS1 Prefix for ISBN13
|
(\d{3})? # match GS1 Prefix for ISBN13
|
||||||
(5) # group identifier for rRussian-speaking countries
|
(5) # group identifier for Russian-speaking countries
|
||||||
( # begin variable length for Publisher
|
( # begin variable length for Publisher
|
||||||
[01]\d{1}| # 2x
|
[01]\d{1}| # 2x
|
||||||
[2-6]\d{2}| # 3x
|
[2-6]\d{2}| # 3x
|
||||||
@ -423,7 +459,7 @@ def _format_isbn(log, isbn): # {{{
|
|||||||
if m:
|
if m:
|
||||||
res = '-'.join([g for g in m.groups() if g])
|
res = '-'.join([g for g in m.groups() if g])
|
||||||
else:
|
else:
|
||||||
log.error('cannot format isbn %s'%isbn)
|
log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported'%isbn)
|
||||||
return res
|
return res
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
||||||
store_version = 1 # Needed for dynamic plugin loading
|
store_version = 2 # Needed for dynamic plugin loading
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
|
__copyright__ = '2011-2013, Roman Mukhin <ramses_ru at hotmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import random
|
import random
|
||||||
@ -52,11 +52,6 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
|
|||||||
'searchText=%s&searchContext=ebook' % urllib2.quote(query)
|
'searchText=%s&searchContext=ebook' % urllib2.quote(query)
|
||||||
search_urls = [ search_url ]
|
search_urls = [ search_url ]
|
||||||
|
|
||||||
## add this as the fist try if it looks like ozon ID
|
|
||||||
if re.match("^\d{6,9}$", query):
|
|
||||||
ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
|
|
||||||
search_urls.insert(0, ozon_detail)
|
|
||||||
|
|
||||||
xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
|
xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
|
||||||
counter = max_results
|
counter = max_results
|
||||||
br = browser()
|
br = browser()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user