From 770a5744e7dedf7539a15ccb2cc8ff7fe4763cd4 Mon Sep 17 00:00:00 2001 From: Roman Mukhin Date: Sun, 1 Dec 2013 14:34:20 +0100 Subject: [PATCH] Update the metadata download and Get Books plugins for ozon.ru Changes to the corresponding OZON.RU files due to API and HTML layout changes at ozon.ru. Additionally, improved the search hit relevance calculation and introduced a configuration parameter. --- src/calibre/ebooks/metadata/sources/ozon.py | 136 +++++++++++------- .../gui2/store/stores/ozon_ru_plugin.py | 9 +- 2 files changed, 88 insertions(+), 57 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index 11f09e166a..2e1e613df8 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -2,7 +2,7 @@ from __future__ import (unicode_literals, division, absolute_import, print_function) __license__ = 'GPL 3' -__copyright__ = '2011, Roman Mukhin ' +__copyright__ = '2011-2013 Roman Mukhin ' __docformat__ = 'restructuredtext en' import re @@ -10,7 +10,7 @@ from Queue import Queue, Empty from calibre import as_unicode from calibre.ebooks.metadata import check_isbn -from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.sources.base import Source, Option from calibre.ebooks.metadata.book.base import Metadata class Ozon(Source): @@ -36,6 +36,13 @@ class Ozon(Source): '(?:[0-9]+[- ]?){2}[0-9X]' isbnRegex = re.compile(isbnPattern) + optkey_strictmatch = 'strict_result_match' + options = ( + Option(optkey_strictmatch, 'bool', False, + _('Filter out less relevant hits from the search results'), + _('Improve search result by removing less relevant hits. 
It can be useful to refine the search when there are many matches')), + ) + def get_book_url(self, identifiers): # {{{ import urllib2 ozon_id = identifiers.get('ozon', None) @@ -48,34 +55,38 @@ class Ozon(Source): def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ from urllib import quote_plus + # div_book -> search only books, ebooks and audio books search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) + if isbn and not '-' in isbn: + log.error("%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)" + %(self.name, isbn)) + isbn = None + ozonid = identifiers.get('ozon', None) + qItems = set([ozonid, isbn]) + unk = unicode(_('Unknown')).upper() - if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid: - qItems = set([isbn, title]) - if authors: - qItems |= frozenset(authors) - qItems.discard(None) - qItems.discard('') - qItems = map(_quoteString, qItems) - q = u' '.join(qItems).strip() - log.info(u'search string: ' + q) + if title and title != unk: + qItems.add(title) + if authors and authors != [unk]: + qItems |= frozenset(authors) - if isinstance(q, unicode): - q = q.encode('utf-8') - if not q: - return None - - search_url += quote_plus(q) - else: - search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid + qItems.discard(None) + qItems.discard('') + qItems = map(_quoteString, qItems) + searchText = u' '.join(qItems).strip() + if isinstance(searchText, unicode): + searchText = searchText.encode('utf-8') + if not searchText: + return None + search_url += quote_plus(searchText) log.debug(u'search url: %r'%search_url) return search_url # }}} @@ -125,13 +136,14 @@ class Ozon(Source): authors = map(_normalizeAuthorNameWithInitials, map(unicode.upper, map(unicode, 
authors))) if authors else None ozon_id = identifiers.get('ozon', None) + #log.debug(u'ozonid: ', ozon_id) unk = unicode(_('Unknown')).upper() if title == unk: title = None - if authors == [unk]: + if authors == [unk] or authors == []: authors = None def in_authors(authors, miauthors): @@ -142,33 +154,55 @@ class Ozon(Source): return True return None - def ensure_metadata_match(mi): # {{{ - match = True + def calc_source_relevance(mi): # {{{ + relevance = 0 if title: mititle = unicode(mi.title).upper() if mi.title else '' if reRemoveFromTitle: mititle = reRemoveFromTitle.sub('', mititle) - match = title in mititle - #log.debug(u't=> %s <> %s'%(title, mititle)) - if match and authors: + if title in mititle: + relevance += 3 + elif mititle: + # log.debug(u'!!%s!'%mititle) + relevance -= 3 + else: + relevance += 1 + + if authors: miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else [] - match = in_authors(authors, miauthors) + if (in_authors(authors, miauthors)): + relevance += 3 + elif u''.join(miauthors): + # log.debug(u'!%s!'%u'|'.join(miauthors)) + relevance -= 3 + else: + relevance += 1 - if match and ozon_id: + if ozon_id: mozon_id = mi.identifiers['ozon'] - match = ozon_id == mozon_id + if ozon_id == mozon_id: + relevance += 100 - return match + if relevance < 0: + relevance = 0 + return relevance + # }}} + strict_match = self.prefs[self.optkey_strictmatch] metadata = [] - for i, entry in enumerate(entries): + for entry in entries: mi = self.to_metadata(log, entry) - mi.source_relevance = i - if ensure_metadata_match(mi): + relevance = calc_source_relevance(mi) + # TODO findout which is really used + mi.source_relevance = relevance + mi.relevance_in_source = relevance + + if not strict_match or relevance > 0: metadata.append(mi) #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) else: - log.debug(u'skipped metadata %s %s. 
(does not match the query)'%(unicode(mi.title), mi.authors)) + log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)' + %(mi.title, u' '.join(mi.authors), relevance)) return metadata # }}} @@ -296,47 +330,49 @@ class Ozon(Source): raw = self.browser.open_novisit(url, timeout=timeout).read() doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) - xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)' - xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")' + xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]' + xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)' # series Серия/Серии - xpt = xpt_prod_det_at % u'Сери' - # % u'Серия:' - series = doc.xpath(xpt) + series = doc.xpath(xpt_tmpl_a % u'Сери') if series: metadata.series = series + #log.debug(u'Seria: ', metadata.series) - xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])' - isbn_str = doc.xpath(xpt) + xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')' + isbn_str = doc.xpath(xpt_isbn % u'ISBN') if isbn_str: + #log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str)) all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)] if all_isbns: metadata.all_isbns = all_isbns metadata.isbn = all_isbns[0] + #log.debug(u'ISBN: ', metadata.isbn) - xpt = xpt_prod_det_at % u'Издатель' - publishers = doc.xpath(xpt) + publishers = doc.xpath(xpt_tmpl_a % u'Издатель') if publishers: metadata.publisher = publishers + #log.debug(u'Publisher: ', metadata.publisher) + xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")' displ_lang = None - xpt = xpt_prod_det_tx % u'Язык' - langs = doc.xpath(xpt) + langs = doc.xpath(xpt_lang % u'Язык') if langs: 
lng_splt = langs.split(u',') if lng_splt: displ_lang = lng_splt[0].strip() metadata.language = _translageLanguageToCode(displ_lang) - #log.debug(u'language: %s'%displ_lang) + #log.debug(u'Language: ', metadata.language) # can be set before from xml search responce if not metadata.pubdate: - xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))' - yearIn = doc.xpath(xpt) + xpt = u'substring-after(' + xpt_isbn + u',";")' + yearIn = doc.xpath(xpt % u'ISBN') if yearIn: matcher = re.search(r'\d{4}', yearIn) if matcher: metadata.pubdate = toPubdate(log, matcher.group(0)) + #log.debug(u'Pubdate: ', metadata.pubdate) # overwrite comments from HTML if any xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa @@ -352,7 +388,7 @@ class Ozon(Source): if comments and (not metadata.comments or len(comments) > len(metadata.comments)): metadata.comments = comments else: - log.debug('HTML book description skipped in favour of search service xml responce') + log.debug('HTML book description skipped in favor of search service xml response') else: log.debug('No book description found in HTML') # }}} @@ -396,7 +432,7 @@ def _format_isbn(log, isbn): # {{{ isbn_pat = re.compile(r""" ^ (\d{3})? # match GS1 Prefix for ISBN13 - (5) # group identifier for rRussian-speaking countries + (5) # group identifier for Russian-speaking countries ( # begin variable length for Publisher [01]\d{1}| # 2x [2-6]\d{2}| # 3x @@ -423,7 +459,7 @@ def _format_isbn(log, isbn): # {{{ if m: res = '-'.join([g for g in m.groups() if g]) else: - log.error('cannot format isbn %s'%isbn) + log.error('cannot format ISBN %s. 
Fow now only russian ISBNs are supported'%isbn) return res # }}} diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py index 9a3c2dabaa..5dd48230f8 100644 --- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py +++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 1 # Needed for dynamic plugin loading +store_version = 2 # Needed for dynamic plugin loading __license__ = 'GPL 3' -__copyright__ = '2011, Roman Mukhin ' +__copyright__ = '2011-2013, Roman Mukhin ' __docformat__ = 'restructuredtext en' import random @@ -52,11 +52,6 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): 'searchText=%s&searchContext=ebook' % urllib2.quote(query) search_urls = [ search_url ] - ## add this as the fist try if it looks like ozon ID - if re.match("^\d{6,9}$", query): - ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query - search_urls.insert(0, ozon_detail) - xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())' counter = max_results br = browser()