From 3b644906a41302e5a97523b138830ce205c4bdd4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 14 Oct 2016 08:59:40 +0530 Subject: [PATCH] Update ozon.ru metadata download plugin to fix searching for books by ISBN --- src/calibre/ebooks/metadata/sources/ozon.py | 59 +++++++++++++-------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index f2ad4fb9a8..5a42f5aa40 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -84,13 +84,15 @@ class Ozon(Source): # Added Russian variant of 'Unknown' unk = [_('Unknown').upper(), 'Неизв.'.upper(), icu_upper('Неизв.')] - if title and title not in unk: - qItems.add(title) + # use only ozonid if specified otherwise ozon.ru does not like a combination + if not ozonid: + if title and title not in unk: + qItems.add(title) - if authors: - for auth in authors: - if icu_upper(auth) not in unk: - qItems.add(auth) + if authors: + for auth in authors: + if icu_upper(auth) not in unk: + qItems.add(auth) qItems.discard(None) qItems.discard('') @@ -102,7 +104,7 @@ class Ozon(Source): return None search_url += quote_plus(searchText) - log.debug(u'search url: %r' % search_url) + log.debug(u'search url: %s' % search_url) return search_url # }}} @@ -112,6 +114,7 @@ class Ozon(Source): from calibre.ebooks.chardet import xml_to_unicode from HTMLParser import HTMLParser from lxml import etree, html + import json if not self.is_configured(): return @@ -131,8 +134,11 @@ class Ozon(Source): doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) entries_block = doc.xpath(u'//div[@class="bSearchResult"]') + # log.debug(u'HTML: %s' % xml_to_unicode(raw, verbose=True)[0]) + if entries_block: entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]') + # log.debug(u'entries_block') # for entry in entries: # log.debug('entries %s' % entree.tostring(entry)) metadata = self.get_metadata(log, entries, title, authors, identifiers) @@ -140,19 +146,30 @@ class Ozon(Source): else: # Redirect page: trying to extract ozon_id from javascript data h = HTMLParser() - entry_string = (h.unescape(unicode(etree.tostring(doc, pretty_print=True)))) - id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)') - # result containing ozon_id and entry_title - entry_info = re.search(id_title_pat, entry_string) - ozon_id = entry_info.group(1) if entry_info else None - entry_title = entry_info.group(2) if entry_info else None + entry_string = (h.unescape(etree.tostring(doc, pretty_print=True, encoding=unicode))) + json_pat = re.compile(u'dataLayer\s*=\s*(.+)?;') + json_info = re.search(json_pat, entry_string) + jsondata = json_info.group(1) if json_info else None - if ozon_id: - metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors) - identifiers['ozon'] = ozon_id - self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={}) - else: - log.error('No SearchResults in Ozon.ru response found') + # log.debug(u'jsondata: %s' % jsondata) + dataLayer = json.loads(jsondata) if jsondata else None + + ozon_id = None + if dataLayer and dataLayer[0] and 'ecommerce' in dataLayer[0]: + jsproduct = dataLayer[0]['ecommerce']['detail']['products'][0] + ozon_id = as_unicode(jsproduct['id']) + entry_title = as_unicode(jsproduct['name']) + + log.debug(u'ozon_id %s' % ozon_id) + log.debug(u'entry_title %s' % entry_title) + + if ozon_id: + metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors) + identifiers['ozon'] = ozon_id + self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={}) + + if not ozon_id: + log.error('No SearchResults in Ozon.ru response found!') except Exception as e: log.exception('Failed to parse identify results') @@ -478,7 +495,7 @@ class Ozon(Source): langs_elem = doc.xpath(u'//div[contains(text(), "зык")]') if langs_elem: langs_elem = langs_elem[0].getnext() - langs = langs_elem.xpath(u'text()')[0].strip() + langs = langs_elem.xpath(u'text()')[0].strip() if langs_elem else None if langs: lng_splt = langs.split(u',') if lng_splt: @@ -576,9 +593,9 @@ def _format_isbn(log, isbn): # {{{ log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn) return res - # }}} + def _translageLanguageToCode(displayLang): # {{{ displayLang = unicode(displayLang).strip() if displayLang else None langTbl = {None: 'ru',