Update ozon.ru metadata download plugin to fix searching for books by ISBN

2025-07-08 10:44:09 -04:00 · 2016-10-14 08:59:40 +05:30 · 2016-10-14 08:59:40 +05:30 · 3b644906a4
commit 3b644906a4
parent 53c5ff5daa
1 changed files with 38 additions and 21 deletions
--- a/src/calibre/ebooks/metadata/sources/ozon.py
+++ b/src/calibre/ebooks/metadata/sources/ozon.py
@ -84,13 +84,15 @@ class Ozon(Source):
        # Added Russian variant of 'Unknown'
        unk = [_('Unknown').upper(), 'Неизв.'.upper(), icu_upper('Неизв.')]

-        if title and title not in unk:
-            qItems.add(title)
+        # use only ozonid if specified otherwise ozon.ru does not like a combination
+        if not ozonid:
+            if title and title not in unk:
+                qItems.add(title)

-        if authors:
-            for auth in authors:
-                if icu_upper(auth) not in unk:
-                    qItems.add(auth)
+            if authors:
+                for auth in authors:
+                    if icu_upper(auth) not in unk:
+                        qItems.add(auth)

        qItems.discard(None)
        qItems.discard('')
@ -102,7 +104,7 @@ class Ozon(Source):
            return None

        search_url += quote_plus(searchText)
-        log.debug(u'search url: %r' % search_url)
+        log.debug(u'search url: %s' % search_url)
        return search_url

    # }}}
@ -112,6 +114,7 @@ class Ozon(Source):
        from calibre.ebooks.chardet import xml_to_unicode
        from HTMLParser import HTMLParser
        from lxml import etree, html
+        import json

        if not self.is_configured():
            return
@ -131,8 +134,11 @@ class Ozon(Source):
            doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
            entries_block = doc.xpath(u'//div[@class="bSearchResult"]')

+            # log.debug(u'HTML: %s' % xml_to_unicode(raw, verbose=True)[0])
+
            if entries_block:
                entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
+                # log.debug(u'entries_block')
                # for entry in entries:
                #   log.debug('entries %s' % entree.tostring(entry))
                metadata = self.get_metadata(log, entries, title, authors, identifiers)
@ -140,19 +146,30 @@ class Ozon(Source):
            else:
                # Redirect page: trying to extract ozon_id from javascript data
                h = HTMLParser()
-                entry_string = (h.unescape(unicode(etree.tostring(doc, pretty_print=True))))
-                id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
-                # result containing ozon_id and entry_title
-                entry_info = re.search(id_title_pat, entry_string)
-                ozon_id = entry_info.group(1) if entry_info else None
-                entry_title = entry_info.group(2) if entry_info else None
+                entry_string = (h.unescape(etree.tostring(doc, pretty_print=True, encoding=unicode)))
+                json_pat = re.compile(u'dataLayer\s*=\s*(.+)?;')
+                json_info = re.search(json_pat, entry_string)
+                jsondata = json_info.group(1) if json_info else None

-                if ozon_id:
-                    metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
-                    identifiers['ozon'] = ozon_id
-                    self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={})
-                else:
-                    log.error('No SearchResults in Ozon.ru response found')
+                # log.debug(u'jsondata: %s' % jsondata)
+                dataLayer = json.loads(jsondata) if jsondata else None
+
+                ozon_id = None
+                if dataLayer and dataLayer[0] and 'ecommerce' in dataLayer[0]:
+                    jsproduct = dataLayer[0]['ecommerce']['detail']['products'][0]
+                    ozon_id = as_unicode(jsproduct['id'])
+                    entry_title = as_unicode(jsproduct['name'])
+
+                    log.debug(u'ozon_id %s' % ozon_id)
+                    log.debug(u'entry_title %s' % entry_title)
+
+                    if ozon_id:
+                        metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
+                        identifiers['ozon'] = ozon_id
+                        self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={})
+
+                if not ozon_id:
+                    log.error('No SearchResults in Ozon.ru response found!')

        except Exception as e:
            log.exception('Failed to parse identify results')
@ -478,7 +495,7 @@ class Ozon(Source):
        langs_elem = doc.xpath(u'//div[contains(text(), "зык")]')
        if langs_elem:
            langs_elem = langs_elem[0].getnext()
-            langs = langs_elem.xpath(u'text()')[0].strip()
+            langs = langs_elem.xpath(u'text()')[0].strip() if langs_elem else None
        if langs:
            lng_splt = langs.split(u',')
            if lng_splt:
@ -576,9 +593,9 @@ def _format_isbn(log, isbn):  # {{{
            log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn)
    return res

-
 # }}}

+
 def _translageLanguageToCode(displayLang):  # {{{
    displayLang = unicode(displayLang).strip() if displayLang else None
    langTbl = {None: 'ru',