Update ozon.ru metadata download plugin to fix searching for books by ISBN

This commit is contained in:
Kovid Goyal 2016-10-14 08:59:40 +05:30
parent 53c5ff5daa
commit 3b644906a4

View File

@ -84,13 +84,15 @@ class Ozon(Source):
# Added Russian variant of 'Unknown'
unk = [_('Unknown').upper(), 'Неизв.'.upper(), icu_upper('Неизв.')]
if title and title not in unk:
qItems.add(title)
# use only ozonid if specified otherwise ozon.ru does not like a combination
if not ozonid:
if title and title not in unk:
qItems.add(title)
if authors:
for auth in authors:
if icu_upper(auth) not in unk:
qItems.add(auth)
if authors:
for auth in authors:
if icu_upper(auth) not in unk:
qItems.add(auth)
qItems.discard(None)
qItems.discard('')
@ -102,7 +104,7 @@ class Ozon(Source):
return None
search_url += quote_plus(searchText)
log.debug(u'search url: %r' % search_url)
log.debug(u'search url: %s' % search_url)
return search_url
# }}}
@ -112,6 +114,7 @@ class Ozon(Source):
from calibre.ebooks.chardet import xml_to_unicode
from HTMLParser import HTMLParser
from lxml import etree, html
import json
if not self.is_configured():
return
@ -131,8 +134,11 @@ class Ozon(Source):
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
entries_block = doc.xpath(u'//div[@class="bSearchResult"]')
# log.debug(u'HTML: %s' % xml_to_unicode(raw, verbose=True)[0])
if entries_block:
entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
# log.debug(u'entries_block')
# for entry in entries:
# log.debug('entries %s' % entree.tostring(entry))
metadata = self.get_metadata(log, entries, title, authors, identifiers)
@ -140,19 +146,30 @@ class Ozon(Source):
else:
# Redirect page: trying to extract ozon_id from javascript data
h = HTMLParser()
entry_string = (h.unescape(unicode(etree.tostring(doc, pretty_print=True))))
id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
# result containing ozon_id and entry_title
entry_info = re.search(id_title_pat, entry_string)
ozon_id = entry_info.group(1) if entry_info else None
entry_title = entry_info.group(2) if entry_info else None
entry_string = (h.unescape(etree.tostring(doc, pretty_print=True, encoding=unicode)))
json_pat = re.compile(u'dataLayer\s*=\s*(.+)?;')
json_info = re.search(json_pat, entry_string)
jsondata = json_info.group(1) if json_info else None
if ozon_id:
metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
identifiers['ozon'] = ozon_id
self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={})
else:
log.error('No SearchResults in Ozon.ru response found')
# log.debug(u'jsondata: %s' % jsondata)
dataLayer = json.loads(jsondata) if jsondata else None
ozon_id = None
if dataLayer and dataLayer[0] and 'ecommerce' in dataLayer[0]:
jsproduct = dataLayer[0]['ecommerce']['detail']['products'][0]
ozon_id = as_unicode(jsproduct['id'])
entry_title = as_unicode(jsproduct['name'])
log.debug(u'ozon_id %s' % ozon_id)
log.debug(u'entry_title %s' % entry_title)
if ozon_id:
metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
identifiers['ozon'] = ozon_id
self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={})
if not ozon_id:
log.error('No SearchResults in Ozon.ru response found!')
except Exception as e:
log.exception('Failed to parse identify results')
@ -478,7 +495,7 @@ class Ozon(Source):
langs_elem = doc.xpath(u'//div[contains(text(), "зык")]')
if langs_elem:
langs_elem = langs_elem[0].getnext()
langs = langs_elem.xpath(u'text()')[0].strip()
langs = langs_elem.xpath(u'text()')[0].strip() if langs_elem else None
if langs:
lng_splt = langs.split(u',')
if lng_splt:
@ -576,9 +593,9 @@ def _format_isbn(log, isbn): # {{{
log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn)
return res
# }}}
def _translageLanguageToCode(displayLang): # {{{
displayLang = unicode(displayLang).strip() if displayLang else None
langTbl = {None: 'ru',