Fix the Ozon metadata plugin so it can again download metadata from ozon.ru after changes to the website. Fixes #1300383 [Searching metadata using Ozon.ru failed with error](https://bugs.launchpad.net/calibre/+bug/1300383)

This commit is contained in:
Roman Mukhin 2014-05-07 22:04:37 +02:00 committed by Kovid Goyal
parent 35e585bd0d
commit fb970e24c6

View File

@ -5,6 +5,8 @@ __license__ = 'GPL 3'
__copyright__ = '2011-2013 Roman Mukhin <ramses_ru at hotmail.com>'
__docformat__ = 'restructuredtext en'
# To ensure bugfix and development please donate bitcoins to 1E6CRSLY1uNstcZjLYZBHRVs1CPKbdi4ep
import re
from Queue import Queue, Empty
@ -48,7 +50,8 @@ class Ozon(Source):
ozon_id = identifiers.get('ozon', None)
res = None
if ozon_id:
url = '{}/context/detail/id/{}?partner={}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
#no affiliateId is used in search/detail
url = '{}/context/detail/id/{}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
res = ('ozon', ozon_id, url)
return res
# }}}
@ -57,7 +60,7 @@ class Ozon(Source):
from urllib import quote_plus
# div_book -> search only books, ebooks and audio books
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
search_url = self.ozon_url + '/?context=search&group=div_book&text='
# for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None))
@ -92,8 +95,8 @@ class Ozon(Source):
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=60): # {{{
from lxml import etree
identifiers={}, timeout=90): # {{{
from lxml import html, etree
from calibre.ebooks.chardet import xml_to_unicode
if not self.is_configured():
@ -112,16 +115,55 @@ class Ozon(Source):
return as_unicode(e)
try:
parser = etree.XMLParser(recover=True, no_network=True)
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
entries = doc.xpath(u'//div[@class="SearchResults"]//div[@itemprop="itemListElement"]')
if entries:
#for entry in entries:
# log.debug('entries %s' % etree.tostring(entry))
metadata = self.get_metadata(log, entries, title, authors, identifiers)
self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
else:
mainentry = doc.xpath(u'//div[contains(@class, "details-main")]')
if mainentry:
metadata = self.get_metadata_from_detail(log, mainentry[0], title, authors, identifiers)
ozon_id = unicode(metadata.identifiers['ozon'])
self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, {ozon_id : doc})
else:
log.error('No SearchResults/itemListElement entries in Ozon.ru responce found')
except Exception as e:
log.exception('Failed to parse identify results')
return as_unicode(e)
# }}}
def get_metadata_from_detail(self, log, entry, title, authors, identifiers): # {{{
    """Build a Metadata object by scraping an ozon.ru book-detail page
    element (used when the search redirects straight to a detail page).
    """
    # Title and author are taken from the detail-page markup; the
    # incoming ``title``/``authors`` arguments are only search hints.
    found_title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())'))
    found_author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))

    # The author string may hold several comma-separated names.
    authors_list = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(found_author).split(u',')))
    mi = Metadata(found_title, authors_list)

    # The ozon id is embedded in the /context/detail/id/<id>/ link.
    ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
    if ozon_id:
        mi.identifiers = {'ozon':ozon_id}

    cover_src = entry.xpath(u'normalize-space(.//img[1]/@src)')
    mi.ozon_cover_url = _translateToBigCoverUrl(cover_src) if cover_src else None

    mi.rating = self.get_rating(entry)
    if not mi.rating:
        log.debug('No rating (from_detail) found. ozon_id:%s'%ozon_id)
    return mi
# }}}
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
@ -206,7 +248,7 @@ class Ozon(Source):
return metadata
# }}}
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict = {}): # {{{
req_isbn = identifiers.get('isbn', None)
for mi in metadata:
@ -216,7 +258,7 @@ class Ozon(Source):
ozon_id = mi.identifiers['ozon']
try:
self.get_book_details(log, mi, timeout)
self.get_book_details(log, mi, timeout, cachedPagesDict[ozon_id] if cachedPagesDict and cachedPagesDict.has_key(ozon_id) else None)
except:
log.exception(u'Failed to get details for metadata: %s' % mi.title)
@ -238,38 +280,61 @@ class Ozon(Source):
# }}}
def to_metadata(self, log, entry): # {{{
    """Convert one search-result entry into a calibre Metadata object.

    ``entry`` is an lxml element for a div[@itemprop="itemListElement"]
    from the ozon.ru search-results page.  Only data visible in the
    result snippet is filled in; comments/pubdate come later from the
    detail page (see get_book_details).
    """
    title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
    author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))

    # The author string may hold several comma-separated names.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    # The ozon id is embedded in the /context/detail/id/<id>/ link.
    ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
    if ozon_id:
        mi.identifiers = {'ozon':ozon_id}

    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    # The new search page no longer exposes a publication year in the
    # snippet; kept as a placeholder so the pubdate path is documented.
    pub_year = None
    if pub_year:
        mi.pubdate = toPubdate(log, pub_year)

    mi.rating = self.get_rating(entry)
    return mi
# }}}
def get_rating(self, entry): # {{{
    """Extract the user rating from an ozon.ru page element.

    Ozon renders the rating as a CSS class m1..m5 on a div whose class
    also contains "bStars".  Returns a float in 1.0-5.0, or None when
    no rating is present or the element cannot be inspected.
    """
    ozon_rating = None
    try:
        xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])'
        # Probe the star classes from highest to lowest; first hit wins.
        for stars in (5, 4, 3, 2, 1):
            if entry.xpath(xp_rating_template % (u'm%d' % stars)):
                # 'rating', A floating point number between 0 and 10
                # OZON gives N of 5, calibre uses 0-10; the raw 0-5
                # value is kept here as before -- TODO confirm whether
                # identify should scale it.
                ozon_rating = float(stars)
                break
    except Exception:
        # Best-effort: a malformed entry simply yields no rating.
        pass
    return ozon_rating
# }}}
def get_cached_cover_url(self, identifiers): # {{{
@ -321,16 +386,23 @@ class Ozon(Source):
return as_unicode(e)
# }}}
def get_book_details(self, log, metadata, timeout): # {{{
def get_book_details(self, log, metadata, timeout, cachedPage): # {{{
from lxml import html, etree
from calibre.ebooks.chardet import xml_to_unicode
if not cachedPage:
url = self.get_book_url(metadata.get_identifiers())[2]
#log.debug(u'book_details_url', url)
raw = self.browser.open_novisit(url, timeout=timeout).read()
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
else:
fulldoc = cachedPage
#log.debug(u'book_details -> using cached page')
xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
doc = fulldoc.xpath(u'//div[@id="PageContent"][1]')[0]
xpt_tmpl_base = u'.//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)'
# series Серия/Серии
@ -354,13 +426,14 @@ class Ozon(Source):
metadata.publisher = publishers
# log.debug(u'Publisher: ', metadata.publisher)
xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")'
xpt_lang = u'substring-after(normalize-space(.//text()[contains(normalize-space(.), "%s")]), ":")'
displ_lang = None
langs = doc.xpath(xpt_lang % u'Язык')
if langs:
lng_splt = langs.split(u',')
if lng_splt:
displ_lang = lng_splt[0].strip()
#log.debug(u'displ_lang1: ', displ_lang)
metadata.language = _translageLanguageToCode(displ_lang)
#log.debug(u'Language: ', metadata.language)
@ -375,7 +448,7 @@ class Ozon(Source):
# log.debug(u'Pubdate: ', metadata.pubdate)
# overwrite comments from HTML if any
xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa
xpt = u'.//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa
from lxml.etree import ElementBase
comment_elem = doc.xpath(xpt)
if comment_elem:
@ -407,12 +480,11 @@ def _verifyISBNIntegrity(log, isbn): # {{{
# TODO: make customizable
def _translateToBigCoverUrl(coverUrl): # {{{
# http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif
# http://www.ozon.ru/multimedia/books_covers/1002986468.jpg
m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl)
# //static.ozone.ru/multimedia/c200/1005748980.jpg
# http://www.ozon.ru/multimedia/books_covers/1009493080.jpg
m = re.match(r'.+\/([^\.\\]+).+$', coverUrl)
if m:
coverUrl = m.group(1) + m.group(2) + 'jpg'
coverUrl = 'http://www.ozon.ru/multimedia/books_covers/' + m.group(1) + '.jpg'
return coverUrl
# }}}
@ -466,6 +538,7 @@ def _format_isbn(log, isbn): # {{{
def _translageLanguageToCode(displayLang): # {{{
displayLang = unicode(displayLang).strip() if displayLang else None
langTbl = { None: 'ru',
u'Русский': 'ru',
u'Немецкий': 'de',
u'Английский': 'en',
u'Французский': 'fr',