From fb970e24c628f6034bb243dab5a86c39b81c2c55 Mon Sep 17 00:00:00 2001 From: Roman Mukhin Date: Wed, 7 May 2014 22:04:37 +0200 Subject: [PATCH] Fix metadata plugin to download metadata from OZON for website changes. Fixes #1300383 [Searching metadata using Ozon.ru failed with error](https://bugs.launchpad.net/calibre/+bug/1300383) --- src/calibre/ebooks/metadata/sources/ozon.py | 195 ++++++++++++++------ 1 file changed, 134 insertions(+), 61 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index 2e1e613df8..9a3b310b67 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -5,6 +5,8 @@ __license__ = 'GPL 3' __copyright__ = '2011-2013 Roman Mukhin ' __docformat__ = 'restructuredtext en' +# To ensure bugfix and development please donate bitcoins to 1E6CRSLY1uNstcZjLYZBHRVs1CPKbdi4ep + import re from Queue import Queue, Empty @@ -48,7 +50,8 @@ class Ozon(Source): ozon_id = identifiers.get('ozon', None) res = None if ozon_id: - url = '{}/context/detail/id/{}?partner={}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId()) + #no affiliateId is used in search/detail + url = '{}/context/detail/id/{}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId()) res = ('ozon', ozon_id, url) return res # }}} @@ -57,13 +60,13 @@ class Ozon(Source): from urllib import quote_plus # div_book -> search only books, ebooks and audio books - search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' + search_url = self.ozon_url + '/?context=search&group=div_book&text=' # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) if isbn and not '-' in isbn: log.error("%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)" - %(self.name, isbn)) + % (self.name, isbn)) isbn = None ozonid = identifiers.get('ozon', None) @@ -87,13 +90,13 @@ class Ozon(Source): return None search_url += quote_plus(searchText) - log.debug(u'search url: %r'%search_url) + log.debug(u'search url: %r' % search_url) return search_url # }}} def identify(self, log, result_queue, abort, title=None, authors=None, - identifiers={}, timeout=60): # {{{ - from lxml import etree + identifiers={}, timeout=90): # {{{ + from lxml import html, etree from calibre.ebooks.chardet import xml_to_unicode if not self.is_configured(): @@ -108,26 +111,65 @@ class Ozon(Source): raw = self.browser.open_novisit(query).read() except Exception as e: - log.exception(u'Failed to make identify query: %r'%query) + log.exception(u'Failed to make identify query: %r' % query) return as_unicode(e) try: - parser = etree.XMLParser(recover=True, no_network=True) - feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser) - entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]') + doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) + entries = doc.xpath(u'//div[@class="SearchResults"]//div[@itemprop="itemListElement"]') + if entries: + #for entry in entries: + # log.debug('entries %s' % etree.tostring(entry)) metadata = self.get_metadata(log, entries, title, authors, identifiers) self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout) + else: + mainentry = doc.xpath(u'//div[contains(@class, "details-main")]') + if mainentry: + metadata = self.get_metadata_from_detail(log, mainentry[0], title, authors, identifiers) + ozon_id = unicode(metadata.identifiers['ozon']) + self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, {ozon_id : doc}) + else: + log.error('No SearchResults/itemListElement entries in Ozon.ru responce found') + except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) + # }}} + def get_metadata_from_detail(self, log, entry, title, authors, identifiers): # {{{ + title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())')) + #log.debug(u'Tile (from_detail): -----> %s' % title) + + author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())')) + #log.debug(u'Author (from_detail): -----> %s' % author) + + norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u','))) + mi = Metadata(title, norm_authors) + + ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")') + if ozon_id: + #log.debug(u'ozon_id (from_detail): -----> %s' % ozon_id) + mi.identifiers = {'ozon':ozon_id} + + mi.ozon_cover_url = None + cover = entry.xpath(u'normalize-space(.//img[1]/@src)') + if cover: + mi.ozon_cover_url = _translateToBigCoverUrl(cover) + #log.debug(u'mi.ozon_cover_url (from_detail): -----> %s' % mi.ozon_cover_url) + + mi.rating = self.get_rating(entry) + #log.debug(u'mi.rating (from_detail): -----> %s' % mi.rating) + if not mi.rating: + log.debug('No rating (from_detail) found. ozon_id:%s'%ozon_id) + + return mi # }}} def get_metadata(self, log, entries, title, authors, identifiers): # {{{ # some book titles have extra characters like this # TODO: make a twick - #reRemoveFromTitle = None + # reRemoveFromTitle = None reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') title = unicode(title).upper() if title else '' @@ -136,7 +178,7 @@ class Ozon(Source): authors = map(_normalizeAuthorNameWithInitials, map(unicode.upper, map(unicode, authors))) if authors else None ozon_id = identifiers.get('ozon', None) - #log.debug(u'ozonid: ', ozon_id) + # log.debug(u'ozonid: ', ozon_id) unk = unicode(_('Unknown')).upper() @@ -149,7 +191,7 @@ class Ozon(Source): def in_authors(authors, miauthors): for author in authors: for miauthor in miauthors: - #log.debug(u'=> %s <> %s'%(author, miauthor)) + # log.debug(u'=> %s <> %s'%(author, miauthor)) if author in miauthor: return True return None @@ -199,14 +241,14 @@ class Ozon(Source): if not strict_match or relevance > 0: metadata.append(mi) - #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) + # log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) else: log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)' - %(mi.title, u' '.join(mi.authors), relevance)) + % (mi.title, u' '.join(mi.authors), relevance)) return metadata # }}} - def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{ + def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict = {}): # {{{ req_isbn = identifiers.get('isbn', None) for mi in metadata: @@ -216,13 +258,13 @@ class Ozon(Source): ozon_id = mi.identifiers['ozon'] try: - self.get_book_details(log, mi, timeout) + self.get_book_details(log, mi, timeout, cachedPagesDict[ozon_id] if cachedPagesDict and cachedPagesDict.has_key(ozon_id) else None) except: - log.exception(u'Failed to get details for metadata: %s'%mi.title) + log.exception(u'Failed to get details for metadata: %s' % mi.title) all_isbns = getattr(mi, 'all_isbns', []) if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns: - log.debug(u'skipped, no requested ISBN %s found'%req_isbn) + log.debug(u'skipped, no requested ISBN %s found' % req_isbn) continue for isbn in all_isbns: @@ -234,44 +276,67 @@ class Ozon(Source): self.clean_downloaded_metadata(mi) result_queue.put(mi) except: - log.exception(u'Failed to get details for metadata: %s'%mi.title) + log.exception(u'Failed to get details for metadata: %s' % mi.title) # }}} def to_metadata(self, log, entry): # {{{ - xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())' + title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())')) + #log.debug(u'Tile: -----> %s' % title) - title = entry.xpath(xp_template.format('Name')) - author = entry.xpath(xp_template.format('Author')) + author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())')) + #log.debug(u'Author: -----> %s' % author) + norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u','))) mi = Metadata(title, norm_authors) - ozon_id = entry.xpath(xp_template.format('ID')) - mi.identifiers = {'ozon':ozon_id} - - mi.comments = entry.xpath(xp_template.format('Annotation')) + ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")') + if ozon_id: + mi.identifiers = {'ozon':ozon_id} + #log.debug(u'ozon_id: -----> %s' % ozon_id) mi.ozon_cover_url = None - cover = entry.xpath(xp_template.format('Picture')) + cover = entry.xpath(u'normalize-space(.//img[1]/@src)') + #log.debug(u'cover: -----> %s' % cover) if cover: mi.ozon_cover_url = _translateToBigCoverUrl(cover) + #log.debug(u'mi.ozon_cover_url: -----> %s' % mi.ozon_cover_url) - pub_year = entry.xpath(xp_template.format('Year')) + pub_year = None if pub_year: mi.pubdate = toPubdate(log, pub_year) - #log.debug('pubdate %s'%mi.pubdate) + #log.debug('pubdate %s' % mi.pubdate) - rating = entry.xpath(xp_template.format('ClientRatingValue')) - if rating: - try: - #'rating', A floating point number between 0 and 10 - # OZON raion N of 5, calibre of 10, but there is a bug? in identify - mi.rating = float(rating) - except: - pass - rating + mi.rating = self.get_rating(entry) + #if not mi.rating: + # log.debug('No rating found. ozon_id:%s'%ozon_id) + return mi # }}} + def get_rating(self, entry): # {{{ + ozon_rating = None + try: + xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])' + rating = None + if entry.xpath(xp_rating_template % 'm5'): + rating = 5. + elif entry.xpath(xp_rating_template % 'm4'): + rating = 4. + elif entry.xpath(xp_rating_template % 'm3'): + rating = 3. + elif entry.xpath(xp_rating_template % 'm2'): + rating = 2. + elif entry.xpath(xp_rating_template % 'm1'): + rating = 1. + if rating: + # 'rating', A floating point number between 0 and 10 + # OZON raion N of 5, calibre of 10, but there is a bug? in identify + ozon_rating = float(rating) + except: + pass + return ozon_rating + # }}} + def get_cached_cover_url(self, identifiers): # {{{ url = None ozon_id = identifiers.get('ozon', None) @@ -317,20 +382,27 @@ class Ozon(Source): if cdata: result_queue.put((self, cdata)) except Exception as e: - log.exception(u'Failed to download cover from: %s'%cached_url) + log.exception(u'Failed to download cover from: %s' % cached_url) return as_unicode(e) # }}} - def get_book_details(self, log, metadata, timeout): # {{{ + def get_book_details(self, log, metadata, timeout, cachedPage): # {{{ from lxml import html, etree from calibre.ebooks.chardet import xml_to_unicode - url = self.get_book_url(metadata.get_identifiers())[2] + if not cachedPage: + url = self.get_book_url(metadata.get_identifiers())[2] + #log.debug(u'book_details_url', url) - raw = self.browser.open_novisit(url, timeout=timeout).read() - doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) + raw = self.browser.open_novisit(url, timeout=timeout).read() + fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) + else: + fulldoc = cachedPage + #log.debug(u'book_details -> using cached page') + + doc = fulldoc.xpath(u'//div[@id="PageContent"][1]')[0] - xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]' + xpt_tmpl_base = u'.//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]' xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)' # series Серия/Серии @@ -342,25 +414,26 @@ class Ozon(Source): xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')' isbn_str = doc.xpath(xpt_isbn % u'ISBN') if isbn_str: - #log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str)) + # log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str)) all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)] if all_isbns: metadata.all_isbns = all_isbns metadata.isbn = all_isbns[0] - #log.debug(u'ISBN: ', metadata.isbn) + # log.debug(u'ISBN: ', metadata.isbn) publishers = doc.xpath(xpt_tmpl_a % u'Издатель') if publishers: metadata.publisher = publishers - #log.debug(u'Publisher: ', metadata.publisher) + # log.debug(u'Publisher: ', metadata.publisher) - xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")' + xpt_lang = u'substring-after(normalize-space(.//text()[contains(normalize-space(.), "%s")]), ":")' displ_lang = None langs = doc.xpath(xpt_lang % u'Язык') if langs: lng_splt = langs.split(u',') if lng_splt: displ_lang = lng_splt[0].strip() + #log.debug(u'displ_lang1: ', displ_lang) metadata.language = _translageLanguageToCode(displ_lang) #log.debug(u'Language: ', metadata.language) @@ -372,10 +445,10 @@ class Ozon(Source): matcher = re.search(r'\d{4}', yearIn) if matcher: metadata.pubdate = toPubdate(log, matcher.group(0)) - #log.debug(u'Pubdate: ', metadata.pubdate) + # log.debug(u'Pubdate: ', metadata.pubdate) # overwrite comments from HTML if any - xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa + xpt = u'.//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa from lxml.etree import ElementBase comment_elem = doc.xpath(xpt) if comment_elem: @@ -401,18 +474,17 @@ def _verifyISBNIntegrity(log, isbn): # {{{ # Online ISBN-Check http://www.isbn-check.de/ res = check_isbn(isbn) if not res: - log.error(u'ISBN integrity check failed for "%s"'%isbn) + log.error(u'ISBN integrity check failed for "%s"' % isbn) return res is not None # }}} # TODO: make customizable def _translateToBigCoverUrl(coverUrl): # {{{ - # http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif - # http://www.ozon.ru/multimedia/books_covers/1002986468.jpg - - m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl) + # //static.ozone.ru/multimedia/c200/1005748980.jpg + # http://www.ozon.ru/multimedia/books_covers/1009493080.jpg + m = re.match(r'.+\/([^\.\\]+).+$', coverUrl) if m: - coverUrl = m.group(1) + m.group(2) + 'jpg' + coverUrl = 'http://www.ozon.ru/multimedia/books_covers/' + m.group(1) + '.jpg' return coverUrl # }}} @@ -459,13 +531,14 @@ def _format_isbn(log, isbn): # {{{ if m: res = '-'.join([g for g in m.groups() if g]) else: - log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported'%isbn) + log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn) return res # }}} def _translageLanguageToCode(displayLang): # {{{ displayLang = unicode(displayLang).strip() if displayLang else None - langTbl = {None: 'ru', + langTbl = { None: 'ru', + u'Русский': 'ru', u'Немецкий': 'de', u'Английский': 'en', u'Французский': 'fr', @@ -475,7 +548,7 @@ def _translageLanguageToCode(displayLang): # {{{ u'Японский': 'ja', u'Финский' : 'fi', u'Польский' : 'pl', - u'Украинский' : 'uk',} + u'Украинский' : 'uk', } return langTbl.get(displayLang, None) # }}} @@ -502,7 +575,7 @@ def toPubdate(log, yearAsString): # {{{ try: res = parse_only_date(u"01.01." + yearAsString) except: - log.error('cannot parse to date %s'%yearAsString) + log.error('cannot parse to date %s' % yearAsString) return res # }}}