From fb970e24c628f6034bb243dab5a86c39b81c2c55 Mon Sep 17 00:00:00 2001
From: Roman Mukhin <ramses_ru@hotmail.com>
Date: Wed, 7 May 2014 22:04:37 +0200
Subject: [PATCH] Fix the OZON metadata download plugin to work after
 website changes.  Fixes #1300383 [Searching metadata using Ozon.ru failed
 with error](https://bugs.launchpad.net/calibre/+bug/1300383)

---
 src/calibre/ebooks/metadata/sources/ozon.py | 195 ++++++++++++++------
 1 file changed, 134 insertions(+), 61 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py
index 2e1e613df8..9a3b310b67 100644
--- a/src/calibre/ebooks/metadata/sources/ozon.py
+++ b/src/calibre/ebooks/metadata/sources/ozon.py
@@ -5,6 +5,8 @@ __license__ = 'GPL 3'
 __copyright__ = '2011-2013 Roman Mukhin <ramses_ru at hotmail.com>'
 __docformat__ = 'restructuredtext en'
 
+# To support bug fixes and further development, please donate bitcoins to 1E6CRSLY1uNstcZjLYZBHRVs1CPKbdi4ep
+
 import re
 from Queue import Queue, Empty
 
@@ -48,7 +50,8 @@ class Ozon(Source):
         ozon_id = identifiers.get('ozon', None)
         res = None
         if ozon_id:
-            url = '{}/context/detail/id/{}?partner={}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
+            # No affiliateId is used in the search/detail URLs
+            url = '{}/context/detail/id/{}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
             res = ('ozon', ozon_id, url)
         return res
     # }}}
@@ -57,13 +60,13 @@ class Ozon(Source):
         from urllib import quote_plus
 
         # div_book -> search only books, ebooks and audio books
-        search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
+        search_url = self.ozon_url + '/?context=search&group=div_book&text='
 
         # for ozon.ru search we have to format ISBN with '-'
         isbn = _format_isbn(log, identifiers.get('isbn', None))
         if isbn and not '-' in isbn:
             log.error("%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)"
-                      %(self.name, isbn))
+                      % (self.name, isbn))
             isbn = None
 
         ozonid = identifiers.get('ozon', None)
@@ -87,13 +90,13 @@ class Ozon(Source):
             return None
 
         search_url += quote_plus(searchText)
-        log.debug(u'search url: %r'%search_url)
+        log.debug(u'search url: %r' % search_url)
         return search_url
     # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,
-            identifiers={}, timeout=60):  # {{{
-        from lxml import etree
+            identifiers={}, timeout=90):  # {{{
+        from lxml import html, etree
         from calibre.ebooks.chardet import xml_to_unicode
 
         if not self.is_configured():
@@ -108,26 +111,65 @@ class Ozon(Source):
             raw = self.browser.open_novisit(query).read()
 
         except Exception as e:
-            log.exception(u'Failed to make identify query: %r'%query)
+            log.exception(u'Failed to make identify query: %r' % query)
             return as_unicode(e)
 
         try:
-            parser = etree.XMLParser(recover=True, no_network=True)
-            feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
-            entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
+            doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])            
+            entries = doc.xpath(u'//div[@class="SearchResults"]//div[@itemprop="itemListElement"]')            
+            
             if entries:
+                #for entry in entries:
+                #    log.debug('entries %s' % etree.tostring(entry))
                 metadata = self.get_metadata(log, entries, title, authors, identifiers)
                 self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
+            else:
+                mainentry = doc.xpath(u'//div[contains(@class, "details-main")]')
+                if mainentry: 
+                    metadata = self.get_metadata_from_detail(log, mainentry[0], title, authors, identifiers)
+                    ozon_id = unicode(metadata.identifiers['ozon'])
+                    self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, {ozon_id : doc})
+                else:    
+                    log.error('No SearchResults/itemListElement entries in Ozon.ru responce found')
+                    
         except Exception as e:
             log.exception('Failed to parse identify results')
             return as_unicode(e)
+    # }}}
 
+    def get_metadata_from_detail(self, log, entry, title, authors, identifiers):  # {{{
+        title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())'))
+        #log.debug(u'Title (from_detail): -----> %s' % title)
+
+        author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
+        #log.debug(u'Author (from_detail): -----> %s' % author)
+        
+        norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
+        mi = Metadata(title, norm_authors)
+
+        ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
+        if ozon_id:
+            #log.debug(u'ozon_id (from_detail): -----> %s' % ozon_id)
+            mi.identifiers = {'ozon':ozon_id}
+
+        mi.ozon_cover_url = None
+        cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
+        if cover:
+            mi.ozon_cover_url = _translateToBigCoverUrl(cover)
+            #log.debug(u'mi.ozon_cover_url  (from_detail): -----> %s' % mi.ozon_cover_url)
+
+        mi.rating = self.get_rating(entry)
+        #log.debug(u'mi.rating  (from_detail): -----> %s' % mi.rating)
+        if not mi.rating:
+            log.debug('No rating (from_detail) found. ozon_id:%s'%ozon_id)        
+        
+        return mi
     # }}}
 
     def get_metadata(self, log, entries, title, authors, identifiers):  # {{{
         # some book titles have extra characters like this
         # TODO: make a twick
-        #reRemoveFromTitle = None
+        # reRemoveFromTitle = None
         reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
 
         title = unicode(title).upper() if title else ''
@@ -136,7 +178,7 @@ class Ozon(Source):
         authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.upper, map(unicode, authors))) if authors else None
         ozon_id = identifiers.get('ozon', None)
-        #log.debug(u'ozonid: ', ozon_id)
+        # log.debug(u'ozonid: ', ozon_id)
 
         unk = unicode(_('Unknown')).upper()
 
@@ -149,7 +191,7 @@ class Ozon(Source):
         def in_authors(authors, miauthors):
             for author in authors:
                 for miauthor in miauthors:
-                    #log.debug(u'=> %s <> %s'%(author, miauthor))
+                    # log.debug(u'=> %s <> %s'%(author, miauthor))
                     if author in miauthor:
                         return True
             return None
@@ -199,14 +241,14 @@ class Ozon(Source):
 
             if not strict_match or relevance > 0:
                 metadata.append(mi)
-                #log.debug(u'added metadata %s %s.'%(mi.title,  mi.authors))
+                # log.debug(u'added metadata %s %s.'%(mi.title,  mi.authors))
             else:
                 log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
-                          %(mi.title, u' '.join(mi.authors), relevance))
+                          % (mi.title, u' '.join(mi.authors), relevance))
         return metadata
     # }}}
 
-    def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout):  # {{{
+    def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict = {}):  # {{{
         req_isbn = identifiers.get('isbn', None)
 
         for mi in metadata:
@@ -216,13 +258,13 @@ class Ozon(Source):
                 ozon_id = mi.identifiers['ozon']
 
                 try:
-                    self.get_book_details(log, mi, timeout)
+                    self.get_book_details(log, mi, timeout, cachedPagesDict[ozon_id] if cachedPagesDict and cachedPagesDict.has_key(ozon_id) else None)
                 except:
-                    log.exception(u'Failed to get details for metadata: %s'%mi.title)
+                    log.exception(u'Failed to get details for metadata: %s' % mi.title)
 
                 all_isbns = getattr(mi, 'all_isbns', [])
                 if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
-                    log.debug(u'skipped, no requested ISBN %s found'%req_isbn)
+                    log.debug(u'skipped, no requested ISBN %s found' % req_isbn)
                     continue
 
                 for isbn in all_isbns:
@@ -234,44 +276,67 @@ class Ozon(Source):
                 self.clean_downloaded_metadata(mi)
                 result_queue.put(mi)
             except:
-                log.exception(u'Failed to get details for metadata: %s'%mi.title)
+                log.exception(u'Failed to get details for metadata: %s' % mi.title)
     # }}}
 
     def to_metadata(self, log, entry):  # {{{
-        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
+        title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
+        #log.debug(u'Title: -----> %s' % title)
 
-        title = entry.xpath(xp_template.format('Name'))
-        author = entry.xpath(xp_template.format('Author'))
+        author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
+        #log.debug(u'Author: -----> %s' % author)
+        
         norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
         mi = Metadata(title, norm_authors)
 
-        ozon_id = entry.xpath(xp_template.format('ID'))
-        mi.identifiers = {'ozon':ozon_id}
-
-        mi.comments = entry.xpath(xp_template.format('Annotation'))
+        ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
+        if ozon_id:
+            mi.identifiers = {'ozon':ozon_id}
+            #log.debug(u'ozon_id: -----> %s' % ozon_id)
 
         mi.ozon_cover_url = None
-        cover = entry.xpath(xp_template.format('Picture'))
+        cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
+        #log.debug(u'cover: -----> %s' % cover)
         if cover:
             mi.ozon_cover_url = _translateToBigCoverUrl(cover)
+            #log.debug(u'mi.ozon_cover_url: -----> %s' % mi.ozon_cover_url)
 
-        pub_year = entry.xpath(xp_template.format('Year'))
+        pub_year = None
         if pub_year:
             mi.pubdate = toPubdate(log, pub_year)
-            #log.debug('pubdate %s'%mi.pubdate)
+            #log.debug('pubdate %s' % mi.pubdate)
 
-        rating = entry.xpath(xp_template.format('ClientRatingValue'))
-        if rating:
-            try:
-                #'rating',     A floating point number between 0 and 10
-                # OZON raion N of 5, calibre of 10, but there is a bug? in identify
-                mi.rating = float(rating)
-            except:
-                pass
-            rating
+        mi.rating = self.get_rating(entry)
+        #if not mi.rating:
+        #    log.debug('No rating found. ozon_id:%s'%ozon_id)        
+            
         return mi
     # }}}
 
+    def get_rating(self, entry):  # {{{
+        ozon_rating = None
+        try:
+            xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])'
+            rating = None
+            if entry.xpath(xp_rating_template % 'm5'):
+                rating = 5.
+            elif entry.xpath(xp_rating_template % 'm4'):
+                rating = 4.
+            elif entry.xpath(xp_rating_template % 'm3'):
+                rating = 3.
+            elif entry.xpath(xp_rating_template % 'm2'):
+                rating = 2.
+            elif entry.xpath(xp_rating_template % 'm1'):
+                rating = 1.
+            if rating:
+                # 'rating',     A floating point number between 0 and 10
+                # OZON rating is N out of 5, calibre's is out of 10, but there is a bug? in identify
+                ozon_rating = float(rating)
+        except:
+            pass
+        return ozon_rating
+    # }}}
+
     def get_cached_cover_url(self, identifiers):  # {{{
         url = None
         ozon_id = identifiers.get('ozon', None)
@@ -317,20 +382,27 @@ class Ozon(Source):
             if cdata:
                 result_queue.put((self, cdata))
         except Exception as e:
-            log.exception(u'Failed to download cover from: %s'%cached_url)
+            log.exception(u'Failed to download cover from: %s' % cached_url)
             return as_unicode(e)
     # }}}
 
-    def get_book_details(self, log, metadata, timeout):  # {{{
+    def get_book_details(self, log, metadata, timeout, cachedPage):  # {{{
         from lxml import html, etree
         from calibre.ebooks.chardet import xml_to_unicode
 
-        url = self.get_book_url(metadata.get_identifiers())[2]
+        if not cachedPage:
+            url = self.get_book_url(metadata.get_identifiers())[2]
+            #log.debug(u'book_details_url', url)
 
-        raw = self.browser.open_novisit(url, timeout=timeout).read()
-        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
+            raw = self.browser.open_novisit(url, timeout=timeout).read()
+            fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
+        else:
+            fulldoc = cachedPage   
+            #log.debug(u'book_details -> using cached page')
+        
+        doc = fulldoc.xpath(u'//div[@id="PageContent"][1]')[0]
 
-        xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
+        xpt_tmpl_base = u'.//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
         xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)'
 
         # series Серия/Серии
@@ -342,25 +414,26 @@ class Ozon(Source):
         xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')'
         isbn_str = doc.xpath(xpt_isbn % u'ISBN')
         if isbn_str:
-            #log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str))
+            # log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str))
             all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
             if all_isbns:
                 metadata.all_isbns = all_isbns
                 metadata.isbn = all_isbns[0]
-        #log.debug(u'ISBN: ', metadata.isbn)
+        # log.debug(u'ISBN: ', metadata.isbn)
 
         publishers = doc.xpath(xpt_tmpl_a % u'Издатель')
         if publishers:
             metadata.publisher = publishers
-        #log.debug(u'Publisher: ', metadata.publisher)
+        # log.debug(u'Publisher: ', metadata.publisher)
 
-        xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")'
+        xpt_lang = u'substring-after(normalize-space(.//text()[contains(normalize-space(.), "%s")]), ":")'
         displ_lang = None
         langs = doc.xpath(xpt_lang % u'Язык')
         if langs:
             lng_splt = langs.split(u',')
             if lng_splt:
                 displ_lang = lng_splt[0].strip()
+                #log.debug(u'displ_lang1: ', displ_lang)
         metadata.language = _translageLanguageToCode(displ_lang)
         #log.debug(u'Language: ', metadata.language)
 
@@ -372,10 +445,10 @@ class Ozon(Source):
                 matcher = re.search(r'\d{4}', yearIn)
                 if matcher:
                     metadata.pubdate = toPubdate(log, matcher.group(0))
-        #log.debug(u'Pubdate: ', metadata.pubdate)
+        # log.debug(u'Pubdate: ', metadata.pubdate)
 
         # overwrite comments from HTML if any
-        xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'  # noqa
+        xpt = u'.//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'  # noqa
         from lxml.etree import ElementBase
         comment_elem = doc.xpath(xpt)
         if comment_elem:
@@ -401,18 +474,17 @@ def _verifyISBNIntegrity(log, isbn):  # {{{
     # Online ISBN-Check http://www.isbn-check.de/
     res = check_isbn(isbn)
     if not res:
-        log.error(u'ISBN integrity check failed for "%s"'%isbn)
+        log.error(u'ISBN integrity check failed for "%s"' % isbn)
     return res is not None
 # }}}
 
 # TODO: make customizable
 def _translateToBigCoverUrl(coverUrl):  # {{{
-    # http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif
-    # http://www.ozon.ru/multimedia/books_covers/1002986468.jpg
-
-    m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl)
+    # //static.ozone.ru/multimedia/c200/1005748980.jpg
+    # http://www.ozon.ru/multimedia/books_covers/1009493080.jpg
+    m = re.match(r'.+\/([^\.\\]+).+$', coverUrl)
     if m:
-        coverUrl = m.group(1) + m.group(2) + 'jpg'
+        coverUrl = 'http://www.ozon.ru/multimedia/books_covers/' + m.group(1) + '.jpg' 
     return coverUrl
 # }}}
 
@@ -459,13 +531,14 @@ def _format_isbn(log, isbn):  # {{{
         if m:
             res = '-'.join([g for g in m.groups() if g])
         else:
-            log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported'%isbn)
+            log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn)
     return res
 # }}}
 
 def _translageLanguageToCode(displayLang):  # {{{
     displayLang = unicode(displayLang).strip() if displayLang else None
-    langTbl = {None: 'ru',
+    langTbl = { None: 'ru',
+                u'Русский': 'ru',
                 u'Немецкий': 'de',
                 u'Английский': 'en',
                 u'Французский': 'fr',
@@ -475,7 +548,7 @@ def _translageLanguageToCode(displayLang):  # {{{
                 u'Японский': 'ja',
                 u'Финский' : 'fi',
                 u'Польский' : 'pl',
-                u'Украинский' : 'uk',}
+                u'Украинский' : 'uk', }
     return langTbl.get(displayLang, None)
 # }}}
 
@@ -502,7 +575,7 @@ def toPubdate(log, yearAsString):  # {{{
         try:
             res = parse_only_date(u"01.01." + yearAsString)
         except:
-            log.error('cannot parse to date %s'%yearAsString)
+            log.error('cannot parse to date %s' % yearAsString)
     return res
 # }}}