Get Books: Fix ozon.ru

2025-07-09 03:04:10 -04:00 · 2012-07-02 10:17:54 +05:30 · 2012-07-02 10:17:54 +05:30 · 343d8f448c
commit 343d8f448c
parent 5a2848bacb
2 changed files with 80 additions and 54 deletions
--- a/src/calibre/ebooks/metadata/sources/ozon.py
+++ b/src/calibre/ebooks/metadata/sources/ozon.py
@ -54,30 +54,35 @@ class Ozon(Source):

        # for ozon.ru search we have to format ISBN with '-'
        isbn = _format_isbn(log, identifiers.get('isbn', None))
-        # TODO: format isbn!
-        qItems = set([isbn, title])
-        if authors:
-            qItems |= frozenset(authors)
-        qItems.discard(None)
-        qItems.discard('')
-        qItems = map(_quoteString, qItems)
-
-        q = u' '.join(qItems).strip()
-        log.info(u'search string: ' + q)
-
-        if isinstance(q, unicode):
-            q = q.encode('utf-8')
-        if not q:
-            return None
-
-        search_url += quote_plus(q)
+        ozonid = identifiers.get('ozon', None)
+        
+        unk = unicode(_('Unknown')).upper()
+        if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
+            qItems = set([isbn, title])
+            if authors:
+                qItems |= frozenset(authors)
+            qItems.discard(None)
+            qItems.discard('')
+            qItems = map(_quoteString, qItems)
+    
+            q = u' '.join(qItems).strip()
+            log.info(u'search string: ' + q)
+    
+            if isinstance(q, unicode):
+                q = q.encode('utf-8')
+            if not q:
+                return None
+    
+            search_url += quote_plus(q)
+        else:
+            search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid
+            
        log.debug(u'search url: %r'%search_url)
-
        return search_url
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None,
-            identifiers={}, timeout=30): # {{{
+            identifiers={}, timeout=60): # {{{
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode

@ -99,7 +104,7 @@ class Ozon(Source):
        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
-            entries = feed.xpath('//*[local-name() = "SearchItems"]')
+            entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
            if entries:
                metadata = self.get_metadata(log, entries, title, authors, identifiers)
                self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
@ -112,8 +117,8 @@ class Ozon(Source):
    def get_metadata(self, log, entries, title, authors, identifiers): # {{{
        # some book titles have extra characters like this
        # TODO: make a twick
-        reRemoveFromTitle = None
-        #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
+        #reRemoveFromTitle = None
+        reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')

        title = unicode(title).upper() if title else ''
        if reRemoveFromTitle:
@ -163,7 +168,7 @@ class Ozon(Source):
                metadata.append(mi)
                #log.debug(u'added metadata %s %s.'%(mi.title,  mi.authors))
            else:
-                log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
+                log.debug(u'skipped metadata %s %s. (does not match the query)'%(unicode(mi.title), mi.authors))
        return metadata
    # }}}

@ -301,7 +306,7 @@ class Ozon(Source):
        if series:
            metadata.series = series

-        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
+        xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])'
        isbn_str = doc.xpath(xpt)
        if isbn_str:
            all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
@ -326,7 +331,7 @@ class Ozon(Source):

        # can be set before from xml search responce
        if not metadata.pubdate:
-            xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
+            xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
            yearIn = doc.xpath(xpt)
            if yearIn:
                matcher = re.search(r'\d{4}', yearIn)
@ -334,17 +339,20 @@ class Ozon(Source):
                    metadata.pubdate = toPubdate(log, matcher.group(0))

        # overwrite comments from HTML if any
-        xpt = u'//table[@id="detail_description"]//tr/td'
+        xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'
+        from lxml.etree import ElementBase
        comment_elem = doc.xpath(xpt)
        if comment_elem:
-            comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
-            if comments:
-                # cleanup root tag, TODO: remove tags like object/embeded
-                comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
-                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
-                    metadata.comments = comments
-                else:
-                    log.debug('HTML book description skipped in favour of search service xml responce')
+            comments = u''
+            for node in comment_elem:
+                if isinstance(node, ElementBase):
+                    comments += unicode(etree.tostring(node, encoding=unicode))
+                elif isinstance(node, basestring) and node.strip():
+                    comments += unicode(node) + u'\n'
+            if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
+                metadata.comments = comments
+            else:
+                log.debug('HTML book description skipped in favour of search service xml responce')
        else:
            log.debug('No book description found in HTML')
    # }}}
@ -430,7 +438,8 @@ def _translageLanguageToCode(displayLang): # {{{
                u'Китайский': 'zh',
                u'Японский': 'ja',
                u'Финский' : 'fi',
-                u'Польский' : 'pl',}
+                u'Польский' : 'pl',
+                u'Украинский' : 'uk',}
    return langTbl.get(displayLang, None)
 # }}}

@ -454,7 +463,7 @@ def toPubdate(log, yearAsString): # {{{
    res = None
    if yearAsString:
        try:
-            res = parse_only_date(yearAsString)
+            res = parse_only_date(u"01.01." + yearAsString)
        except:
            log.error('cannot parse to date %s'%yearAsString)
    return res
--- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py
+++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py
@ -46,30 +46,37 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
            d.set_tags(self.config.get('tags', ''))
            d.exec_()        
        
-
-    def search(self, query, max_results=10, timeout=60):
+    def search(self, query, max_results=15, timeout=60):
        search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
                    'searchText=%s&searchContext=ebook' % urllib2.quote(query)
+        search_urls = [ search_url ]
+        
+        ## add this as the first try if it looks like an ozon ID
+        if re.match("^\d{6,9}$", query):
+            ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
+            search_urls.insert(0, ozon_detail)
+
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
-                    
        counter = max_results
        br = browser()
-        with closing(br.open(search_url, timeout=timeout)) as f:
-            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
-            doc = etree.fromstring(raw)
-            for data in doc.xpath('//*[local-name() = "SearchItems"]'):
-                if counter <= 0:
-                    break
-                counter -= 1
+        
+        for url in search_urls:
+            with closing(br.open(url, timeout=timeout)) as f:
+                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
+                doc = etree.fromstring(raw)
+                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
+                    if counter <= 0:
+                        break
+                    counter -= 1

-                s = SearchResult()
-                s.detail_item = data.xpath(xp_template.format('ID'))
-                s.title = data.xpath(xp_template.format('Name'))
-                s.author = data.xpath(xp_template.format('Author'))
-                s.price = data.xpath(xp_template.format('Price'))
-                s.cover_url = data.xpath(xp_template.format('Picture'))
-                s.price = format_price_in_RUR(s.price)
-                yield s
+                    s = SearchResult()
+                    s.detail_item = data.xpath(xp_template.format('ID'))
+                    s.title = data.xpath(xp_template.format('Name'))
+                    s.author = data.xpath(xp_template.format('Author'))
+                    s.price = data.xpath(xp_template.format('Price'))
+                    s.cover_url = data.xpath(xp_template.format('Picture'))
+                    s.price = format_price_in_RUR(s.price)
+                    yield s

    def get_details(self, search_result, timeout=60):
        url = self.shop_url + '/context/detail/id/' + urllib2.quote(search_result.detail_item)
@ -97,6 +104,16 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
                search_result.formats = ', '.join(_parse_ebook_formats(formats))
                # unfortunately no direct links to download books (only buy link)
                # search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item)
+            
+            #<p class="main-cost"><span class="main">215</span><span class="submain">00</span> руб.</p>
+            #<span itemprop="price" class="hidden">215.00</span>
+            #<meta itemprop="priceCurrency" content="RUR " />
+            
+            # if the price is not in the search result (the ID search case)
+            if not search_result.price:
+                price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
+                search_result.price = format_price_in_RUR(price)
+                
        return result

 def format_price_in_RUR(price):