Get Books: Fix ozon.ru

Kovid Goyal 2012-07-02 10:17:54 +05:30
parent 5a2848bacb
commit 343d8f448c
2 changed files with 80 additions and 54 deletions


@@ -54,30 +54,35 @@ class Ozon(Source):
         # for ozon.ru search we have to format ISBN with '-'
         isbn = _format_isbn(log, identifiers.get('isbn', None))
-        # TODO: format isbn!
-        qItems = set([isbn, title])
-        if authors:
-            qItems |= frozenset(authors)
-        qItems.discard(None)
-        qItems.discard('')
-        qItems = map(_quoteString, qItems)
-
-        q = u' '.join(qItems).strip()
-        log.info(u'search string: ' + q)
-
-        if isinstance(q, unicode):
-            q = q.encode('utf-8')
-        if not q:
-            return None
-
-        search_url += quote_plus(q)
+        ozonid = identifiers.get('ozon', None)
+
+        unk = unicode(_('Unknown')).upper()
+        if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
+            qItems = set([isbn, title])
+            if authors:
+                qItems |= frozenset(authors)
+            qItems.discard(None)
+            qItems.discard('')
+            qItems = map(_quoteString, qItems)
+
+            q = u' '.join(qItems).strip()
+            log.info(u'search string: ' + q)
+
+            if isinstance(q, unicode):
+                q = q.encode('utf-8')
+            if not q:
+                return None
+
+            search_url += quote_plus(q)
+        else:
+            search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid
         log.debug(u'search url: %r'%search_url)
         return search_url
     # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,
-            identifiers={}, timeout=30): # {{{
+            identifiers={}, timeout=60): # {{{
         from lxml import etree
         from calibre.ebooks.chardet import xml_to_unicode

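The core of the fix is in the hunk above: when the only usable identifier is an ozon ID (title and authors unknown, no ISBN), the plugin now queries the ItemDetail web service directly instead of running a free-text search. Below is a minimal standalone sketch of that selection logic, not calibre code: the search endpoint shown is borrowed from the store plugin further down (the metadata source builds its own search_url outside this hunk), the localized 'Unknown' handling is omitted, and the sample ozon ID is made up.

    # Standalone sketch (Python 2) of the URL-selection branch added above.
    from urllib import quote_plus

    OZON_URL = 'http://www.ozon.ru'  # stands in for self.ozon_url

    def build_search_url(title, authors, identifiers):
        isbn = identifiers.get('isbn')
        ozonid = identifiers.get('ozon')
        if title or authors or isbn or not ozonid:
            # regular case: free-text search over whatever is known
            q = u' '.join(filter(None, [isbn, title] + (authors or []))).strip()
            if not q:
                return None
            return OZON_URL + '/webservice/webservice.asmx/SearchWebService?searchText=' + \
                quote_plus(q.encode('utf-8'))
        # nothing to search for except the ozon ID: fetch that record directly
        return OZON_URL + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid

    print(build_search_url(None, None, {'ozon': '5702331'}))   # ItemDetail URL (made-up ID)
    print(build_search_url(u'Anna Karenina', None, {}))        # SearchWebService URL
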
@@ -99,7 +104,7 @@ class Ozon(Source):
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
             feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
-            entries = feed.xpath('//*[local-name() = "SearchItems"]')
+            entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
             if entries:
                 metadata = self.get_metadata(log, entries, title, authors, identifiers)
                 self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)

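The broadened XPath works because local-name() ignores namespaces, so a single expression matches entries of both response kinds. A small sketch with made-up XML (element names as in the diff; namespace URI and content invented):

    # Sketch only: two invented response fragments, one per web service.
    from lxml import etree

    search_feed = etree.fromstring(
        '<SearchResult xmlns="urn:example"><SearchItems><Name>Book A</Name></SearchItems></SearchResult>')
    detail_feed = etree.fromstring(
        '<Result xmlns="urn:example"><ItemDetail><Name>Book B</Name></ItemDetail></Result>')

    xp = '//*[local-name()="SearchItems" or local-name()="ItemDetail"]'
    for feed in (search_feed, detail_feed):
        for entry in feed.xpath(xp):
            print(entry.xpath('normalize-space(./*[local-name()="Name"]/text())'))
    # prints: Book A, then Book B
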
@ -112,8 +117,8 @@ class Ozon(Source):
def get_metadata(self, log, entries, title, authors, identifiers): # {{{ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra characters like this # some book titles have extra characters like this
# TODO: make a twick # TODO: make a twick
reRemoveFromTitle = None #reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
title = unicode(title).upper() if title else '' title = unicode(title).upper() if title else ''
if reRemoveFromTitle: if reRemoveFromTitle:
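Re-enabling reRemoveFromTitle means titles get their punctuation stripped before they are compared against search results; the substitution itself presumably sits just below the `if reRemoveFromTitle:` guard, outside this hunk. A quick illustration of what the pattern removes (example title is made up):

    # Illustration only.
    import re

    reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
    title = u'war & peace: vol. 1'.upper()
    print(reRemoveFromTitle.sub('', title))   # WAR  PEACE VOL 1
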
@ -163,7 +168,7 @@ class Ozon(Source):
metadata.append(mi) metadata.append(mi)
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
else: else:
log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors)) log.debug(u'skipped metadata %s %s. (does not match the query)'%(unicode(mi.title), mi.authors))
return metadata return metadata
# }}} # }}}
@@ -301,7 +306,7 @@ class Ozon(Source):
         if series:
             metadata.series = series

-        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
+        xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])'
         isbn_str = doc.xpath(xpt)
         if isbn_str:
             all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]

@ -326,7 +331,7 @@ class Ozon(Source):
# can be set before from xml search responce # can be set before from xml search responce
if not metadata.pubdate: if not metadata.pubdate:
xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])' xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
yearIn = doc.xpath(xpt) yearIn = doc.xpath(xpt)
if yearIn: if yearIn:
matcher = re.search(r'\d{4}', yearIn) matcher = re.search(r'\d{4}', yearIn)
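The reworked publication-date XPath now reads the product-detail block and discards everything before the first semicolon, so the four-digit-year search below it only sees the trailing part. A sketch against an invented fragment (the real ozon.ru markup may differ):

    # -*- coding: utf-8 -*-
    # Sketch only: invented product-detail markup.
    import re
    from lxml import html

    doc = html.fromstring(u'<html><body>'
                          u'<div class="product-detail">Эксмо; 2010 г.</div>'
                          u'</body></html>')
    xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
    yearIn = doc.xpath(xpt)                       # u'2010 г.'
    print(re.search(r'\d{4}', yearIn).group(0))   # 2010
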
@@ -334,17 +339,20 @@ class Ozon(Source):
                 metadata.pubdate = toPubdate(log, matcher.group(0))

         # overwrite comments from HTML if any
-        xpt = u'//table[@id="detail_description"]//tr/td'
+        xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'
+        from lxml.etree import ElementBase
         comment_elem = doc.xpath(xpt)
         if comment_elem:
-            comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
-            if comments:
-                # cleanup root tag, TODO: remove tags like object/embeded
-                comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
-                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
-                    metadata.comments = comments
-                else:
-                    log.debug('HTML book description skipped in favour of search service xml responce')
+            comments = u''
+            for node in comment_elem:
+                if isinstance(node, ElementBase):
+                    comments += unicode(etree.tostring(node, encoding=unicode))
+                elif isinstance(node, basestring) and node.strip():
+                    comments += unicode(node) + u'\n'
+            if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
+                metadata.comments = comments
+            else:
+                log.debug('HTML book description skipped in favour of search service xml responce')
         else:
             log.debug('No book description found in HTML')
     # }}}

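The new description scraping selects node() children, so the result list mixes lxml elements with bare text nodes; that is why the loop serialises elements with etree.tostring() and appends text nodes as plain strings. A reduced sketch of that distinction (Python 2, matching the plugin; the fragment and its id are invented):

    # Sketch only.
    from lxml import etree, html
    from lxml.etree import ElementBase

    doc = html.fromstring('<html><body><div id="d">Intro text<b>Bold</b><p>More</p></div></body></html>')
    comments = u''
    for node in doc.xpath('//div[@id="d"]/node()[not(self::comment())]'):
        if isinstance(node, ElementBase):                      # element: keep its markup
            comments += unicode(etree.tostring(node, encoding=unicode))
        elif isinstance(node, basestring) and node.strip():    # text node: keep the text
            comments += unicode(node) + u'\n'
    print(comments)   # Intro text, then <b>Bold</b><p>More</p>
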
@@ -430,7 +438,8 @@ def _translageLanguageToCode(displayLang): # {{{
                u'Китайский': 'zh',
                u'Японский': 'ja',
                u'Финский' : 'fi',
-               u'Польский' : 'pl',}
+               u'Польский' : 'pl',
+               u'Украинский' : 'uk',}

     return langTbl.get(displayLang, None)
 # }}}

@@ -454,7 +463,7 @@ def toPubdate(log, yearAsString): # {{{
     res = None
     if yearAsString:
         try:
-            res = parse_only_date(yearAsString)
+            res = parse_only_date(u"01.01." + yearAsString)
         except:
             log.error('cannot parse to date %s'%yearAsString)
     return res

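Prefixing the year with "01.01." gives the parser an explicit day and month, so a bare year string is pinned to January 1st instead of depending on parser defaults. Roughly, with dateutil (which calibre's date utilities build on; calibre's own default handling may differ in detail):

    # Sketch only: shows the effect of the "01.01." prefix on a generic parser.
    from datetime import datetime
    from dateutil import parser

    default = datetime(2012, 7, 2)   # whatever "today" happens to be
    print(parser.parse('2010', default=default).date())        # 2010-07-02
    print(parser.parse('01.01.2010', default=default).date())  # 2010-01-01
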


@@ -46,30 +46,37 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
         d.set_tags(self.config.get('tags', ''))
         d.exec_()

-    def search(self, query, max_results=15, timeout=60):
+    def search(self, query, max_results=10, timeout=60):
         search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
             'searchText=%s&searchContext=ebook' % urllib2.quote(query)
+        search_urls = [ search_url ]

-        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
-
+        ## add this as the fist try if it looks like ozon ID
+        if re.match("^\d{6,9}$", query):
+            ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
+            search_urls.insert(0, ozon_detail)
+
+        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
         counter = max_results
         br = browser()
-        with closing(br.open(search_url, timeout=timeout)) as f:
-            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
-            doc = etree.fromstring(raw)
-            for data in doc.xpath('//*[local-name() = "SearchItems"]'):
-                if counter <= 0:
-                    break
-                counter -= 1

-                s = SearchResult()
-                s.detail_item = data.xpath(xp_template.format('ID'))
-                s.title = data.xpath(xp_template.format('Name'))
-                s.author = data.xpath(xp_template.format('Author'))
-                s.price = data.xpath(xp_template.format('Price'))
-                s.cover_url = data.xpath(xp_template.format('Picture'))
-                s.price = format_price_in_RUR(s.price)
-                yield s
+        for url in search_urls:
+            with closing(br.open(url, timeout=timeout)) as f:
+                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
+                doc = etree.fromstring(raw)
+                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
+                    if counter <= 0:
+                        break
+                    counter -= 1
+
+                    s = SearchResult()
+                    s.detail_item = data.xpath(xp_template.format('ID'))
+                    s.title = data.xpath(xp_template.format('Name'))
+                    s.author = data.xpath(xp_template.format('Author'))
+                    s.price = data.xpath(xp_template.format('Price'))
+                    s.cover_url = data.xpath(xp_template.format('Picture'))
+                    s.price = format_price_in_RUR(s.price)
+                    yield s

     def get_details(self, search_result, timeout=60):
         url = self.shop_url + '/context/detail/id/' + urllib2.quote(search_result.detail_item)

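On the store side the same idea appears as a pre-flight check: a query that is just 6-9 digits is treated as a probable ozon ID, and the ItemDetail URL is tried before the normal text search. A condensed sketch of the search_urls construction (the shop_url value is illustrative, URL-quoting of the query is omitted, and the sample queries are made up):

    # Sketch only.
    import re

    shop_url = 'http://www.ozon.ru'

    def build_search_urls(query):
        search_urls = [shop_url + '/webservice/webservice.asmx/SearchWebService?'
                       + 'searchText=%s&searchContext=ebook' % query]
        if re.match(r'^\d{6,9}$', query):   # looks like an ozon ID
            search_urls.insert(0, shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query)
        return search_urls

    print(build_search_urls('1234567'))   # ItemDetail first, then the text search
    print(build_search_urls('tolstoy'))   # text search only
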
@@ -97,6 +104,16 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
             search_result.formats = ', '.join(_parse_ebook_formats(formats))
             # unfortunately no direct links to download books (only buy link)
             # search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item)
+
+            #<p class="main-cost"><span class="main">215</span><span class="submain">00</span> руб.</p>
+            #<span itemprop="price" class="hidden">215.00</span>
+            #<meta itemprop="priceCurrency" content="RUR " />
+
+            # if the price not in the search result (the ID search case)
+            if not search_result.price:
+                price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
+                search_result.price = format_price_in_RUR(price)
+
         return result

 def format_price_in_RUR(price):

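When the ID-based ItemDetail search supplies no price, get_details now falls back to the itemprop="price" value embedded in the product page and then runs it through format_price_in_RUR() as before. A sketch of just that extraction, using markup shaped like the snippets quoted in the added comments (the page fragment itself is invented):

    # Sketch only.
    from lxml import html

    doc = html.fromstring('<html><body>'
                          '<span itemprop="price" class="hidden">215.00</span>'
                          '<meta itemprop="priceCurrency" content="RUR "/>'
                          '</body></html>')
    price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
    print(price)   # 215.00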