Fix incorrectly detected encoding of comments when downloading metadata from ozon.ru

2025-06-23 15:30:45 -04:00 · 2011-11-19 20:37:33 +05:30 · 2011-11-19 20:37:33 +05:30 · 17f9da26c4
commit 17f9da26c4
parent b9765b8f52 5d6706b5fe
4 changed files with 39 additions and 24 deletions
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -710,7 +710,8 @@ class Metadata(object):
            fmt('Title sort', self.title_sort)
        if self.authors:
            fmt('Author(s)',  authors_to_string(self.authors) + \
-               ((' [' + self.author_sort + ']') if self.author_sort else ''))
+               ((' [' + self.author_sort + ']') 
+                if self.author_sort and self.author_sort != _('Unknown') else ''))
        if self.publisher:
            fmt('Publisher', self.publisher)
        if getattr(self, 'book_producer', False):
--- a/src/calibre/ebooks/metadata/sources/ozon.py
+++ b/src/calibre/ebooks/metadata/sources/ozon.py
@@ -11,7 +11,7 @@ import datetime
 from urllib import quote_plus
 from Queue import Queue, Empty
 from lxml import etree, html
-from calibre import as_unicode
+from calibre import prints, as_unicode

 from calibre.ebooks.chardet import xml_to_unicode

@@ -54,7 +54,8 @@ class Ozon(Source):
    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
        # div_book -> search only books, ebooks and audio books
        search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
-
+        
+        # for ozon.ru search we have to format ISBN with '-'
        isbn = _format_isbn(log, identifiers.get('isbn', None))
        # TODO: format isbn!
        qItems = set([isbn, title])
@@ -64,7 +65,7 @@ class Ozon(Source):
        qItems.discard('')
        qItems = map(_quoteString, qItems)

-        q = ' '.join(qItems).strip()
+        q = u' '.join(qItems).strip()
        log.info(u'search string: ' + q)

        if isinstance(q, unicode):
@@ -78,13 +79,13 @@ class Ozon(Source):
        return search_url
    # }}}

-    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
-            identifiers={}, timeout=30):
+    def identify(self, log, result_queue, abort, title=None, authors=None, 
+            identifiers={}, timeout=30): # {{{
        if not self.is_configured():
            return
        query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
        if not query:
-            err = 'Insufficient metadata to construct query'
+            err = u'Insufficient metadata to construct query'
            log.error(err)
            return err

@@ -109,7 +110,7 @@ class Ozon(Source):
    # }}}

    def get_metadata(self, log, entries, title, authors, identifiers): # {{{
-        # some book titles have extra charactes like this
+        # some book titles have extra characters like this
        # TODO: make a twick
        reRemoveFromTitle = None 
        #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
@@ -160,7 +161,7 @@ class Ozon(Source):
            mi.source_relevance = i
            if ensure_metadata_match(mi):
                metadata.append(mi)
-                # log.debug(u'added metadata %s %s. '%(mi.title, mi.authors))
+                #log.debug(u'added metadata %s %s.'%(mi.title,  mi.authors))
            else:
                log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
        return metadata
@@ -285,12 +286,12 @@ class Ozon(Source):
        url = self.get_book_url(metadata.get_identifiers())[2]

        raw = self.browser.open_novisit(url, timeout=timeout).read()
-        doc = html.fromstring(raw)
+        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])

        xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
        xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'

-        # series
+        # series Серия/Серии
        xpt = xpt_prod_det_at % u'Сери'
        # % u'Серия:'
        series = doc.xpath(xpt)
@@ -300,7 +301,7 @@ class Ozon(Source):
        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
        isbn_str = doc.xpath(xpt)
        if isbn_str:
-            all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
+            all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
            if all_isbns:
                metadata.all_isbns = all_isbns
                metadata.isbn = all_isbns[0]
@@ -333,10 +334,10 @@ class Ozon(Source):
        xpt = u'//table[@id="detail_description"]//tr/td'
        comment_elem = doc.xpath(xpt)
        if comment_elem:
-            comments = unicode(etree.tostring(comment_elem[0]))
+            comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
            if comments:
                # cleanup root tag, TODO: remove tags like object/embeded
-                comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
+                comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
                    metadata.comments = comments
                else:
@@ -345,8 +346,16 @@ class Ozon(Source):
            log.debug('No book description found in HTML')
    # }}}

-def _quoteString(str): # {{{
-    return '"' + str + '"' if str and str.find(' ') != -1 else str
+def _quoteString(strToQuote): # {{{
+    return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
+# }}}
+
+def _verifyISBNIntegrity(log, isbn): # {{{
+    # Online ISBN-Check http://www.isbn-check.de/
+    res = check_isbn(isbn)
+    if not res:
+        log.error(u'ISBN integrity check failed for "%s"'%isbn)
+    return res is not None
 # }}}

 # TODO: make customizable
@@ -438,7 +447,7 @@ def _normalizeAuthorNameWithInitials(name): # {{{
    return res
 # }}}

-def toPubdate(log, yearAsString):
+def toPubdate(log, yearAsString): # {{{
    res = None
    if yearAsString:
        try:
@@ -448,7 +457,11 @@ def toPubdate(log, yearAsString):
        except:
            log.error('cannot parse to date %s'%yearAsString)
    return res
+# }}}

+def _listToUnicodePrintStr(lst): # {{{
+    return u'[' + u', '.join(unicode(x) for x in lst) + u']'
+# }}}

 if __name__ == '__main__': # tests {{{
    # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
--- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py
+++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py
@@ -77,7 +77,8 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):

        result = False
        with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
+            raw = xml_to_unicode(f.read(), verbose=True)[0]
+            doc = html.fromstring(raw)
            
            # example where we are going to find formats
            # <div class="l">
@@ -88,7 +89,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
            # <div class="l">
            #     <p>.epub, .fb2.zip, .pdf</p>
            # </div>
-            xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
+            xpt = u'normalize-space(//div[contains(@id, "saleBlock")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
            formats = doc.xpath(xpt)
            if formats:
                result = True
--- a/src/calibre/translations/ru.po
+++ b/src/calibre/translations/ru.po
@@ -12539,7 +12539,7 @@ msgstr "За&грузить метаданные"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:226
 msgid "Configure download metadata"
-msgstr ""
+msgstr "Настроить загрузку метаданных"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:230
 msgid "Change how calibre downloads metadata"
@@ -12595,7 +12595,7 @@ msgstr "&Пользовательские метаданные"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:788
 msgid "&Comments"
-msgstr "Комментарии"
+msgstr "&Комментарии"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single.py:854
 msgid "Basic metadata"
@@ -12603,11 +12603,11 @@ msgstr "Основные метаданные"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
 msgid "Has cover"
-msgstr "Есть обложка"
+msgstr "Обложка"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:133
 msgid "Has summary"
-msgstr ""
+msgstr "Аннотация"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:190
 msgid ""
@@ -12619,7 +12619,7 @@ msgstr ""

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:268
 msgid "See at"
-msgstr ""
+msgstr "Посмотреть на"

 #: /home/kovid/work/calibre/src/calibre/gui2/metadata/single_download.py:403
 msgid "calibre is downloading metadata from: "