Fix bug processing author names with initials when downloading metadata from ozon.ru. Fixes #845420 (Problems with processing metadata in plugin ozon.ru)

This commit is contained in:
Kovid Goyal 2011-09-14 17:03:43 -06:00
commit c8a78a83bc
4 changed files with 109 additions and 45 deletions

View File

@ -11,7 +11,7 @@ __docformat__ = 'restructuredtext en'
Input plugin for HTML or OPF ebooks. Input plugin for HTML or OPF ebooks.
''' '''
import os, re, sys, uuid, tempfile, errno import os, re, sys, uuid, tempfile
from urlparse import urlparse, urlunparse from urlparse import urlparse, urlunparse
from urllib import unquote from urllib import unquote
from functools import partial from functools import partial

View File

@ -116,7 +116,8 @@ def cap_author_token(token):
lt = lower(token) lt = lower(token)
if lt in ('von', 'de', 'el', 'van', 'le'): if lt in ('von', 'de', 'el', 'van', 'le'):
return lt return lt
if re.match(r'([a-z]\.){2,}$', lt) is not None: # no digits, no special characters
if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
# Normalize tokens of the form J.K. to J. K. # Normalize tokens of the form J.K. to J. K.
parts = token.split('.') parts = token.split('.')
return '. '.join(map(capitalize, parts)).strip() return '. '.join(map(capitalize, parts)).strip()

View File

@ -28,7 +28,7 @@ class Ozon(Source):
touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
'publisher', 'pubdate', 'comments', 'series', 'rating', 'language']) 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
# Test purpose only, test function does not like when sometimes some fields are empty # Test purpose only, test function does not like when sometimes some fields are empty
#touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
# 'publisher', 'pubdate', 'comments']) # 'publisher', 'pubdate', 'comments'])
supports_gzip_transfer_encoding = True supports_gzip_transfer_encoding = True
@ -109,8 +109,16 @@ class Ozon(Source):
# }}} # }}}
def get_metadata(self, log, entries, title, authors, identifiers): # {{{ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra characters like this
# TODO: make a tweak
reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
title = unicode(title).upper() if title else '' title = unicode(title).upper() if title else ''
authors = map(unicode.upper, map(unicode, authors)) if authors else None if reRemoveFromTitle:
title = reRemoveFromTitle.sub('', title)
authors = map(_normalizeAuthorNameWithInitials,
map(unicode.upper, map(unicode, authors))) if authors else None
ozon_id = identifiers.get('ozon', None) ozon_id = identifiers.get('ozon', None)
unk = unicode(_('Unknown')).upper() unk = unicode(_('Unknown')).upper()
@ -124,6 +132,7 @@ class Ozon(Source):
def in_authors(authors, miauthors): def in_authors(authors, miauthors):
for author in authors: for author in authors:
for miauthor in miauthors: for miauthor in miauthors:
#log.debug(u'=> %s <> %s'%(author, miauthor))
if author in miauthor: return True if author in miauthor: return True
return None return None
@ -131,7 +140,10 @@ class Ozon(Source):
match = True match = True
if title: if title:
mititle = unicode(mi.title).upper() if mi.title else '' mititle = unicode(mi.title).upper() if mi.title else ''
if reRemoveFromTitle:
mititle = reRemoveFromTitle.sub('', mititle)
match = title in mititle match = title in mititle
#log.debug(u't=> %s <> %s'%(title, mititle))
if match and authors: if match and authors:
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else [] miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
match = in_authors(authors, miauthors) match = in_authors(authors, miauthors)
@ -190,7 +202,8 @@ class Ozon(Source):
title = entry.xpath(xp_template.format('Name')) title = entry.xpath(xp_template.format('Name'))
author = entry.xpath(xp_template.format('Author')) author = entry.xpath(xp_template.format('Author'))
mi = Metadata(title, author.split(',')) norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
mi = Metadata(title, norm_authors)
ozon_id = entry.xpath(xp_template.format('ID')) ozon_id = entry.xpath(xp_template.format('ID'))
mi.identifiers = {'ozon':ozon_id} mi.identifiers = {'ozon':ozon_id}
@ -202,6 +215,11 @@ class Ozon(Source):
if cover: if cover:
mi.ozon_cover_url = _translateToBigCoverUrl(cover) mi.ozon_cover_url = _translateToBigCoverUrl(cover)
pub_year = entry.xpath(xp_template.format('Year'))
if pub_year:
mi.pubdate = toPubdate(log, pub_year)
#log.debug('pubdate %s'%mi.pubdate)
rating = entry.xpath(xp_template.format('ClientRatingValue')) rating = entry.xpath(xp_template.format('ClientRatingValue'))
if rating: if rating:
try: try:
@ -269,13 +287,17 @@ class Ozon(Source):
raw = self.browser.open_novisit(url, timeout=timeout).read() raw = self.browser.open_novisit(url, timeout=timeout).read()
doc = html.fromstring(raw) doc = html.fromstring(raw)
xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
# series # series
xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)' xpt = xpt_prod_det_at % u'Сери'
# % u'Серия:'
series = doc.xpath(xpt) series = doc.xpath(xpt)
if series: if series:
metadata.series = series metadata.series = series
xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")' xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
isbn_str = doc.xpath(xpt) isbn_str = doc.xpath(xpt)
if isbn_str: if isbn_str:
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)] all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
@ -283,38 +305,42 @@ class Ozon(Source):
metadata.all_isbns = all_isbns metadata.all_isbns = all_isbns
metadata.isbn = all_isbns[0] metadata.isbn = all_isbns[0]
xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]' xpt = xpt_prod_det_at % u'Издатель'
publishers = doc.xpath(xpt) publishers = doc.xpath(xpt)
if publishers: if publishers:
metadata.publisher = publishers[0].text metadata.publisher = publishers
xpt = u'string(../text()[contains(., "г.")])' displ_lang = None
yearIn = publishers[0].xpath(xpt) xpt = xpt_prod_det_tx % u'Язык'
langs = doc.xpath(xpt)
if langs:
lng_splt = langs.split(u',')
if lng_splt:
displ_lang = lng_splt[0].strip()
metadata.language = _translageLanguageToCode(displ_lang)
#log.debug(u'language: %s'%displ_lang)
# may have been set already from the xml search response
if not metadata.pubdate:
xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
yearIn = doc.xpath(xpt)
if yearIn: if yearIn:
matcher = re.search(r'\d{4}', yearIn) matcher = re.search(r'\d{4}', yearIn)
if matcher: if matcher:
year = int(matcher.group(0)) metadata.pubdate = toPubdate(log, matcher.group(0))
# only year is available, so use 1-st of Jan
metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py
#metadata.pubdate = datetime(year, 1, 1)
xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
displLang = publishers[0].xpath(xpt)
lang_code =_translageLanguageToCode(displLang)
if lang_code:
metadata.language = lang_code
# overwrite comments from HTML if any # overwrite comments from HTML if any
# tr/td[contains(.//text(), "От издателя")] -> does not work, why? xpt = u'//table[@id="detail_description"]//tr/td'
xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
comment_elem = doc.xpath(xpt) comment_elem = doc.xpath(xpt)
if comment_elem: if comment_elem:
comments = unicode(etree.tostring(comment_elem[0])) comments = unicode(etree.tostring(comment_elem[0]))
if comments: if comments:
# cleanup root tag, TODO: remove tags like object/embeded # cleanup root tag, TODO: remove tags like object/embeded
comments = re.sub(r'^<td.+?>|</td>.+?$', u'', comments).strip() comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
if comments: if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
metadata.comments = comments metadata.comments = comments
else:
log.debug('HTML book description skipped in favour of search service xml responce')
else: else:
log.debug('No book description found in HTML') log.debug('No book description found in HTML')
# }}} # }}}
@ -390,10 +416,40 @@ def _translageLanguageToCode(displayLang): # {{{
u'Итальянский': 'it', u'Итальянский': 'it',
u'Испанский': 'es', u'Испанский': 'es',
u'Китайский': 'zh', u'Китайский': 'zh',
u'Японский': 'ja' } u'Японский': 'ja',
u'Финский' : 'fi',
u'Польский' : 'pl',}
return langTbl.get(displayLang, None) return langTbl.get(displayLang, None)
# }}} # }}}
# [В.П. Колесников | Колесников В.П.] -> В. П. Колесников
def _normalizeAuthorNameWithInitials(name): # {{{
res = name
if name:
re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$'
re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
matcher = re.match(re1, unicode(name), re.UNICODE)
if not matcher:
matcher = re.match(re2, unicode(name), re.UNICODE)
if matcher:
d = matcher.groupdict()
res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
return res
# }}}
def toPubdate(log, yearAsString):
    """Convert a publication-year string (e.g. u'2005') to a datetime.

    Only the year is available from the ozon.ru search response, so
    January 1st is used as a placeholder month/day (a bare year fails
    comparisons in identify.py).  Returns None for empty input or when
    the string cannot be parsed; parse failures are logged.
    """
    res = None
    if yearAsString:
        try:
            year = int(yearAsString)
            # only year is available, so use 1-st of Jan
            res = datetime.datetime(year, 1, 1)
        # narrowed from a bare `except:` so that unrelated programming
        # errors are no longer silently swallowed; OverflowError covers
        # absurdly large year values rejected by datetime
        except (ValueError, TypeError, OverflowError):
            log.error('cannot parse to date %s'%yearAsString)
    return res
if __name__ == '__main__': # tests {{{ if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
# comment some touched_fields before run thoses tests # comment some touched_fields before run thoses tests
@ -403,7 +459,12 @@ if __name__ == '__main__': # tests {{{
test_identify_plugin(Ozon.name, test_identify_plugin(Ozon.name,
[ [
# (
# {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
# 'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
# [title_test(u'Норвежский язык: Практический курс', exact=True),
# authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
# ),
( (
{'identifiers':{'isbn': '9785916572629'} }, {'identifiers':{'isbn': '9785916572629'} },
[title_test(u'На все четыре стороны', exact=True), [title_test(u'На все четыре стороны', exact=True),

View File

@ -80,13 +80,15 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
doc = html.fromstring(f.read()) doc = html.fromstring(f.read())
# example where we are going to find formats # example where we are going to find formats
# <div class="box"> # <div class="l">
# ... # <p>
# <b>Доступные&nbsp;форматы:</b> # Доступно:
# <div class="vertpadd">.epub, .fb2, .pdf, .pdf, .txt</div> # </p>
# ...
# </div> # </div>
xpt = u'normalize-space(//div[@class="box"]//*[contains(normalize-space(text()), "Доступные форматы:")][1]/following-sibling::div[1]/text())' # <div class="l">
# <p>.epub, .fb2.zip, .pdf</p>
# </div>
xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
formats = doc.xpath(xpt) formats = doc.xpath(xpt)
if formats: if formats:
result = True result = True