Changes from Roman Mukhin.

2026-05-13 10:42:17 -04:00 · 2011-08-18 17:02:36 -04:00
parent c3bf538fc0
commit 7e8ee1be41
4 changed files with 476 additions and 8 deletions
@@ -590,8 +590,9 @@ from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
 from calibre.ebooks.metadata.sources.isbndb import ISBNDB
 from calibre.ebooks.metadata.sources.overdrive import OverDrive
 from calibre.ebooks.metadata.sources.douban import Douban
+from calibre.ebooks.metadata.sources.ozon import Ozon

-plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
+plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon]

 # }}}

@@ -11,7 +11,7 @@ from functools import partial
 from base64 import b64decode
 from lxml import etree
 from calibre.utils.date import parse_date
-from calibre import guess_all_extensions, prints, force_unicode
+from calibre import guess_type, guess_all_extensions, prints, force_unicode
 from calibre.ebooks.metadata import MetaInformation, check_isbn
 from calibre.ebooks.chardet import xml_to_unicode

@@ -147,6 +147,15 @@ def _parse_cover_data(root, imgid, mi):
    if elm_binary:
        mimetype = elm_binary[0].get('content-type', 'image/jpeg')
        mime_extensions = guess_all_extensions(mimetype)
+        
+        if not mime_extensions and mimetype.startswith('image/'):
+            prints("WARNING: Unsupported or misspelled mime-type '%s'. "\
+                   "Trying to recovery mime-type from id_ref='%s'" % (mimetype, imgid) )
+            ctype = guess_type(imgid) # -> (mime-type, encoding)
+            mimetype_fromid = ctype[0]
+            if mimetype_fromid and mimetype_fromid.startswith('image/'):
+                mime_extensions = guess_all_extensions(mimetype_fromid)
+        
        if mime_extensions:
            pic_data = elm_binary[0].text
            if pic_data:
@@ -0,0 +1,445 @@
+# -*- coding: utf-8 -*-
+from __future__ import (unicode_literals, division, absolute_import, print_function)
+from xml.etree.ElementTree import _Element
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+import urllib2
+import datetime
+from urllib import quote_plus
+from Queue import Queue, Empty
+from lxml import etree, html
+from lxml.etree import ElementBase
+from calibre import as_unicode
+
+from calibre import prints
+from calibre.ebooks.chardet import xml_to_unicode
+
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.sources.base import Source
+from calibre.ebooks.metadata.book.base import Metadata
+
+class Ozon(Source):
+    name = 'OZON.ru'
+    description = _('Downloads metadata and covers from OZON.ru')
+
+    capabilities = frozenset(['identify', 'cover'])
+    
+    touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
+                               'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
+    # Test purpose only, test function does not like when sometimes some filed are empty
+    #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
+    #                          'publisher', 'pubdate', 'comments']) 
+
+    supports_gzip_transfer_encoding = True
+    has_html_comments = True
+    
+    ozon_url = 'http://www.ozon.ru'
+
+    # match any ISBN10/13. From "Regular Expressions Cookbook"
+    isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|'\
+             '[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?'\
+             '(?:[0-9]+[- ]?){2}[0-9X]'
+    isbnRegex = re.compile(isbnPattern)
+
+    def get_book_url(self, identifiers): # {{{
+        ozon_id = identifiers.get('ozon', None)
+        res = None
+        if ozon_id:
+            url = '{}/context/detail/id/{}?partner={}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
+            res = ('ozon', ozon_id, url)
+        return res
+    # }}}
+    
+    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
+        # div_book -> search only books, ebooks and audio books
+        search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
+        
+        isbn = _format_isbn(log, identifiers.get('isbn', None))
+        # TODO: format isbn!
+        qItems = set([isbn, title])
+        if authors:
+            qItems |= frozenset(authors)
+        qItems.discard(None)
+        qItems.discard('')
+        qItems = map(_quoteString, qItems)
+        
+        q = ' '.join(qItems).strip()
+        log.info(u'search string: ' + q)
+
+        if isinstance(q, unicode):
+            q = q.encode('utf-8')
+        if not q:
+            return None
+        
+        search_url += quote_plus(q)
+        log.debug(u'search url: %r'%search_url)
+        
+        return search_url
+    # }}}
+
+    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
+            identifiers={}, timeout=30):
+        if not self.is_configured():
+            return
+        query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
+        if not query:
+            err = 'Insufficient metadata to construct query'
+            log.error(err)
+            return err
+
+        try:
+            raw = self.browser.open_novisit(query).read()
+            
+        except Exception as e:
+            log.exception(u'Failed to make identify query: %r'%query)
+            return as_unicode(e)
+        
+        try:
+            parser = etree.XMLParser(recover=True, no_network=True)
+            feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
+            entries = feed.xpath('//*[local-name() = "SearchItems"]')
+            if entries:
+                metadata = self.get_metadata(log, entries, title, authors, identifiers)
+                self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
+        except Exception as e:
+            log.exception('Failed to parse identify results')
+            return as_unicode(e)
+
+    # }}}
+    
+    def get_metadata(self, log, entries, title, authors, identifiers): # {{{
+        title = unicode(title).upper() if title else ''
+        authors = map(unicode.upper, map(unicode, authors)) if authors else None
+        ozon_id = identifiers.get('ozon', None)
+        
+        unk = unicode(_('Unknown')).upper()
+        
+        if title == unk:
+            title = None
+
+        if authors == [unk]:
+            authors = None
+
+        def in_authors(authors, miauthors):
+            for author in authors:
+                for miauthor in miauthors:
+                    if author in miauthor: return True
+            return None
+        
+        def ensure_metadata_match(mi): # {{{
+            match = True
+            if title:
+                mititle = unicode(mi.title).upper() if mi.title else ''
+                match = title in mititle
+            if match and authors:
+                miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
+                match = in_authors(authors, miauthors)
+            
+            if match and ozon_id:
+                mozon_id = mi.identifiers['ozon']
+                match = ozon_id == mozon_id
+            
+            return match 
+        
+        metadata = []
+        for i, entry in enumerate(entries):
+            mi = self.to_metadata(log, entry)
+            mi.source_relevance = i
+            if ensure_metadata_match(mi):
+                metadata.append(mi)
+                # log.debug(u'added metadata %s %s. '%(mi.title, mi.authors))
+            else:
+                log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
+        return metadata
+    # }}}
+
+    def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{
+        req_isbn = identifiers.get('isbn', None)
+        
+        for mi in metadata:
+            if abort.is_set():
+                break
+            try:
+                ozon_id = mi.identifiers['ozon']
+                
+                try:
+                    self.get_book_details(log, mi, timeout)
+                except:
+                    log.exception(u'Failed to get details for metadata: %s'%mi.title)
+                
+                all_isbns = getattr(mi, 'all_isbns', [])
+                if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
+                    log.debug(u'skipped, no requested ISBN %s found'%req_isbn)
+                    continue
+                
+                for isbn in all_isbns:
+                    self.cache_isbn_to_identifier(isbn, ozon_id)
+                    
+                if mi.ozon_cover_url:
+                    self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)
+                    
+                self.clean_downloaded_metadata(mi)
+                result_queue.put(mi)
+            except:
+                log.exception(u'Failed to get details for metadata: %s'%mi.title)
+    # }}}
+    
+    def to_metadata(self, log, entry): # {{{
+        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
+        
+        title = entry.xpath(xp_template.format('Name'))
+        author = entry.xpath(xp_template.format('Author'))
+        mi = Metadata(title, author.split(','))
+        
+        ozon_id = entry.xpath(xp_template.format('ID'))
+        mi.identifiers = {'ozon':ozon_id}
+        
+        mi.comments = entry.xpath(xp_template.format('Annotation'))
+        
+        mi.ozon_cover_url = None
+        cover = entry.xpath(xp_template.format('Picture'))
+        if cover:
+            mi.ozon_cover_url = _translateToBigCoverUrl(cover) 
+            
+        rating = entry.xpath(xp_template.format('ClientRatingValue'))
+        if rating:
+            try:
+                #'rating',     A floating point number between 0 and 10
+                # OZON raion N of 5, calibre of 10, but there is a bug? in identify 
+                mi.rating = float(rating)
+            except:
+                pass
+            rating
+        return mi
+    # }}}
+    
+    def get_cached_cover_url(self, identifiers): # {{{
+        url = None
+        ozon_id = identifiers.get('ozon', None)
+        if ozon_id is None:
+            isbn = identifiers.get('isbn', None)
+            if isbn is not None:
+                ozon_id = self.cached_isbn_to_identifier(isbn)
+        if ozon_id is not None:
+            url = self.cached_identifier_to_cover_url(ozon_id)
+        return url
+    # }}}
+
+    def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # {{{
+        cached_url = self.get_cached_cover_url(identifiers)
+        if cached_url is None:
+            log.debug('No cached cover found, running identify')
+            rq = Queue()
+            self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers)
+            if abort.is_set():
+                return
+            results = []
+            while True:
+                try:
+                    results.append(rq.get_nowait())
+                except Empty:
+                    break
+            results.sort(key=self.identify_results_keygen(title=title, authors=authors, identifiers=identifiers))
+            for mi in results:
+                cached_url = self.get_cached_cover_url(mi.identifiers)
+                if cached_url is not None:
+                    break
+       
+        if cached_url is None:
+            log.info('No cover found')
+            return
+
+        if abort.is_set():
+            return
+        
+        log.debug('Downloading cover from:', cached_url)
+        try:
+            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
+            if cdata:
+                result_queue.put((self, cdata))
+        except Exception as e:
+            log.exception(u'Failed to download cover from: %s'%cached_url)
+            return as_unicode(e)
+    # }}}
+    
+    def get_book_details(self, log, metadata, timeout): # {{{
+        url = self.get_book_url(metadata.get_identifiers())[2]
+        
+        raw = self.browser.open_novisit(url, timeout=timeout).read()
+        doc = html.fromstring(raw)
+
+        # series
+        xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)'
+        series = doc.xpath(xpt)
+        if series:
+            metadata.series = series
+
+        xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")'
+        isbn_str = doc.xpath(xpt)
+        if isbn_str:
+            all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
+            if all_isbns:
+                metadata.all_isbns = all_isbns
+                metadata.isbn = all_isbns[0]
+
+        xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]'
+        publishers = doc.xpath(xpt)
+        if publishers:
+            metadata.publisher = publishers[0].text
+
+            xpt = u'string(../text()[contains(., "г.")])'
+            yearIn = publishers[0].xpath(xpt)
+            if yearIn:
+                matcher = re.search(r'\d{4}', yearIn)
+                if matcher:
+                    year = int(matcher.group(0))
+                    # only year is available, so use 1-st of Jan
+                    metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py 
+                    #metadata.pubdate = datetime(year, 1, 1)
+            xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
+            displLang = publishers[0].xpath(xpt)
+            lang_code =_translageLanguageToCode(displLang)
+            if lang_code:
+                metadata.language = lang_code
+        
+        # overwrite comments from HTML if any
+        # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
+        xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
+              u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
+        comment_elem = doc.xpath(xpt)
+        if comment_elem:
+            comments = unicode(etree.tostring(comment_elem[0]))
+            if comments:
+                # cleanup root tag, TODO: remove tags like object/embeded
+                comments = re.sub(r'^<td.+?>|</td>.+?$', u'', comments).strip()
+                if comments:
+                    metadata.comments = comments
+        else:
+            log.debug('No book description found in HTML')
+    # }}}
+
+def _quoteString(str): # {{{
+    return '"' + str + '"' if str and str.find(' ') != -1 else str 
+# }}}
+
+# TODO: make customizable
+def _translateToBigCoverUrl(coverUrl): # {{{
+    # http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif
+    # http://www.ozon.ru/multimedia/books_covers/1002986468.jpg
+    
+    m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl)
+    if m:
+        coverUrl = m.group(1) + m.group(2) + 'jpg'
+    return coverUrl
+# }}}
+
+def _get_affiliateId(): # {{{
+    import random
+    
+    aff_id = 'romuk'
+    # Use Kovid's affiliate id 30% of the time.
+    if random.randint(1, 10) in (1, 2, 3):
+        aff_id = 'kovidgoyal'
+    return aff_id 
+# }}}
+
+# for now only RUS ISBN are supported
+#http://ru.wikipedia.org/wiki/ISBN_российских_издательств
+isbn_pat = re.compile(r"""
+    ^
+    (\d{3})?            # match GS1 Prefix for ISBN13
+    (5)                 # group identifier for rRussian-speaking countries
+    (                   # begin variable length for Publisher
+        [01]\d{1}|      # 2x
+        [2-6]\d{2}|     # 3x
+        7\d{3}|         # 4x (starting with 7)
+        8[0-4]\d{2}|    # 4x (starting with 8)
+        9[2567]\d{2}|   # 4x (starting with 9)
+        99[26]\d{1}|    # 4x (starting with 99)
+        8[5-9]\d{3}|    # 5x (starting with 8)
+        9[348]\d{3}|    # 5x (starting with 9)
+        900\d{2}|       # 5x (starting with 900)
+        91[0-8]\d{2}|   # 5x (starting with 91)
+        90[1-9]\d{3}|   # 6x (starting with 90)
+        919\d{3}|       # 6x (starting with 919)
+        99[^26]\d{4}    # 7x (starting with 99)
+    )                   # end variable length for Publisher
+    (\d+)               # Title
+    ([\dX])             # Check digit
+    $
+""", re.VERBOSE)
+
+def _format_isbn(log, isbn):  # {{{
+    res = check_isbn(isbn)
+    if res:
+        m = isbn_pat.match(res)
+        if m:
+            res = '-'.join([g for g in m.groups() if g])
+        else:
+            log.error('cannot format isbn %s'%isbn)
+    return res
+# }}}
+
+def _translageLanguageToCode(displayLang): # {{{
+    displayLang = unicode(displayLang).strip() if displayLang else None
+    langTbl = {  None: 'ru',
+                u'Немецкий': 'de', 
+                u'Английский': 'en', 
+                u'Французский': 'fr',
+                u'Итальянский': 'it', 
+                u'Испанский': 'es',
+                u'Китайский': 'zh',
+                u'Японский': 'ja' }
+    return langTbl.get(displayLang, None)
+# }}}
+
+if __name__ == '__main__': # tests {{{
+    # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
+    # comment some touched_fields before run thoses tests
+    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
+            title_test, authors_test, isbn_test)
+
+
+    test_identify_plugin(Ozon.name,
+        [
+         
+            (
+                {'identifiers':{'isbn': '9785916572629'} },
+                [title_test(u'На все четыре стороны', exact=True),
+                 authors_test([u'А. А. Гилл'])]
+            ),
+            (
+                {'identifiers':{}, 'title':u'Der Himmel Kennt Keine Gunstlinge',
+                    'authors':[u'Erich Maria Remarque']},
+                [title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True),
+                 authors_test([u'Erich Maria Remarque'])]
+            ),
+            (
+                {'identifiers':{ }, 'title':u'Метро 2033',
+                    'authors':[u'Дмитрий Глуховский']},
+                [title_test(u'Метро 2033', exact=False)]
+            ),
+            (
+                {'identifiers':{'isbn': '9785170727209'}, 'title':u'Метро 2033',
+                    'authors':[u'Дмитрий Глуховский']},
+                [title_test(u'Метро 2033', exact=True),
+                    authors_test([u'Дмитрий Глуховский']),
+                    isbn_test('9785170727209')]
+            ),
+            (
+                {'identifiers':{'isbn': '5-699-13613-4'}, 'title':u'Метро 2033',
+                    'authors':[u'Дмитрий Глуховский']},
+                [title_test(u'Метро 2033', exact=True),
+                 authors_test([u'Дмитрий Глуховский'])]
+            ),
+            (
+                {'identifiers':{}, 'title':u'Метро',
+                    'authors':[u'Глуховский']},
+                [title_test(u'Метро', exact=False)]
+            ),
+    ])
+# }}}
@@ -50,6 +50,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
    def search(self, query, max_results=10, timeout=60):
        search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
                    'searchText=%s&searchContext=ebook' % urllib2.quote(query)
+        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
                    
        counter = max_results
        br = browser()
@@ -60,17 +61,14 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
                if counter <= 0:
                    break
                counter -= 1
-                        
-                xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
-                
+
                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format('ID'))
                s.title = data.xpath(xp_template.format('Name'))
                s.author = data.xpath(xp_template.format('Author'))
                s.price = data.xpath(xp_template.format('Price'))
                s.cover_url = data.xpath(xp_template.format('Picture'))
-                if re.match("^\d+?\.\d+?$", s.price):
-                    s.price = u'{:.2F} руб.'.format(float(s.price))
+                s.price = format_price_in_RUR(s.price)
                yield s

    def get_details(self, search_result, timeout=60):
@@ -97,7 +95,22 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
                # unfortunately no direct links to download books (only buy link)
                # search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item)
        return result
-    
+
+def format_price_in_RUR(price):
+    '''
+    Try to format price according ru locale: '12 212,34 руб.'
+    @param price: price in format like 25.99
+    @return: formatted price if possible otherwise original value
+    @rtype: unicode
+    '''
+    if price and re.match("^\d*?\.\d*?$", price):
+        try:
+            price = u'{:,.2F} руб.'.format(float(price))
+            price = price.replace(',', ' ').replace('.', ',', 1)
+        except:
+            pass
+    return price
+
 def _parse_ebook_formats(formatsStr):
    '''
    Creates a list with displayable names of the formats