diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index e7b0a42763..3f5f956fae 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -from xml.etree.ElementTree import _Element __license__ = 'GPL 3' __copyright__ = '2011, Roman Mukhin ' @@ -12,10 +11,8 @@ import datetime from urllib import quote_plus from Queue import Queue, Empty from lxml import etree, html -from lxml.etree import ElementBase from calibre import as_unicode -from calibre import prints from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.metadata import check_isbn @@ -27,16 +24,16 @@ class Ozon(Source): description = _('Downloads metadata and covers from OZON.ru') capabilities = frozenset(['identify', 'cover']) - + touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language']) # Test purpose only, test function does not like when sometimes some filed are empty #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', - # 'publisher', 'pubdate', 'comments']) + # 'publisher', 'pubdate', 'comments']) supports_gzip_transfer_encoding = True has_html_comments = True - + ozon_url = 'http://www.ozon.ru' # match any ISBN10/13. From "Regular Expressions Cookbook" @@ -53,11 +50,11 @@ class Ozon(Source): res = ('ozon', ozon_id, url) return res # }}} - + def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ # div_book -> search only books, ebooks and audio books search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' - + isbn = _format_isbn(log, identifiers.get('isbn', None)) # TODO: format isbn! qItems = set([isbn, title]) @@ -66,7 +63,7 @@ class Ozon(Source): qItems.discard(None) qItems.discard('') qItems = map(_quoteString, qItems) - + q = ' '.join(qItems).strip() log.info(u'search string: ' + q) @@ -74,10 +71,10 @@ class Ozon(Source): q = q.encode('utf-8') if not q: return None - + search_url += quote_plus(q) log.debug(u'search url: %r'%search_url) - + return search_url # }}} @@ -93,11 +90,11 @@ class Ozon(Source): try: raw = self.browser.open_novisit(query).read() - + except Exception as e: log.exception(u'Failed to make identify query: %r'%query) return as_unicode(e) - + try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser) @@ -110,14 +107,14 @@ class Ozon(Source): return as_unicode(e) # }}} - + def get_metadata(self, log, entries, title, authors, identifiers): # {{{ title = unicode(title).upper() if title else '' authors = map(unicode.upper, map(unicode, authors)) if authors else None ozon_id = identifiers.get('ozon', None) - + unk = unicode(_('Unknown')).upper() - + if title == unk: title = None @@ -129,7 +126,7 @@ class Ozon(Source): for miauthor in miauthors: if author in miauthor: return True return None - + def ensure_metadata_match(mi): # {{{ match = True if title: @@ -138,13 +135,13 @@ class Ozon(Source): if match and authors: miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else [] match = in_authors(authors, miauthors) - + if match and ozon_id: mozon_id = mi.identifiers['ozon'] match = ozon_id == mozon_id - - return match - + + return match + metadata = [] for i, entry in enumerate(entries): mi = self.to_metadata(log, entry) @@ -159,64 +156,64 @@ class Ozon(Source): def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{ req_isbn = identifiers.get('isbn', None) - + for mi in metadata: if abort.is_set(): break try: ozon_id = mi.identifiers['ozon'] - + try: self.get_book_details(log, mi, timeout) except: log.exception(u'Failed to get details for metadata: %s'%mi.title) - + all_isbns = getattr(mi, 'all_isbns', []) if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns: log.debug(u'skipped, no requested ISBN %s found'%req_isbn) continue - + for isbn in all_isbns: self.cache_isbn_to_identifier(isbn, ozon_id) - + if mi.ozon_cover_url: self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url) - + self.clean_downloaded_metadata(mi) result_queue.put(mi) except: log.exception(u'Failed to get details for metadata: %s'%mi.title) # }}} - + def to_metadata(self, log, entry): # {{{ xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())' - + title = entry.xpath(xp_template.format('Name')) author = entry.xpath(xp_template.format('Author')) mi = Metadata(title, author.split(',')) - + ozon_id = entry.xpath(xp_template.format('ID')) mi.identifiers = {'ozon':ozon_id} - + mi.comments = entry.xpath(xp_template.format('Annotation')) - + mi.ozon_cover_url = None cover = entry.xpath(xp_template.format('Picture')) if cover: - mi.ozon_cover_url = _translateToBigCoverUrl(cover) - + mi.ozon_cover_url = _translateToBigCoverUrl(cover) + rating = entry.xpath(xp_template.format('ClientRatingValue')) if rating: try: #'rating', A floating point number between 0 and 10 - # OZON raion N of 5, calibre of 10, but there is a bug? in identify + # OZON raion N of 5, calibre of 10, but there is a bug? in identify mi.rating = float(rating) except: pass rating return mi # }}} - + def get_cached_cover_url(self, identifiers): # {{{ url = None ozon_id = identifiers.get('ozon', None) @@ -248,14 +245,14 @@ class Ozon(Source): cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break - + if cached_url is None: log.info('No cover found') return if abort.is_set(): return - + log.debug('Downloading cover from:', cached_url) try: cdata = self.browser.open_novisit(cached_url, timeout=timeout).read() @@ -265,10 +262,10 @@ class Ozon(Source): log.exception(u'Failed to download cover from: %s'%cached_url) return as_unicode(e) # }}} - + def get_book_details(self, log, metadata, timeout): # {{{ url = self.get_book_url(metadata.get_identifiers())[2] - + raw = self.browser.open_novisit(url, timeout=timeout).read() doc = html.fromstring(raw) @@ -298,14 +295,14 @@ class Ozon(Source): if matcher: year = int(matcher.group(0)) # only year is available, so use 1-st of Jan - metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py + metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py #metadata.pubdate = datetime(year, 1, 1) xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")' displLang = publishers[0].xpath(xpt) lang_code =_translageLanguageToCode(displLang) if lang_code: metadata.language = lang_code - + # overwrite comments from HTML if any # tr/td[contains(.//text(), "От издателя")] -> does not work, why? xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\ @@ -323,14 +320,14 @@ class Ozon(Source): # }}} def _quoteString(str): # {{{ - return '"' + str + '"' if str and str.find(' ') != -1 else str + return '"' + str + '"' if str and str.find(' ') != -1 else str # }}} # TODO: make customizable def _translateToBigCoverUrl(coverUrl): # {{{ # http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif # http://www.ozon.ru/multimedia/books_covers/1002986468.jpg - + m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl) if m: coverUrl = m.group(1) + m.group(2) + 'jpg' @@ -339,12 +336,12 @@ def _translateToBigCoverUrl(coverUrl): # {{{ def _get_affiliateId(): # {{{ import random - + aff_id = 'romuk' # Use Kovid's affiliate id 30% of the time. if random.randint(1, 10) in (1, 2, 3): aff_id = 'kovidgoyal' - return aff_id + return aff_id # }}} # for now only RUS ISBN are supported @@ -387,10 +384,10 @@ def _format_isbn(log, isbn): # {{{ def _translageLanguageToCode(displayLang): # {{{ displayLang = unicode(displayLang).strip() if displayLang else None langTbl = { None: 'ru', - u'Немецкий': 'de', - u'Английский': 'en', + u'Немецкий': 'de', + u'Английский': 'en', u'Французский': 'fr', - u'Итальянский': 'it', + u'Итальянский': 'it', u'Испанский': 'es', u'Китайский': 'zh', u'Японский': 'ja' } @@ -406,7 +403,7 @@ if __name__ == '__main__': # tests {{{ test_identify_plugin(Ozon.name, [ - + ( {'identifiers':{'isbn': '9785916572629'} }, [title_test(u'На все четыре стороны', exact=True), @@ -442,4 +439,4 @@ if __name__ == '__main__': # tests {{{ [title_test(u'Метро', exact=False)] ), ]) -# }}} \ No newline at end of file +# }}}