From 7e8ee1be41bb96cf1b3979d8cedd2e658de59e1a Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 18 Aug 2011 17:02:36 -0400 Subject: [PATCH] Changes from Roman Mukhin. --- src/calibre/customize/builtins.py | 3 +- src/calibre/ebooks/metadata/fb2.py | 11 +- src/calibre/ebooks/metadata/sources/ozon.py | 445 ++++++++++++++++++ .../gui2/store/stores/ozon_ru_plugin.py | 25 +- 4 files changed, 476 insertions(+), 8 deletions(-) create mode 100644 src/calibre/ebooks/metadata/sources/ozon.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 75c1d5a21e..6f5a0d4d36 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -590,8 +590,9 @@ from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary from calibre.ebooks.metadata.sources.isbndb import ISBNDB from calibre.ebooks.metadata.sources.overdrive import OverDrive from calibre.ebooks.metadata.sources.douban import Douban +from calibre.ebooks.metadata.sources.ozon import Ozon -plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban] +plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban, Ozon] # }}} diff --git a/src/calibre/ebooks/metadata/fb2.py b/src/calibre/ebooks/metadata/fb2.py index 765ac6d009..b47382697d 100644 --- a/src/calibre/ebooks/metadata/fb2.py +++ b/src/calibre/ebooks/metadata/fb2.py @@ -11,7 +11,7 @@ from functools import partial from base64 import b64decode from lxml import etree from calibre.utils.date import parse_date -from calibre import guess_all_extensions, prints, force_unicode +from calibre import guess_type, guess_all_extensions, prints, force_unicode from calibre.ebooks.metadata import MetaInformation, check_isbn from calibre.ebooks.chardet import xml_to_unicode @@ -147,6 +147,15 @@ def _parse_cover_data(root, imgid, mi): if elm_binary: mimetype = elm_binary[0].get('content-type', 'image/jpeg') mime_extensions = guess_all_extensions(mimetype) + + if not mime_extensions and 
mimetype.startswith('image/'): + prints("WARNING: Unsupported or misspelled mime-type '%s'. "\ + "Trying to recovery mime-type from id_ref='%s'" % (mimetype, imgid) ) + ctype = guess_type(imgid) # -> (mime-type, encoding) + mimetype_fromid = ctype[0] + if mimetype_fromid and mimetype_fromid.startswith('image/'): + mime_extensions = guess_all_extensions(mimetype_fromid) + if mime_extensions: pic_data = elm_binary[0].text if pic_data: diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py new file mode 100644 index 0000000000..e7b0a42763 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -0,0 +1,445 @@ +# -*- coding: utf-8 -*- +from __future__ import (unicode_literals, division, absolute_import, print_function) +from xml.etree.ElementTree import _Element + +__license__ = 'GPL 3' +__copyright__ = '2011, Roman Mukhin ' +__docformat__ = 'restructuredtext en' + +import re +import urllib2 +import datetime +from urllib import quote_plus +from Queue import Queue, Empty +from lxml import etree, html +from lxml.etree import ElementBase +from calibre import as_unicode + +from calibre import prints +from calibre.ebooks.chardet import xml_to_unicode + +from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.book.base import Metadata + +class Ozon(Source): + name = 'OZON.ru' + description = _('Downloads metadata and covers from OZON.ru') + + capabilities = frozenset(['identify', 'cover']) + + touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', + 'publisher', 'pubdate', 'comments', 'series', 'rating', 'language']) + # Test purpose only, test function does not like when sometimes some filed are empty + #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', + # 'publisher', 'pubdate', 'comments']) + + supports_gzip_transfer_encoding = True + has_html_comments = True + + ozon_url 
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
        identifiers={}, timeout=30):
    '''
    Query the OZON.ru search web service and queue the matching books.

    The search URL is built by create_query(). The XML answer is parsed
    leniently (recover=True, no network access) and every "SearchItems"
    element is handed to get_metadata(); the entries it keeps are passed to
    get_all_details() together with result_queue.

    @param log: logger, used for error and exception reporting
    @param result_queue: queue handed on to get_all_details()
    @param abort: abort flag handed on to get_all_details()
    @param title: title to search for, may be None
    @param authors: authors to search for, may be None
    @param identifiers: identifiers to search for; the shared default dict
        is kept for interface compatibility and is not modified here
    @param timeout: timeout used for the search request and handed on to
        get_all_details()
    @return: None if the plugin is not configured or the lookup went
        through, otherwise an error message
    '''
    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
    if not query:
        err = 'Insufficient metadata to construct query'
        log.error(err)
        return err

    try:
        # honour the caller's timeout for the search request as well,
        # like the other requests made by this plugin do
        raw = self.browser.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception(u'Failed to make identify query: %r'%query)
        return as_unicode(e)

    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
        # match on the local name only, so the feed's namespace is ignored
        entries = feed.xpath('//*[local-name() = "SearchItems"]')
        if entries:
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
# }}}
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{
    '''
    Complete every candidate with data from its detail page and queue it.

    For each entry get_book_details() is called; a failure there is logged
    and the entry is still processed with the data it already has. If an
    ISBN was requested and the entry lists ISBNs that do not contain it,
    the entry is skipped. Otherwise its ISBNs and cover URL are cached
    under the OZON id, the metadata is cleaned and put on result_queue.

    @param log: logger
    @param metadata: iterable of metadata objects; each must carry
        identifiers['ozon'] and an ozon_cover_url attribute
    @param abort: abort flag, checked before every entry
    @param result_queue: queue receiving the accepted entries
    @param identifiers: identifiers of the query; only 'isbn' is read
    @param timeout: timeout handed on to get_book_details()
    '''
    req_isbn = identifiers.get('isbn', None)

    for mi in metadata:
        if abort.is_set():
            break
        try:
            ozon_id = mi.identifiers['ozon']

            try:
                self.get_book_details(log, mi, timeout)
            except Exception:
                # best effort: keep the data from the search result, but do
                # not swallow KeyboardInterrupt/SystemExit
                log.exception(u'Failed to get details for metadata: %s'%mi.title)

            all_isbns = getattr(mi, 'all_isbns', [])
            # entries without any ISBN cannot be checked and are let through
            if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
                log.debug(u'skipped, no requested ISBN %s found'%req_isbn)
                continue

            for isbn in all_isbns:
                self.cache_isbn_to_identifier(isbn, ozon_id)

            if mi.ozon_cover_url:
                self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)

            self.clean_downloaded_metadata(mi)
            result_queue.put(mi)
        except Exception:
            # one broken entry must not prevent the remaining ones
            log.exception(u'Failed to get details for metadata: %s'%mi.title)
# }}}
def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # {{{
    '''
    Download the cover of a book and put (self, data) on result_queue.

    The cover URL is taken from the cache. If it is not cached yet,
    identify() is run first and the cache is consulted again for every
    result, in the order given by identify_results_keygen().

    @param log: logger
    @param result_queue: queue receiving the (plugin, cover data) tuple
    @param abort: abort flag, checked after identify and before download
    @param title: title of the book, may be None
    @param authors: authors of the book, may be None
    @param identifiers: identifiers of the book; the shared default dict
        is kept for interface compatibility and is only passed on here
    @param timeout: timeout for the identify run and the cover download
    @return: None, or an error message if the download failed
    '''
    cached_url = self.get_cached_cover_url(identifiers)
    if cached_url is None:
        log.debug('No cached cover found, running identify')
        rq = Queue()
        # pass the caller's timeout on instead of silently falling back to
        # the default of identify()
        self.identify(log, rq, abort, title=title, authors=authors,
                identifiers=identifiers, timeout=timeout)
        if abort.is_set():
            return
        results = []
        while True:
            try:
                results.append(rq.get_nowait())
            except Empty:
                break
        results.sort(key=self.identify_results_keygen(title=title, authors=authors, identifiers=identifiers))
        for mi in results:
            cached_url = self.get_cached_cover_url(mi.identifiers)
            if cached_url is not None:
                break

    if cached_url is None:
        log.info('No cover found')
        return

    if abort.is_set():
        return

    log.debug('Downloading cover from:', cached_url)
    try:
        cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
        if cdata:
            result_queue.put((self, cdata))
    except Exception as e:
        log.exception(u'Failed to download cover from: %s'%cached_url)
        return as_unicode(e)
# }}}
"ISBN")' + isbn_str = doc.xpath(xpt) + if isbn_str: + all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)] + if all_isbns: + metadata.all_isbns = all_isbns + metadata.isbn = all_isbns[0] + + xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]' + publishers = doc.xpath(xpt) + if publishers: + metadata.publisher = publishers[0].text + + xpt = u'string(../text()[contains(., "г.")])' + yearIn = publishers[0].xpath(xpt) + if yearIn: + matcher = re.search(r'\d{4}', yearIn) + if matcher: + year = int(matcher.group(0)) + # only year is available, so use 1-st of Jan + metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py + #metadata.pubdate = datetime(year, 1, 1) + xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")' + displLang = publishers[0].xpath(xpt) + lang_code =_translageLanguageToCode(displLang) + if lang_code: + metadata.language = lang_code + + # overwrite comments from HTML if any + # tr/td[contains(.//text(), "От издателя")] -> does not work, why? 
+ xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\ + u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]' + comment_elem = doc.xpath(xpt) + if comment_elem: + comments = unicode(etree.tostring(comment_elem[0])) + if comments: + # cleanup root tag, TODO: remove tags like object/embeded + comments = re.sub(r'^|.+?$', u'', comments).strip() + if comments: + metadata.comments = comments + else: + log.debug('No book description found in HTML') + # }}} + +def _quoteString(str): # {{{ + return '"' + str + '"' if str and str.find(' ') != -1 else str +# }}} + +# TODO: make customizable +def _translateToBigCoverUrl(coverUrl): # {{{ + # http://www.ozon.ru/multimedia/books_covers/small/1002986468.gif + # http://www.ozon.ru/multimedia/books_covers/1002986468.jpg + + m = re.match(r'^(.+\/)small\/(.+\.).+$', coverUrl) + if m: + coverUrl = m.group(1) + m.group(2) + 'jpg' + return coverUrl +# }}} + +def _get_affiliateId(): # {{{ + import random + + aff_id = 'romuk' + # Use Kovid's affiliate id 30% of the time. + if random.randint(1, 10) in (1, 2, 3): + aff_id = 'kovidgoyal' + return aff_id +# }}} + +# for now only RUS ISBN are supported +#http://ru.wikipedia.org/wiki/ISBN_российских_издательств +isbn_pat = re.compile(r""" + ^ + (\d{3})? 
# match GS1 Prefix for ISBN13 + (5) # group identifier for rRussian-speaking countries + ( # begin variable length for Publisher + [01]\d{1}| # 2x + [2-6]\d{2}| # 3x + 7\d{3}| # 4x (starting with 7) + 8[0-4]\d{2}| # 4x (starting with 8) + 9[2567]\d{2}| # 4x (starting with 9) + 99[26]\d{1}| # 4x (starting with 99) + 8[5-9]\d{3}| # 5x (starting with 8) + 9[348]\d{3}| # 5x (starting with 9) + 900\d{2}| # 5x (starting with 900) + 91[0-8]\d{2}| # 5x (starting with 91) + 90[1-9]\d{3}| # 6x (starting with 90) + 919\d{3}| # 6x (starting with 919) + 99[^26]\d{4} # 7x (starting with 99) + ) # end variable length for Publisher + (\d+) # Title + ([\dX]) # Check digit + $ +""", re.VERBOSE) + +def _format_isbn(log, isbn): # {{{ + res = check_isbn(isbn) + if res: + m = isbn_pat.match(res) + if m: + res = '-'.join([g for g in m.groups() if g]) + else: + log.error('cannot format isbn %s'%isbn) + return res +# }}} + +def _translageLanguageToCode(displayLang): # {{{ + displayLang = unicode(displayLang).strip() if displayLang else None + langTbl = { None: 'ru', + u'Немецкий': 'de', + u'Английский': 'en', + u'Французский': 'fr', + u'Итальянский': 'it', + u'Испанский': 'es', + u'Китайский': 'zh', + u'Японский': 'ja' } + return langTbl.get(displayLang, None) +# }}} + +if __name__ == '__main__': # tests {{{ + # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py + # comment some touched_fields before run thoses tests + from calibre.ebooks.metadata.sources.test import (test_identify_plugin, + title_test, authors_test, isbn_test) + + + test_identify_plugin(Ozon.name, + [ + + ( + {'identifiers':{'isbn': '9785916572629'} }, + [title_test(u'На все четыре стороны', exact=True), + authors_test([u'А. А. 
Гилл'])] + ), + ( + {'identifiers':{}, 'title':u'Der Himmel Kennt Keine Gunstlinge', + 'authors':[u'Erich Maria Remarque']}, + [title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True), + authors_test([u'Erich Maria Remarque'])] + ), + ( + {'identifiers':{ }, 'title':u'Метро 2033', + 'authors':[u'Дмитрий Глуховский']}, + [title_test(u'Метро 2033', exact=False)] + ), + ( + {'identifiers':{'isbn': '9785170727209'}, 'title':u'Метро 2033', + 'authors':[u'Дмитрий Глуховский']}, + [title_test(u'Метро 2033', exact=True), + authors_test([u'Дмитрий Глуховский']), + isbn_test('9785170727209')] + ), + ( + {'identifiers':{'isbn': '5-699-13613-4'}, 'title':u'Метро 2033', + 'authors':[u'Дмитрий Глуховский']}, + [title_test(u'Метро 2033', exact=True), + authors_test([u'Дмитрий Глуховский'])] + ), + ( + {'identifiers':{}, 'title':u'Метро', + 'authors':[u'Глуховский']}, + [title_test(u'Метро', exact=False)] + ), + ]) +# }}} \ No newline at end of file diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py index 0d513f3dfa..866c1c2732 100644 --- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py +++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py @@ -50,6 +50,7 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): def search(self, query, max_results=10, timeout=60): search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\ 'searchText=%s&searchContext=ebook' % urllib2.quote(query) + xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())' counter = max_results br = browser() @@ -60,17 +61,14 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): if counter <= 0: break counter -= 1 - - xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())' - + s = SearchResult() s.detail_item = data.xpath(xp_template.format('ID')) s.title = data.xpath(xp_template.format('Name')) s.author = data.xpath(xp_template.format('Author')) s.price = data.xpath(xp_template.format('Price')) s.cover_url = 
def format_price_in_RUR(price):
    '''
    Try to format price according to the ru locale: '12 212,34 руб.'

    Thousands are separated by a space, the decimal separator is a comma
    and the amount is rounded to two decimal places.

    @param price: price in format like 25.99
    @return: formatted price if possible otherwise original value
    @rtype: unicode
    '''
    # only digits around a single decimal point are accepted
    if price and re.match(r'^\d*\.\d*$', price):
        try:
            amount = float(price)
        except ValueError:
            # e.g. a lone '.', which passes the pattern but is not a number
            return price
        price = u'{:,.2F} руб.'.format(amount)
        # ',' groups the thousands; the first '.' is the decimal point, the
        # one closing 'руб.' must stay
        price = price.replace(',', ' ').replace('.', ',', 1)
    return price