diff --git a/src/calibre/ebooks/metadata/sources/ozon.py b/src/calibre/ebooks/metadata/sources/ozon.py index 3845ebf97b..ebb104818f 100644 --- a/src/calibre/ebooks/metadata/sources/ozon.py +++ b/src/calibre/ebooks/metadata/sources/ozon.py @@ -54,30 +54,35 @@ class Ozon(Source): # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) - # TODO: format isbn! - qItems = set([isbn, title]) - if authors: - qItems |= frozenset(authors) - qItems.discard(None) - qItems.discard('') - qItems = map(_quoteString, qItems) - - q = u' '.join(qItems).strip() - log.info(u'search string: ' + q) - - if isinstance(q, unicode): - q = q.encode('utf-8') - if not q: - return None - - search_url += quote_plus(q) + ozonid = identifiers.get('ozon', None) + + unk = unicode(_('Unknown')).upper() + if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid: + qItems = set([isbn, title]) + if authors: + qItems |= frozenset(authors) + qItems.discard(None) + qItems.discard('') + qItems = map(_quoteString, qItems) + + q = u' '.join(qItems).strip() + log.info(u'search string: ' + q) + + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + + search_url += quote_plus(q) + else: + search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid + log.debug(u'search url: %r'%search_url) - return search_url # }}} def identify(self, log, result_queue, abort, title=None, authors=None, - identifiers={}, timeout=30): # {{{ + identifiers={}, timeout=60): # {{{ from lxml import etree from calibre.ebooks.chardet import xml_to_unicode @@ -99,7 +104,7 @@ class Ozon(Source): try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser) - entries = feed.xpath('//*[local-name() = "SearchItems"]') + entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]') if 
entries: metadata = self.get_metadata(log, entries, title, authors, identifiers) self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout) @@ -112,8 +117,8 @@ class Ozon(Source): def get_metadata(self, log, entries, title, authors, identifiers): # {{{ # some book titles have extra characters like this # TODO: make a twick - reRemoveFromTitle = None - #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') + #reRemoveFromTitle = None + reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') title = unicode(title).upper() if title else '' if reRemoveFromTitle: @@ -163,7 +168,7 @@ class Ozon(Source): metadata.append(mi) #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) else: - log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors)) + log.debug(u'skipped metadata %s %s. (does not match the query)'%(unicode(mi.title), mi.authors)) return metadata # }}} @@ -301,7 +306,7 @@ class Ozon(Source): if series: metadata.series = series - xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))' + xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])' isbn_str = doc.xpath(xpt) if isbn_str: all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)] @@ -326,7 +331,7 @@ class Ozon(Source): # can be set before from xml search responce if not metadata.pubdate: - xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])' + xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))' yearIn = doc.xpath(xpt) if yearIn: matcher = re.search(r'\d{4}', yearIn) @@ -334,17 +339,20 @@ class Ozon(Source): metadata.pubdate = toPubdate(log, matcher.group(0)) # overwrite comments from HTML if any - xpt = u'//table[@id="detail_description"]//tr/td' + xpt = u'//*[@id="detail_description"]//*[contains(text(), "От 
производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' + from lxml.etree import ElementBase comment_elem = doc.xpath(xpt) if comment_elem: - comments = unicode(etree.tostring(comment_elem[0], encoding=unicode)) - if comments: - # cleanup root tag, TODO: remove tags like object/embeded - comments = re.sub(ur'\A.*?|.*\Z', u'', comments.strip(), re.MULTILINE).strip() - if comments and (not metadata.comments or len(comments) > len(metadata.comments)): - metadata.comments = comments - else: - log.debug('HTML book description skipped in favour of search service xml responce') + comments = u'' + for node in comment_elem: + if isinstance(node, ElementBase): + comments += unicode(etree.tostring(node, encoding=unicode)) + elif isinstance(node, basestring) and node.strip(): + comments += unicode(node) + u'\n' + if comments and (not metadata.comments or len(comments) > len(metadata.comments)): + metadata.comments = comments + else: + log.debug('HTML book description skipped in favour of search service xml responce') else: log.debug('No book description found in HTML') # }}} @@ -430,7 +438,8 @@ def _translageLanguageToCode(displayLang): # {{{ u'Китайский': 'zh', u'Японский': 'ja', u'Финский' : 'fi', - u'Польский' : 'pl',} + u'Польский' : 'pl', + u'Украинский' : 'uk',} return langTbl.get(displayLang, None) # }}} @@ -454,7 +463,7 @@ def toPubdate(log, yearAsString): # {{{ res = None if yearAsString: try: - res = parse_only_date(yearAsString) + res = parse_only_date(u"01.01." 
+ yearAsString) except: log.error('cannot parse to date %s'%yearAsString) return res diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py index 5d977700c8..b54bf01daf 100644 --- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py +++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py @@ -46,30 +46,37 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): d.set_tags(self.config.get('tags', '')) d.exec_() - - def search(self, query, max_results=10, timeout=60): + def search(self, query, max_results=15, timeout=60): search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\ 'searchText=%s&searchContext=ebook' % urllib2.quote(query) + search_urls = [ search_url ] + + ## add this as the first try if it looks like ozon ID + if re.match("^\d{6,9}$", query): + ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query + search_urls.insert(0, ozon_detail) + xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())' - counter = max_results br = browser() - with closing(br.open(search_url, timeout=timeout)) as f: - raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0] - doc = etree.fromstring(raw) - for data in doc.xpath('//*[local-name() = "SearchItems"]'): - if counter <= 0: - break - counter -= 1 + + for url in search_urls: + with closing(br.open(url, timeout=timeout)) as f: + raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0] + doc = etree.fromstring(raw) + for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'): + if counter <= 0: + break + counter -= 1 - s = SearchResult() - s.detail_item = data.xpath(xp_template.format('ID')) - s.title = data.xpath(xp_template.format('Name')) - s.author = data.xpath(xp_template.format('Author')) - s.price = data.xpath(xp_template.format('Price')) - s.cover_url = data.xpath(xp_template.format('Picture')) - s.price = format_price_in_RUR(s.price) - yield s 
+ s = SearchResult() + s.detail_item = data.xpath(xp_template.format('ID')) + s.title = data.xpath(xp_template.format('Name')) + s.author = data.xpath(xp_template.format('Author')) + s.price = data.xpath(xp_template.format('Price')) + s.cover_url = data.xpath(xp_template.format('Picture')) + s.price = format_price_in_RUR(s.price) + yield s def get_details(self, search_result, timeout=60): url = self.shop_url + '/context/detail/id/' + urllib2.quote(search_result.detail_item) @@ -97,6 +104,16 @@ class OzonRUStore(BasicStoreConfig, StorePlugin): search_result.formats = ', '.join(_parse_ebook_formats(formats)) # unfortunately no direct links to download books (only buy link) # search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item) + + #

21500 руб.

+ # + # + + # if the price is not in the search result (the ID search case) + if not search_result.price: + price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())') + search_result.price = format_price_in_RUR(price) + return result def format_price_in_RUR(price):