mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update OZON.ru plugin to reflect site changes
This commit is contained in:
parent
91e69ce7e3
commit
f6ff649dc4
@ -1,5 +1,8 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import (unicode_literals, division, absolute_import, print_function)
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import (unicode_literals, division, absolute_import,
|
||||||
|
print_function)
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011-2013 Roman Mukhin <ramses_ru at hotmail.com>'
|
__copyright__ = '2011-2013 Roman Mukhin <ramses_ru at hotmail.com>'
|
||||||
@ -9,20 +12,22 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
from Queue import Queue, Empty
|
from Queue import Queue, Empty
|
||||||
|
from lxml import etree, html
|
||||||
|
|
||||||
from calibre import as_unicode
|
from calibre import as_unicode
|
||||||
from calibre.ebooks.metadata import check_isbn
|
from calibre.ebooks.metadata import check_isbn
|
||||||
from calibre.ebooks.metadata.sources.base import Source, Option
|
from calibre.ebooks.metadata.sources.base import Source, Option
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
|
|
||||||
|
|
||||||
class Ozon(Source):
|
class Ozon(Source):
|
||||||
name = 'OZON.ru'
|
name = 'OZON.ru'
|
||||||
description = _('Downloads metadata and covers from OZON.ru')
|
description = _('Downloads metadata and covers from OZON.ru (updated)')
|
||||||
|
|
||||||
capabilities = frozenset(['identify', 'cover'])
|
capabilities = frozenset(['identify', 'cover'])
|
||||||
|
|
||||||
touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
|
touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
|
||||||
'publisher', 'pubdate', 'comments', 'series', 'rating', 'languages'])
|
'publisher', 'pubdate', 'comments', 'series', 'rating', 'languages'])
|
||||||
# Test purpose only, test function does not like when sometimes some filed are empty
|
# Test purpose only, test function does not like when sometimes some filed are empty
|
||||||
# touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
|
# touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
|
||||||
# 'publisher', 'pubdate', 'comments'])
|
# 'publisher', 'pubdate', 'comments'])
|
||||||
@ -33,17 +38,18 @@ class Ozon(Source):
|
|||||||
ozon_url = 'http://www.ozon.ru'
|
ozon_url = 'http://www.ozon.ru'
|
||||||
|
|
||||||
# match any ISBN10/13. From "Regular Expressions Cookbook"
|
# match any ISBN10/13. From "Regular Expressions Cookbook"
|
||||||
isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|'\
|
isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|' \
|
||||||
'[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?'\
|
'[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?' \
|
||||||
'(?:[0-9]+[- ]?){2}[0-9X]'
|
'(?:[0-9]+[- ]?){2}[0-9X]'
|
||||||
isbnRegex = re.compile(isbnPattern)
|
isbnRegex = re.compile(isbnPattern)
|
||||||
|
|
||||||
optkey_strictmatch = 'strict_result_match'
|
optkey_strictmatch = 'strict_result_match'
|
||||||
options = (
|
options = (
|
||||||
Option(optkey_strictmatch, 'bool', False,
|
Option(optkey_strictmatch, 'bool', False,
|
||||||
_('Filter out less relevant hits from the search results'),
|
_('Filter out less relevant hits from the search results'),
|
||||||
_('Improve search result by removing less relevant hits. It can be useful to refine the search when there are many matches')),
|
_(
|
||||||
)
|
'Improve search result by removing less relevant hits. It can be useful to refine the search when there are many matches')),
|
||||||
|
)
|
||||||
|
|
||||||
def get_book_url(self, identifiers): # {{{
|
def get_book_url(self, identifiers): # {{{
|
||||||
import urllib2
|
import urllib2
|
||||||
@ -54,6 +60,7 @@ class Ozon(Source):
|
|||||||
url = '{}/context/detail/id/{}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
|
url = '{}/context/detail/id/{}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
|
||||||
res = ('ozon', ozon_id, url)
|
res = ('ozon', ozon_id, url)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
|
||||||
@ -65,25 +72,30 @@ class Ozon(Source):
|
|||||||
# for ozon.ru search we have to format ISBN with '-'
|
# for ozon.ru search we have to format ISBN with '-'
|
||||||
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
||||||
if isbn and '-' not in isbn:
|
if isbn and '-' not in isbn:
|
||||||
log.error("%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)"
|
log.error(
|
||||||
% (self.name, isbn))
|
"%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)"
|
||||||
|
% (self.name, isbn))
|
||||||
isbn = None
|
isbn = None
|
||||||
|
|
||||||
ozonid = identifiers.get('ozon', None)
|
ozonid = identifiers.get('ozon', None)
|
||||||
|
|
||||||
qItems = set([ozonid, isbn])
|
qItems = set([ozonid, isbn])
|
||||||
|
|
||||||
unk = unicode(_('Unknown')).upper()
|
# Added Russian variant of 'Unknown'
|
||||||
|
unk = [unicode(_('Unknown')).upper(), unicode(_('Неизв.')).upper()]
|
||||||
|
|
||||||
if title and title != unk:
|
if title and title not in unk:
|
||||||
qItems.add(title)
|
qItems.add(title)
|
||||||
if authors and authors != [unk]:
|
|
||||||
qItems |= frozenset(authors)
|
if authors:
|
||||||
|
for auth in authors:
|
||||||
|
if auth.upper() not in unk:
|
||||||
|
qItems.add(auth)
|
||||||
|
|
||||||
qItems.discard(None)
|
qItems.discard(None)
|
||||||
qItems.discard('')
|
qItems.discard('')
|
||||||
qItems = map(_quoteString, qItems)
|
|
||||||
searchText = u' '.join(qItems).strip()
|
searchText = u' '.join(qItems).strip()
|
||||||
|
|
||||||
if isinstance(searchText, unicode):
|
if isinstance(searchText, unicode):
|
||||||
searchText = searchText.encode('utf-8')
|
searchText = searchText.encode('utf-8')
|
||||||
if not searchText:
|
if not searchText:
|
||||||
@ -92,12 +104,13 @@ class Ozon(Source):
|
|||||||
search_url += quote_plus(searchText)
|
search_url += quote_plus(searchText)
|
||||||
log.debug(u'search url: %r' % search_url)
|
log.debug(u'search url: %r' % search_url)
|
||||||
return search_url
|
return search_url
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None,
|
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||||
identifiers={}, timeout=90): # {{{
|
identifiers={}, timeout=90): # {{{
|
||||||
from lxml import html
|
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
from HTMLParser import HTMLParser
|
||||||
|
|
||||||
if not self.is_configured():
|
if not self.is_configured():
|
||||||
return
|
return
|
||||||
@ -110,66 +123,57 @@ class Ozon(Source):
|
|||||||
try:
|
try:
|
||||||
raw = self.browser.open_novisit(query).read()
|
raw = self.browser.open_novisit(query).read()
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception(u'Failed to make identify query: %r' % query)
|
log.exception(u'Failed to make identify query: %r' % query)
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
||||||
entries = doc.xpath(u'//div[@class="SearchResults"]//div[@itemprop="itemListElement"]')
|
entries_block = doc.xpath(u'//div[@class="bSearchResult"]')
|
||||||
|
|
||||||
if entries:
|
if entries_block:
|
||||||
|
entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
|
||||||
# for entry in entries:
|
# for entry in entries:
|
||||||
# log.debug('entries %s' % etree.tostring(entry))
|
# log.debug('entries %s' % entree.tostring(entry))
|
||||||
metadata = self.get_metadata(log, entries, title, authors, identifiers)
|
metadata = self.get_metadata(log, entries, title, authors, identifiers)
|
||||||
self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
|
self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
|
||||||
else:
|
else:
|
||||||
mainentry = doc.xpath(u'//div[contains(@class, "details-main")]')
|
# Redirect page: trying to extract ozon_id from javascript data
|
||||||
if mainentry:
|
h = HTMLParser()
|
||||||
metadata = self.get_metadata_from_detail(log, mainentry[0], title, authors, identifiers)
|
entry_string = (h.unescape(unicode(etree.tostring(doc, pretty_print=True))))
|
||||||
ozon_id = unicode(metadata.identifiers['ozon'])
|
id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
|
||||||
self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, {ozon_id : doc})
|
# result containing ozon_id and entry_title
|
||||||
|
entry_info = re.search(id_title_pat, entry_string)
|
||||||
|
ozon_id = entry_info.group(1) if entry_info else None
|
||||||
|
entry_title = entry_info.group(2) if entry_info else None
|
||||||
|
|
||||||
|
if ozon_id:
|
||||||
|
metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
|
||||||
|
identifiers['ozon'] = ozon_id
|
||||||
|
self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={})
|
||||||
else:
|
else:
|
||||||
log.error('No SearchResults/itemListElement entries in Ozon.ru responce found')
|
log.error('No SearchResults in Ozon.ru response found')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception('Failed to parse identify results')
|
log.exception('Failed to parse identify results')
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_metadata_from_detail(self, log, entry, title, authors, identifiers): # {{{
|
def to_metadata_for_single_entry(self, log, ozon_id, title, authors): # {{{
|
||||||
title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())'))
|
|
||||||
# log.debug(u'Tile (from_detail): -----> %s' % title)
|
|
||||||
|
|
||||||
author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
|
# parsing javascript data from the redirect page
|
||||||
# log.debug(u'Author (from_detail): -----> %s' % author)
|
mi = Metadata(title, authors)
|
||||||
|
mi.identifiers = {'ozon': ozon_id}
|
||||||
norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
|
|
||||||
mi = Metadata(title, norm_authors)
|
|
||||||
|
|
||||||
ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(//link[@rel="canonical"][contains(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
|
|
||||||
if ozon_id:
|
|
||||||
# log.debug(u'ozon_id (from_detail): -----> %s' % ozon_id)
|
|
||||||
mi.identifiers = {'ozon':ozon_id}
|
|
||||||
|
|
||||||
mi.ozon_cover_url = None
|
|
||||||
cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
|
|
||||||
if cover:
|
|
||||||
mi.ozon_cover_url = _translateToBigCoverUrl(cover)
|
|
||||||
# log.debug(u'mi.ozon_cover_url (from_detail): -----> %s' % mi.ozon_cover_url)
|
|
||||||
|
|
||||||
mi.rating = self.get_rating(entry)
|
|
||||||
# log.debug(u'mi.rating (from_detail): -----> %s' % mi.rating)
|
|
||||||
if not mi.rating:
|
|
||||||
log.debug('No rating (from_detail) found. ozon_id:%s'%ozon_id)
|
|
||||||
|
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
||||||
# some book titles have extra characters like this
|
# some book titles have extra characters like this
|
||||||
# TODO: make a twick
|
|
||||||
# reRemoveFromTitle = None
|
|
||||||
reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
||||||
|
|
||||||
title = unicode(title).upper() if title else ''
|
title = unicode(title).upper() if title else ''
|
||||||
@ -177,6 +181,7 @@ class Ozon(Source):
|
|||||||
title = reRemoveFromTitle.sub('', title)
|
title = reRemoveFromTitle.sub('', title)
|
||||||
authors = map(_normalizeAuthorNameWithInitials,
|
authors = map(_normalizeAuthorNameWithInitials,
|
||||||
map(unicode.upper, map(unicode, authors))) if authors else None
|
map(unicode.upper, map(unicode, authors))) if authors else None
|
||||||
|
|
||||||
ozon_id = identifiers.get('ozon', None)
|
ozon_id = identifiers.get('ozon', None)
|
||||||
# log.debug(u'ozonid: ', ozon_id)
|
# log.debug(u'ozonid: ', ozon_id)
|
||||||
|
|
||||||
@ -200,8 +205,10 @@ class Ozon(Source):
|
|||||||
relevance = 0
|
relevance = 0
|
||||||
if title:
|
if title:
|
||||||
mititle = unicode(mi.title).upper() if mi.title else ''
|
mititle = unicode(mi.title).upper() if mi.title else ''
|
||||||
|
|
||||||
if reRemoveFromTitle:
|
if reRemoveFromTitle:
|
||||||
mititle = reRemoveFromTitle.sub('', mititle)
|
mititle = reRemoveFromTitle.sub('', mititle)
|
||||||
|
|
||||||
if title in mititle:
|
if title in mititle:
|
||||||
relevance += 3
|
relevance += 3
|
||||||
elif mititle:
|
elif mititle:
|
||||||
@ -212,6 +219,8 @@ class Ozon(Source):
|
|||||||
|
|
||||||
if authors:
|
if authors:
|
||||||
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
|
miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
|
||||||
|
# log.debug('Authors %s vs miauthors %s'%(','.join(authors), ','.join(miauthors)))
|
||||||
|
|
||||||
if (in_authors(authors, miauthors)):
|
if (in_authors(authors, miauthors)):
|
||||||
relevance += 3
|
relevance += 3
|
||||||
elif u''.join(miauthors):
|
elif u''.join(miauthors):
|
||||||
@ -228,11 +237,13 @@ class Ozon(Source):
|
|||||||
if relevance < 0:
|
if relevance < 0:
|
||||||
relevance = 0
|
relevance = 0
|
||||||
return relevance
|
return relevance
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
strict_match = self.prefs[self.optkey_strictmatch]
|
strict_match = self.prefs[self.optkey_strictmatch]
|
||||||
metadata = []
|
metadata = []
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
|
||||||
mi = self.to_metadata(log, entry)
|
mi = self.to_metadata(log, entry)
|
||||||
relevance = calc_source_relevance(mi)
|
relevance = calc_source_relevance(mi)
|
||||||
# TODO findout which is really used
|
# TODO findout which is really used
|
||||||
@ -240,15 +251,19 @@ class Ozon(Source):
|
|||||||
mi.relevance_in_source = relevance
|
mi.relevance_in_source = relevance
|
||||||
|
|
||||||
if not strict_match or relevance > 0:
|
if not strict_match or relevance > 0:
|
||||||
metadata.append(mi)
|
# getting rid of a random book that shows up in results
|
||||||
# log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
|
if not (mi.title == 'Unknown'):
|
||||||
|
metadata.append(mi)
|
||||||
|
# log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
|
||||||
else:
|
else:
|
||||||
log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
|
log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
|
||||||
% (mi.title, u' '.join(mi.authors), relevance))
|
% (mi.title, u' '.join(mi.authors), relevance))
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict={}): # {{{
|
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict={}): # {{{
|
||||||
|
|
||||||
req_isbn = identifiers.get('isbn', None)
|
req_isbn = identifiers.get('isbn', None)
|
||||||
|
|
||||||
for mi in metadata:
|
for mi in metadata:
|
||||||
@ -258,7 +273,8 @@ class Ozon(Source):
|
|||||||
ozon_id = mi.identifiers['ozon']
|
ozon_id = mi.identifiers['ozon']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.get_book_details(log, mi, timeout, cachedPagesDict[ozon_id] if cachedPagesDict and ozon_id in cachedPagesDict else None)
|
self.get_book_details(log, mi, timeout, cachedPagesDict[
|
||||||
|
ozon_id] if cachedPagesDict and ozon_id in cachedPagesDict else None)
|
||||||
except:
|
except:
|
||||||
log.exception(u'Failed to get details for metadata: %s' % mi.title)
|
log.exception(u'Failed to get details for metadata: %s' % mi.title)
|
||||||
|
|
||||||
@ -275,45 +291,54 @@ class Ozon(Source):
|
|||||||
|
|
||||||
self.clean_downloaded_metadata(mi)
|
self.clean_downloaded_metadata(mi)
|
||||||
result_queue.put(mi)
|
result_queue.put(mi)
|
||||||
|
|
||||||
except:
|
except:
|
||||||
log.exception(u'Failed to get details for metadata: %s' % mi.title)
|
log.exception(u'Failed to get details for metadata: %s' % mi.title)
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def to_metadata(self, log, entry): # {{{
|
def to_metadata(self, log, entry): # {{{
|
||||||
title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
|
title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
|
||||||
# log.debug(u'Tile: -----> %s' % title)
|
# log.debug(u'Title: -----> %s' % title)
|
||||||
|
|
||||||
author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
|
author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")])'))
|
||||||
# log.debug(u'Author: -----> %s' % author)
|
# log.debug(u'Author: -----> %s' % author)
|
||||||
|
|
||||||
norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
|
norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
|
||||||
mi = Metadata(title, norm_authors)
|
mi = Metadata(title, norm_authors)
|
||||||
|
|
||||||
ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
|
ozon_id = entry.get('data-href').split('/')[-2]
|
||||||
|
|
||||||
if ozon_id:
|
if ozon_id:
|
||||||
mi.identifiers = {'ozon':ozon_id}
|
mi.identifiers = {'ozon': ozon_id}
|
||||||
# log.debug(u'ozon_id: -----> %s' % ozon_id)
|
# log.debug(u'ozon_id: -----> %s' % ozon_id)
|
||||||
|
|
||||||
mi.ozon_cover_url = None
|
mi.ozon_cover_url = None
|
||||||
cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
|
cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
|
||||||
# log.debug(u'cover: -----> %s' % cover)
|
log.debug(u'cover: -----> %s' % cover)
|
||||||
if cover:
|
if cover:
|
||||||
mi.ozon_cover_url = _translateToBigCoverUrl(cover)
|
mi.ozon_cover_url = _translateToBigCoverUrl(cover)
|
||||||
# log.debug(u'mi.ozon_cover_url: -----> %s' % mi.ozon_cover_url)
|
# log.debug(u'mi.ozon_cover_url: -----> %s' % mi.ozon_cover_url)
|
||||||
|
|
||||||
pub_year = None
|
pub_year = None
|
||||||
if pub_year:
|
pub_year_block = entry.xpath(u'.//div[@class="bOneTileProperty"]/text()')
|
||||||
mi.pubdate = toPubdate(log, pub_year)
|
year_pattern = re.compile('\d{4}')
|
||||||
# log.debug('pubdate %s' % mi.pubdate)
|
if pub_year_block:
|
||||||
|
pub_year = re.search(year_pattern, pub_year_block[0])
|
||||||
|
if pub_year:
|
||||||
|
mi.pubdate = toPubdate(log, pub_year.group())
|
||||||
|
# log.debug('pubdate %s' % mi.pubdate)
|
||||||
|
|
||||||
mi.rating = self.get_rating(entry)
|
mi.rating = self.get_rating(log, entry)
|
||||||
# if not mi.rating:
|
# if not mi.rating:
|
||||||
# log.debug('No rating found. ozon_id:%s'%ozon_id)
|
# log.debug('No rating found. ozon_id:%s'%ozon_id)
|
||||||
|
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_rating(self, entry): # {{{
|
def get_rating(self, log, entry): # {{{
|
||||||
|
# log.debug(entry)
|
||||||
ozon_rating = None
|
ozon_rating = None
|
||||||
try:
|
try:
|
||||||
xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])'
|
xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])'
|
||||||
@ -335,6 +360,7 @@ class Ozon(Source):
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return ozon_rating
|
return ozon_rating
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_cached_cover_url(self, identifiers): # {{{
|
def get_cached_cover_url(self, identifiers): # {{{
|
||||||
@ -347,9 +373,12 @@ class Ozon(Source):
|
|||||||
if ozon_id is not None:
|
if ozon_id is not None:
|
||||||
url = self.cached_identifier_to_cover_url(ozon_id)
|
url = self.cached_identifier_to_cover_url(ozon_id)
|
||||||
return url
|
return url
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): # {{{
|
def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30,
|
||||||
|
get_best_cover=False): # {{{
|
||||||
|
|
||||||
cached_url = self.get_cached_cover_url(identifiers)
|
cached_url = self.get_cached_cover_url(identifiers)
|
||||||
if cached_url is None:
|
if cached_url is None:
|
||||||
log.debug('No cached cover found, running identify')
|
log.debug('No cached cover found, running identify')
|
||||||
@ -384,11 +413,10 @@ class Ozon(Source):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception(u'Failed to download cover from: %s' % cached_url)
|
log.exception(u'Failed to download cover from: %s' % cached_url)
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_book_details(self, log, metadata, timeout, cachedPage): # {{{
|
def get_book_details(self, log, metadata, timeout, cachedPage): # {{{
|
||||||
from lxml import html, etree
|
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
|
||||||
|
|
||||||
if not cachedPage:
|
if not cachedPage:
|
||||||
url = self.get_book_url(metadata.get_identifiers())[2]
|
url = self.get_book_url(metadata.get_identifiers())[2]
|
||||||
@ -398,37 +426,58 @@ class Ozon(Source):
|
|||||||
fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
||||||
else:
|
else:
|
||||||
fulldoc = cachedPage
|
fulldoc = cachedPage
|
||||||
# log.debug(u'book_details -> using cached page')
|
log.debug(u'book_details -> using cached page')
|
||||||
|
|
||||||
doc = fulldoc.xpath(u'//div[@id="PageContent"][1]')[0]
|
fullString = etree.tostring(fulldoc)
|
||||||
|
doc = fulldoc.xpath(u'//div[@class="bDetailPage"][1]')[0]
|
||||||
xpt_tmpl_base = u'.//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
|
|
||||||
xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)'
|
|
||||||
|
|
||||||
# series Серия/Серии
|
# series Серия/Серии
|
||||||
series = doc.xpath(xpt_tmpl_a % u'Сери')
|
series_elem = doc.xpath(u'//div[contains(text(), "Сери")]')
|
||||||
if series:
|
if series_elem:
|
||||||
metadata.series = series
|
series_text_elem = series_elem[0].getnext()
|
||||||
# log.debug(u'Seria: ', metadata.series)
|
metadata.series = series_text_elem.xpath(u'.//a/text()')[0]
|
||||||
|
log.debug(u'**Seria: ', metadata.series)
|
||||||
|
|
||||||
xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')'
|
isbn = None
|
||||||
isbn_str = doc.xpath(xpt_isbn % u'ISBN')
|
isbn_elem = doc.xpath(u'//div[contains(text(), "ISBN")]')
|
||||||
if isbn_str:
|
if isbn_elem:
|
||||||
# log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str))
|
isbn = isbn_elem[0].getnext().xpath(u'normalize-space(./text())')
|
||||||
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
|
metadata.identifiers['isbn'] = isbn
|
||||||
if all_isbns:
|
|
||||||
metadata.all_isbns = all_isbns
|
# get authors/editors if no authors are available
|
||||||
metadata.isbn = all_isbns[0]
|
authors_joined = ','.join(metadata.authors)
|
||||||
# log.debug(u'ISBN: ', metadata.isbn)
|
|
||||||
|
if authors_joined == '' or authors_joined == "Unknown":
|
||||||
|
authors_from_detail = []
|
||||||
|
editor_elem = doc.xpath(u'//div[contains(text(), "Редактор")]')
|
||||||
|
if editor_elem:
|
||||||
|
editor = editor_elem[0].getnext().xpath(u'.//a/text()')[0]
|
||||||
|
authors_from_detail.append(editor + u' (ред.)')
|
||||||
|
authors_elem = doc.xpath(u'//div[contains(text(), "Автор")]')
|
||||||
|
if authors_elem:
|
||||||
|
authors = authors_elem[0].getnext().xpath(u'.//a/text()') # list
|
||||||
|
authors_from_detail.extend(authors)
|
||||||
|
if len(authors_from_detail) > 0:
|
||||||
|
metadata.authors = authors_from_detail
|
||||||
|
|
||||||
|
cover = doc.xpath('.//img[contains(@class, "fullImage")]/@src')[0]
|
||||||
|
metadata.ozon_cover_url = _translateToBigCoverUrl(cover)
|
||||||
|
|
||||||
|
publishers = None
|
||||||
|
publishers_elem = doc.xpath(u'//div[contains(text(), "Издатель")]')
|
||||||
|
if publishers_elem:
|
||||||
|
publishers_elem = publishers_elem[0].getnext()
|
||||||
|
publishers = publishers_elem.xpath(u'.//a/text()')[0]
|
||||||
|
|
||||||
publishers = doc.xpath(xpt_tmpl_a % u'Издатель')
|
|
||||||
if publishers:
|
if publishers:
|
||||||
metadata.publisher = publishers
|
metadata.publisher = publishers
|
||||||
# log.debug(u'Publisher: ', metadata.publisher)
|
|
||||||
|
|
||||||
xpt_lang = u'substring-after(normalize-space(.//text()[contains(normalize-space(.), "%s")]), ":")'
|
|
||||||
displ_lang = None
|
displ_lang = None
|
||||||
langs = doc.xpath(xpt_lang % u'Язык')
|
langs = None
|
||||||
|
langs_elem = doc.xpath(u'//div[contains(text(), "зык")]')
|
||||||
|
if langs_elem:
|
||||||
|
langs_elem = langs_elem[0].getnext()
|
||||||
|
langs = langs_elem.xpath(u'text()')[0].strip()
|
||||||
if langs:
|
if langs:
|
||||||
lng_splt = langs.split(u',')
|
lng_splt = langs.split(u',')
|
||||||
if lng_splt:
|
if lng_splt:
|
||||||
@ -437,38 +486,26 @@ class Ozon(Source):
|
|||||||
metadata.language = _translageLanguageToCode(displ_lang)
|
metadata.language = _translageLanguageToCode(displ_lang)
|
||||||
# log.debug(u'Language: ', metadata.language)
|
# log.debug(u'Language: ', metadata.language)
|
||||||
|
|
||||||
# can be set before from xml search responce
|
|
||||||
|
# can be set before from xml search response
|
||||||
if not metadata.pubdate:
|
if not metadata.pubdate:
|
||||||
xpt = u'substring-after(' + xpt_isbn + u',";")'
|
pubdate_elem = doc.xpath(u'//div[contains(text(), "Год выпуска")]')
|
||||||
yearIn = doc.xpath(xpt % u'ISBN')
|
if pubdate_elem:
|
||||||
if yearIn:
|
pubYear = pubdate_elem[0].getnext().xpath(u'text()')[0].strip()
|
||||||
matcher = re.search(r'\d{4}', yearIn)
|
if pubYear:
|
||||||
if matcher:
|
matcher = re.search(r'\d{4}', pubYear)
|
||||||
metadata.pubdate = toPubdate(log, matcher.group(0))
|
if matcher:
|
||||||
|
metadata.pubdate = toPubdate(log, matcher.group(0))
|
||||||
# log.debug(u'Pubdate: ', metadata.pubdate)
|
# log.debug(u'Pubdate: ', metadata.pubdate)
|
||||||
|
|
||||||
# overwrite comments from HTML if any
|
|
||||||
xpt = u'.//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa
|
|
||||||
from lxml.etree import ElementBase
|
|
||||||
comment_elem = doc.xpath(xpt)
|
|
||||||
if comment_elem:
|
|
||||||
comments = u''
|
|
||||||
for node in comment_elem:
|
|
||||||
if isinstance(node, ElementBase):
|
|
||||||
comments += unicode(etree.tostring(node, encoding=unicode))
|
|
||||||
elif isinstance(node, basestring) and node.strip():
|
|
||||||
comments += unicode(node) + u'\n'
|
|
||||||
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
|
|
||||||
metadata.comments = comments
|
|
||||||
else:
|
|
||||||
log.debug('HTML book description skipped in favor of search service xml response')
|
|
||||||
else:
|
|
||||||
log.debug('No book description found in HTML')
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
def _quoteString(strToQuote): # {{{
|
# comments, from Javascript data
|
||||||
return '"' + strToQuote + '"' if strToQuote and strToQuote.find(' ') != -1 else strToQuote
|
beginning = fullString.find(u'FirstBlock')
|
||||||
# }}}
|
end = fullString.find(u'}', beginning)
|
||||||
|
comments = unicode(fullString[beginning + 75:end - 1]).decode("unicode-escape")
|
||||||
|
metadata.comments = comments
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
def _verifyISBNIntegrity(log, isbn): # {{{
|
def _verifyISBNIntegrity(log, isbn): # {{{
|
||||||
# Online ISBN-Check http://www.isbn-check.de/
|
# Online ISBN-Check http://www.isbn-check.de/
|
||||||
@ -476,6 +513,8 @@ def _verifyISBNIntegrity(log, isbn): # {{{
|
|||||||
if not res:
|
if not res:
|
||||||
log.error(u'ISBN integrity check failed for "%s"' % isbn)
|
log.error(u'ISBN integrity check failed for "%s"' % isbn)
|
||||||
return res is not None
|
return res is not None
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
# TODO: make customizable
|
# TODO: make customizable
|
||||||
@ -486,6 +525,8 @@ def _translateToBigCoverUrl(coverUrl): # {{{
|
|||||||
if m:
|
if m:
|
||||||
coverUrl = 'http://www.ozon.ru/multimedia/books_covers/' + m.group(1) + '.jpg'
|
coverUrl = 'http://www.ozon.ru/multimedia/books_covers/' + m.group(1) + '.jpg'
|
||||||
return coverUrl
|
return coverUrl
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def _get_affiliateId(): # {{{
|
def _get_affiliateId(): # {{{
|
||||||
@ -496,6 +537,8 @@ def _get_affiliateId(): # {{{
|
|||||||
if random.randint(1, 10) in (1, 2, 3):
|
if random.randint(1, 10) in (1, 2, 3):
|
||||||
aff_id = 'kovidgoyal'
|
aff_id = 'kovidgoyal'
|
||||||
return aff_id
|
return aff_id
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def _format_isbn(log, isbn): # {{{
|
def _format_isbn(log, isbn): # {{{
|
||||||
@ -533,23 +576,27 @@ def _format_isbn(log, isbn): # {{{
|
|||||||
else:
|
else:
|
||||||
log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn)
|
log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def _translageLanguageToCode(displayLang): # {{{
|
def _translageLanguageToCode(displayLang): # {{{
|
||||||
displayLang = unicode(displayLang).strip() if displayLang else None
|
displayLang = unicode(displayLang).strip() if displayLang else None
|
||||||
langTbl = {None: 'ru',
|
langTbl = {None: 'ru',
|
||||||
u'Русский': 'ru',
|
u'Русский': 'ru',
|
||||||
u'Немецкий': 'de',
|
u'Немецкий': 'de',
|
||||||
u'Английский': 'en',
|
u'Английский': 'en',
|
||||||
u'Французский': 'fr',
|
u'Французский': 'fr',
|
||||||
u'Итальянский': 'it',
|
u'Итальянский': 'it',
|
||||||
u'Испанский': 'es',
|
u'Испанский': 'es',
|
||||||
u'Китайский': 'zh',
|
u'Китайский': 'zh',
|
||||||
u'Японский': 'ja',
|
u'Японский': 'ja',
|
||||||
u'Финский' : 'fi',
|
u'Финский': 'fi',
|
||||||
u'Польский' : 'pl',
|
u'Польский': 'pl',
|
||||||
u'Украинский' : 'uk', }
|
u'Украинский': 'uk',}
|
||||||
return langTbl.get(displayLang, None)
|
return langTbl.get(displayLang, None)
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
# [В.П. Колесников | Колесников В.П.]-> В. П. BКолесников
|
# [В.П. Колесников | Колесников В.П.]-> В. П. BКолесников
|
||||||
@ -566,6 +613,8 @@ def _normalizeAuthorNameWithInitials(name): # {{{
|
|||||||
d = matcher.groupdict()
|
d = matcher.groupdict()
|
||||||
res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
|
res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def toPubdate(log, yearAsString): # {{{
|
def toPubdate(log, yearAsString): # {{{
|
||||||
@ -577,59 +626,63 @@ def toPubdate(log, yearAsString): # {{{
|
|||||||
except:
|
except:
|
||||||
log.error('cannot parse to date %s' % yearAsString)
|
log.error('cannot parse to date %s' % yearAsString)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def _listToUnicodePrintStr(lst): # {{{
|
def _listToUnicodePrintStr(lst): # {{{
|
||||||
return u'[' + u', '.join(unicode(x) for x in lst) + u']'
|
return u'[' + u', '.join(unicode(x) for x in lst) + u']'
|
||||||
|
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
if __name__ == '__main__': # tests {{{
|
if __name__ == '__main__': # tests {{{
|
||||||
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
|
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
|
||||||
# comment some touched_fields before run thoses tests
|
# comment some touched_fields before run thoses tests
|
||||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||||
title_test, authors_test, isbn_test)
|
title_test, authors_test, isbn_test)
|
||||||
|
|
||||||
test_identify_plugin(Ozon.name,
|
test_identify_plugin(Ozon.name,
|
||||||
[
|
[
|
||||||
# (
|
# (
|
||||||
# {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
|
# {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
|
||||||
# 'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
|
# 'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
|
||||||
# [title_test(u'Норвежский язык: Практический курс', exact=True),
|
# [title_test(u'Норвежский язык: Практический курс', exact=True),
|
||||||
# authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
|
# authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
|
||||||
# ),
|
# ),
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '9785916572629'}},
|
{'identifiers': {'isbn': '9785916572629'}},
|
||||||
[title_test(u'На все четыре стороны', exact=True),
|
[title_test(u'На все четыре стороны', exact=True),
|
||||||
authors_test([u'А. А. Гилл'])]
|
authors_test([u'А. А. Гилл'])]
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
{'identifiers':{}, 'title':u'Der Himmel Kennt Keine Gunstlinge',
|
{'identifiers': {}, 'title': u'Der Himmel Kennt Keine Gunstlinge',
|
||||||
'authors':[u'Erich Maria Remarque']},
|
'authors': [u'Erich Maria Remarque']},
|
||||||
[title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True),
|
[title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True),
|
||||||
authors_test([u'Erich Maria Remarque'])]
|
authors_test([u'Erich Maria Remarque'])]
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
{'identifiers':{}, 'title':u'Метро 2033',
|
{'identifiers': {}, 'title': u'Метро 2033',
|
||||||
'authors':[u'Дмитрий Глуховский']},
|
'authors': [u'Дмитрий Глуховский']},
|
||||||
[title_test(u'Метро 2033', exact=False)]
|
[title_test(u'Метро 2033', exact=False)]
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '9785170727209'}, 'title':u'Метро 2033',
|
{'identifiers': {'isbn': '9785170727209'}, 'title': u'Метро 2033',
|
||||||
'authors':[u'Дмитрий Глуховский']},
|
'authors': [u'Дмитрий Глуховский']},
|
||||||
[title_test(u'Метро 2033', exact=True),
|
[title_test(u'Метро 2033', exact=True),
|
||||||
authors_test([u'Дмитрий Глуховский']),
|
authors_test([u'Дмитрий Глуховский']),
|
||||||
isbn_test('9785170727209')]
|
isbn_test('9785170727209')]
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
{'identifiers':{'isbn': '5-699-13613-4'}, 'title':u'Метро 2033',
|
{'identifiers': {'isbn': '5-699-13613-4'}, 'title': u'Метро 2033',
|
||||||
'authors':[u'Дмитрий Глуховский']},
|
'authors': [u'Дмитрий Глуховский']},
|
||||||
[title_test(u'Метро 2033', exact=True),
|
[title_test(u'Метро 2033', exact=True),
|
||||||
authors_test([u'Дмитрий Глуховский'])]
|
authors_test([u'Дмитрий Глуховский'])]
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
{'identifiers':{}, 'title':u'Метро',
|
{'identifiers': {}, 'title': u'Метро',
|
||||||
'authors':[u'Глуховский']},
|
'authors': [u'Глуховский']},
|
||||||
[title_test(u'Метро', exact=False)]
|
[title_test(u'Метро', exact=False)]
|
||||||
),
|
),
|
||||||
])
|
])
|
||||||
# }}}
|
# }}}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user