mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Fix metadata plugin to download metadata from OZON for website changes. Fixes #1300383 [Searching metadata using Ozon.ru failed with error](https://bugs.launchpad.net/calibre/+bug/1300383)
This commit is contained in:
parent
35e585bd0d
commit
fb970e24c6
@ -5,6 +5,8 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2011-2013 Roman Mukhin <ramses_ru at hotmail.com>'
|
__copyright__ = '2011-2013 Roman Mukhin <ramses_ru at hotmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
# To ensure bugfix and development please donate bitcoins to 1E6CRSLY1uNstcZjLYZBHRVs1CPKbdi4ep
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from Queue import Queue, Empty
|
from Queue import Queue, Empty
|
||||||
|
|
||||||
@ -48,7 +50,8 @@ class Ozon(Source):
|
|||||||
ozon_id = identifiers.get('ozon', None)
|
ozon_id = identifiers.get('ozon', None)
|
||||||
res = None
|
res = None
|
||||||
if ozon_id:
|
if ozon_id:
|
||||||
url = '{}/context/detail/id/{}?partner={}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
|
#no affiliateId is used in search/detail
|
||||||
|
url = '{}/context/detail/id/{}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
|
||||||
res = ('ozon', ozon_id, url)
|
res = ('ozon', ozon_id, url)
|
||||||
return res
|
return res
|
||||||
# }}}
|
# }}}
|
||||||
@ -57,13 +60,13 @@ class Ozon(Source):
|
|||||||
from urllib import quote_plus
|
from urllib import quote_plus
|
||||||
|
|
||||||
# div_book -> search only books, ebooks and audio books
|
# div_book -> search only books, ebooks and audio books
|
||||||
search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='
|
search_url = self.ozon_url + '/?context=search&group=div_book&text='
|
||||||
|
|
||||||
# for ozon.ru search we have to format ISBN with '-'
|
# for ozon.ru search we have to format ISBN with '-'
|
||||||
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
isbn = _format_isbn(log, identifiers.get('isbn', None))
|
||||||
if isbn and not '-' in isbn:
|
if isbn and not '-' in isbn:
|
||||||
log.error("%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)"
|
log.error("%s requires formatted ISBN for search. %s cannot be formated - removed. (only Russian ISBN format is supported now)"
|
||||||
%(self.name, isbn))
|
% (self.name, isbn))
|
||||||
isbn = None
|
isbn = None
|
||||||
|
|
||||||
ozonid = identifiers.get('ozon', None)
|
ozonid = identifiers.get('ozon', None)
|
||||||
@ -87,13 +90,13 @@ class Ozon(Source):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
search_url += quote_plus(searchText)
|
search_url += quote_plus(searchText)
|
||||||
log.debug(u'search url: %r'%search_url)
|
log.debug(u'search url: %r' % search_url)
|
||||||
return search_url
|
return search_url
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None,
|
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||||
identifiers={}, timeout=60): # {{{
|
identifiers={}, timeout=90): # {{{
|
||||||
from lxml import etree
|
from lxml import html, etree
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
if not self.is_configured():
|
if not self.is_configured():
|
||||||
@ -108,26 +111,65 @@ class Ozon(Source):
|
|||||||
raw = self.browser.open_novisit(query).read()
|
raw = self.browser.open_novisit(query).read()
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception(u'Failed to make identify query: %r'%query)
|
log.exception(u'Failed to make identify query: %r' % query)
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
||||||
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
|
entries = doc.xpath(u'//div[@class="SearchResults"]//div[@itemprop="itemListElement"]')
|
||||||
entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
|
|
||||||
if entries:
|
if entries:
|
||||||
|
#for entry in entries:
|
||||||
|
# log.debug('entries %s' % etree.tostring(entry))
|
||||||
metadata = self.get_metadata(log, entries, title, authors, identifiers)
|
metadata = self.get_metadata(log, entries, title, authors, identifiers)
|
||||||
self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
|
self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
|
||||||
|
else:
|
||||||
|
mainentry = doc.xpath(u'//div[contains(@class, "details-main")]')
|
||||||
|
if mainentry:
|
||||||
|
metadata = self.get_metadata_from_detail(log, mainentry[0], title, authors, identifiers)
|
||||||
|
ozon_id = unicode(metadata.identifiers['ozon'])
|
||||||
|
self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, {ozon_id : doc})
|
||||||
|
else:
|
||||||
|
log.error('No SearchResults/itemListElement entries in Ozon.ru responce found')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception('Failed to parse identify results')
|
log.exception('Failed to parse identify results')
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
# }}}
|
||||||
|
|
||||||
|
def get_metadata_from_detail(self, log, entry, title, authors, identifiers):  # {{{
    """Build a Metadata object from the main block of an OZON detail page.

    identify() falls back to this when the response contains no search
    result entries but does contain a ``details-main`` block.

    :param log: logger; only used to report a missing rating.
    :param entry: element exposing an lxml-style ``xpath()`` method.
    :param title: ignored - immediately replaced by the title on the page.
    :param authors: unused here.
    :param identifiers: unused here.
    :return: Metadata carrying title, normalized authors, ``ozon_cover_url``,
        ``rating`` and - only when an id link was found - the ``ozon``
        identifier.
    """
    xp_title = u'normalize-space(.//h1[@itemprop="name"][1]/text())'
    xp_author = u'normalize-space(.//a[contains(@href, "person")][1]/text())'
    xp_ozon_id = u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")'
    xp_cover = u'normalize-space(.//img[1]/@src)'

    title = unicode(entry.xpath(xp_title))
    author = unicode(entry.xpath(xp_author))

    # The author text may hold several comma separated names.
    norm_authors = [_normalizeAuthorNameWithInitials(name.strip())
                    for name in author.split(u',')]
    mi = Metadata(title, norm_authors)

    ozon_id = entry.xpath(xp_ozon_id)
    if ozon_id:
        mi.identifiers = {'ozon': ozon_id}

    # Default to "no cover"; upgrade to the large image when one is present.
    mi.ozon_cover_url = None
    cover = entry.xpath(xp_cover)
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    mi.rating = self.get_rating(entry)
    if not mi.rating:
        log.debug('No rating (from_detail) found. ozon_id:%s' % ozon_id)

    return mi
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
|
||||||
# some book titles have extra characters like this
|
# some book titles have extra characters like this
|
||||||
# TODO: make a twick
|
# TODO: make a twick
|
||||||
#reRemoveFromTitle = None
|
# reRemoveFromTitle = None
|
||||||
reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
|
||||||
|
|
||||||
title = unicode(title).upper() if title else ''
|
title = unicode(title).upper() if title else ''
|
||||||
@ -136,7 +178,7 @@ class Ozon(Source):
|
|||||||
authors = map(_normalizeAuthorNameWithInitials,
|
authors = map(_normalizeAuthorNameWithInitials,
|
||||||
map(unicode.upper, map(unicode, authors))) if authors else None
|
map(unicode.upper, map(unicode, authors))) if authors else None
|
||||||
ozon_id = identifiers.get('ozon', None)
|
ozon_id = identifiers.get('ozon', None)
|
||||||
#log.debug(u'ozonid: ', ozon_id)
|
# log.debug(u'ozonid: ', ozon_id)
|
||||||
|
|
||||||
unk = unicode(_('Unknown')).upper()
|
unk = unicode(_('Unknown')).upper()
|
||||||
|
|
||||||
@ -149,7 +191,7 @@ class Ozon(Source):
|
|||||||
def in_authors(authors, miauthors):
|
def in_authors(authors, miauthors):
|
||||||
for author in authors:
|
for author in authors:
|
||||||
for miauthor in miauthors:
|
for miauthor in miauthors:
|
||||||
#log.debug(u'=> %s <> %s'%(author, miauthor))
|
# log.debug(u'=> %s <> %s'%(author, miauthor))
|
||||||
if author in miauthor:
|
if author in miauthor:
|
||||||
return True
|
return True
|
||||||
return None
|
return None
|
||||||
@ -199,14 +241,14 @@ class Ozon(Source):
|
|||||||
|
|
||||||
if not strict_match or relevance > 0:
|
if not strict_match or relevance > 0:
|
||||||
metadata.append(mi)
|
metadata.append(mi)
|
||||||
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
|
# log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
|
||||||
else:
|
else:
|
||||||
log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
|
log.debug(u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
|
||||||
%(mi.title, u' '.join(mi.authors), relevance))
|
% (mi.title, u' '.join(mi.authors), relevance))
|
||||||
return metadata
|
return metadata
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{
|
def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict = {}): # {{{
|
||||||
req_isbn = identifiers.get('isbn', None)
|
req_isbn = identifiers.get('isbn', None)
|
||||||
|
|
||||||
for mi in metadata:
|
for mi in metadata:
|
||||||
@ -216,13 +258,13 @@ class Ozon(Source):
|
|||||||
ozon_id = mi.identifiers['ozon']
|
ozon_id = mi.identifiers['ozon']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.get_book_details(log, mi, timeout)
|
self.get_book_details(log, mi, timeout, cachedPagesDict[ozon_id] if cachedPagesDict and cachedPagesDict.has_key(ozon_id) else None)
|
||||||
except:
|
except:
|
||||||
log.exception(u'Failed to get details for metadata: %s'%mi.title)
|
log.exception(u'Failed to get details for metadata: %s' % mi.title)
|
||||||
|
|
||||||
all_isbns = getattr(mi, 'all_isbns', [])
|
all_isbns = getattr(mi, 'all_isbns', [])
|
||||||
if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
|
if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
|
||||||
log.debug(u'skipped, no requested ISBN %s found'%req_isbn)
|
log.debug(u'skipped, no requested ISBN %s found' % req_isbn)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for isbn in all_isbns:
|
for isbn in all_isbns:
|
||||||
@ -234,44 +276,67 @@ class Ozon(Source):
|
|||||||
self.clean_downloaded_metadata(mi)
|
self.clean_downloaded_metadata(mi)
|
||||||
result_queue.put(mi)
|
result_queue.put(mi)
|
||||||
except:
|
except:
|
||||||
log.exception(u'Failed to get details for metadata: %s'%mi.title)
|
log.exception(u'Failed to get details for metadata: %s' % mi.title)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def to_metadata(self, log, entry):  # {{{
    """Convert one HTML result element into a Metadata object.

    :param log: logger; currently unused (kept for interface compatibility).
    :param entry: element exposing an lxml-style ``xpath()`` method -
        presumably a single ``itemListElement`` search hit; verify against
        the caller.
    :return: Metadata carrying title, normalized authors, ``ozon_cover_url``,
        ``rating`` and - only when an id link was found - the ``ozon``
        identifier.

    No publication date is extracted here; get_book_details() sets
    ``pubdate`` from the detail page.
    """
    title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
    author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))

    # The author text may hold several comma separated names.
    norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, author.split(u',')))
    mi = Metadata(title, norm_authors)

    ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
    if ozon_id:
        mi.identifiers = {'ozon': ozon_id}

    # Default to "no cover"; upgrade to the large image when one is present.
    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    mi.rating = self.get_rating(entry)
    return mi
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
def get_rating(self, entry):  # {{{
    """Return the OZON star rating found in *entry*, or None.

    The rating is read from the CSS classes of a ``div`` that carries
    ``bStars`` together with one of ``m5`` ... ``m1``. Classes are probed
    from ``m5`` downwards, so the highest one present wins.

    :param entry: element exposing an lxml-style ``xpath()`` method.
    :return: float between 1.0 and 5.0, or None when no such ``div`` is
        found or the XPath evaluation fails.
    """
    xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])'
    try:
        for stars in (5, 4, 3, 2, 1):
            if entry.xpath(xp_rating_template % ('m%d' % stars)):
                # The value stays on OZON's N-of-5 scale. The original
                # author noted calibre ratings run 0-10 but suspected a
                # bug in identify, so no scaling is applied here.
                return float(stars)
    except Exception:
        # Best effort: a rating that cannot be read must not abort metadata
        # extraction. Unlike a bare except, KeyboardInterrupt and SystemExit
        # still propagate.
        pass
    return None
|
||||||
|
# }}}
|
||||||
|
|
||||||
def get_cached_cover_url(self, identifiers): # {{{
|
def get_cached_cover_url(self, identifiers): # {{{
|
||||||
url = None
|
url = None
|
||||||
ozon_id = identifiers.get('ozon', None)
|
ozon_id = identifiers.get('ozon', None)
|
||||||
@ -317,20 +382,27 @@ class Ozon(Source):
|
|||||||
if cdata:
|
if cdata:
|
||||||
result_queue.put((self, cdata))
|
result_queue.put((self, cdata))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception(u'Failed to download cover from: %s'%cached_url)
|
log.exception(u'Failed to download cover from: %s' % cached_url)
|
||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def get_book_details(self, log, metadata, timeout): # {{{
|
def get_book_details(self, log, metadata, timeout, cachedPage): # {{{
|
||||||
from lxml import html, etree
|
from lxml import html, etree
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
url = self.get_book_url(metadata.get_identifiers())[2]
|
if not cachedPage:
|
||||||
|
url = self.get_book_url(metadata.get_identifiers())[2]
|
||||||
|
#log.debug(u'book_details_url', url)
|
||||||
|
|
||||||
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
raw = self.browser.open_novisit(url, timeout=timeout).read()
|
||||||
doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
|
||||||
|
else:
|
||||||
|
fulldoc = cachedPage
|
||||||
|
#log.debug(u'book_details -> using cached page')
|
||||||
|
|
||||||
|
doc = fulldoc.xpath(u'//div[@id="PageContent"][1]')[0]
|
||||||
|
|
||||||
xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
|
xpt_tmpl_base = u'.//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
|
||||||
xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)'
|
xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)'
|
||||||
|
|
||||||
# series Серия/Серии
|
# series Серия/Серии
|
||||||
@ -342,25 +414,26 @@ class Ozon(Source):
|
|||||||
xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')'
|
xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')'
|
||||||
isbn_str = doc.xpath(xpt_isbn % u'ISBN')
|
isbn_str = doc.xpath(xpt_isbn % u'ISBN')
|
||||||
if isbn_str:
|
if isbn_str:
|
||||||
#log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str))
|
# log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str))
|
||||||
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
|
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
|
||||||
if all_isbns:
|
if all_isbns:
|
||||||
metadata.all_isbns = all_isbns
|
metadata.all_isbns = all_isbns
|
||||||
metadata.isbn = all_isbns[0]
|
metadata.isbn = all_isbns[0]
|
||||||
#log.debug(u'ISBN: ', metadata.isbn)
|
# log.debug(u'ISBN: ', metadata.isbn)
|
||||||
|
|
||||||
publishers = doc.xpath(xpt_tmpl_a % u'Издатель')
|
publishers = doc.xpath(xpt_tmpl_a % u'Издатель')
|
||||||
if publishers:
|
if publishers:
|
||||||
metadata.publisher = publishers
|
metadata.publisher = publishers
|
||||||
#log.debug(u'Publisher: ', metadata.publisher)
|
# log.debug(u'Publisher: ', metadata.publisher)
|
||||||
|
|
||||||
xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")'
|
xpt_lang = u'substring-after(normalize-space(.//text()[contains(normalize-space(.), "%s")]), ":")'
|
||||||
displ_lang = None
|
displ_lang = None
|
||||||
langs = doc.xpath(xpt_lang % u'Язык')
|
langs = doc.xpath(xpt_lang % u'Язык')
|
||||||
if langs:
|
if langs:
|
||||||
lng_splt = langs.split(u',')
|
lng_splt = langs.split(u',')
|
||||||
if lng_splt:
|
if lng_splt:
|
||||||
displ_lang = lng_splt[0].strip()
|
displ_lang = lng_splt[0].strip()
|
||||||
|
#log.debug(u'displ_lang1: ', displ_lang)
|
||||||
metadata.language = _translageLanguageToCode(displ_lang)
|
metadata.language = _translageLanguageToCode(displ_lang)
|
||||||
#log.debug(u'Language: ', metadata.language)
|
#log.debug(u'Language: ', metadata.language)
|
||||||
|
|
||||||
@ -372,10 +445,10 @@ class Ozon(Source):
|
|||||||
matcher = re.search(r'\d{4}', yearIn)
|
matcher = re.search(r'\d{4}', yearIn)
|
||||||
if matcher:
|
if matcher:
|
||||||
metadata.pubdate = toPubdate(log, matcher.group(0))
|
metadata.pubdate = toPubdate(log, matcher.group(0))
|
||||||
#log.debug(u'Pubdate: ', metadata.pubdate)
|
# log.debug(u'Pubdate: ', metadata.pubdate)
|
||||||
|
|
||||||
# overwrite comments from HTML if any
|
# overwrite comments from HTML if any
|
||||||
xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa
|
xpt = u'.//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa
|
||||||
from lxml.etree import ElementBase
|
from lxml.etree import ElementBase
|
||||||
comment_elem = doc.xpath(xpt)
|
comment_elem = doc.xpath(xpt)
|
||||||
if comment_elem:
|
if comment_elem:
|
||||||
@ -401,18 +474,17 @@ def _verifyISBNIntegrity(log, isbn): # {{{
|
|||||||
# Online ISBN-Check http://www.isbn-check.de/
|
# Online ISBN-Check http://www.isbn-check.de/
|
||||||
res = check_isbn(isbn)
|
res = check_isbn(isbn)
|
||||||
if not res:
|
if not res:
|
||||||
log.error(u'ISBN integrity check failed for "%s"'%isbn)
|
log.error(u'ISBN integrity check failed for "%s"' % isbn)
|
||||||
return res is not None
|
return res is not None
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
# TODO: make customizable
|
# TODO: make customizable
|
||||||
def _translateToBigCoverUrl(coverUrl):  # {{{
    """Map a cover thumbnail URL to the URL of the large cover image.

    The image id is the file name (without extension) of the last path
    segment, e.g.::

        //static.ozone.ru/multimedia/c200/1005748980.jpg
        -> http://www.ozon.ru/multimedia/books_covers/1005748980.jpg

    :param coverUrl: thumbnail URL as found in the page markup.
    :return: the large-cover URL, or ``coverUrl`` unchanged when no
        ``<id>.<ext>`` file name can be recognised.
    """
    # Require a literal dot after the id. Without it, a URL that has no file
    # extension would still match by giving up the id's last character and
    # produce a wrong cover URL. A trailing ?query / #fragment is tolerated.
    m = re.match(r'.+/([^./\\?#]+)\.[^/?#]*(?:[?#].*)?$', coverUrl)
    if m:
        coverUrl = 'http://www.ozon.ru/multimedia/books_covers/' + m.group(1) + '.jpg'
    return coverUrl
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
@ -459,13 +531,14 @@ def _format_isbn(log, isbn): # {{{
|
|||||||
if m:
|
if m:
|
||||||
res = '-'.join([g for g in m.groups() if g])
|
res = '-'.join([g for g in m.groups() if g])
|
||||||
else:
|
else:
|
||||||
log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported'%isbn)
|
log.error('cannot format ISBN %s. Fow now only russian ISBNs are supported' % isbn)
|
||||||
return res
|
return res
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def _translageLanguageToCode(displayLang): # {{{
|
def _translageLanguageToCode(displayLang): # {{{
|
||||||
displayLang = unicode(displayLang).strip() if displayLang else None
|
displayLang = unicode(displayLang).strip() if displayLang else None
|
||||||
langTbl = {None: 'ru',
|
langTbl = { None: 'ru',
|
||||||
|
u'Русский': 'ru',
|
||||||
u'Немецкий': 'de',
|
u'Немецкий': 'de',
|
||||||
u'Английский': 'en',
|
u'Английский': 'en',
|
||||||
u'Французский': 'fr',
|
u'Французский': 'fr',
|
||||||
@ -475,7 +548,7 @@ def _translageLanguageToCode(displayLang): # {{{
|
|||||||
u'Японский': 'ja',
|
u'Японский': 'ja',
|
||||||
u'Финский' : 'fi',
|
u'Финский' : 'fi',
|
||||||
u'Польский' : 'pl',
|
u'Польский' : 'pl',
|
||||||
u'Украинский' : 'uk',}
|
u'Украинский' : 'uk', }
|
||||||
return langTbl.get(displayLang, None)
|
return langTbl.get(displayLang, None)
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
@ -502,7 +575,7 @@ def toPubdate(log, yearAsString): # {{{
|
|||||||
try:
|
try:
|
||||||
res = parse_only_date(u"01.01." + yearAsString)
|
res = parse_only_date(u"01.01." + yearAsString)
|
||||||
except:
|
except:
|
||||||
log.error('cannot parse to date %s'%yearAsString)
|
log.error('cannot parse to date %s' % yearAsString)
|
||||||
return res
|
return res
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user