Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Get Books: Fix ozon.ru
This commit is contained in:
parent 5a2848bacb
commit 343d8f448c
@@ -54,30 +54,35 @@ class Ozon(Source):
         # for ozon.ru search we have to format ISBN with '-'
         isbn = _format_isbn(log, identifiers.get('isbn', None))
-        # TODO: format isbn!
-        qItems = set([isbn, title])
-        if authors:
-            qItems |= frozenset(authors)
-        qItems.discard(None)
-        qItems.discard('')
-        qItems = map(_quoteString, qItems)
-
-        q = u' '.join(qItems).strip()
-        log.info(u'search string: ' + q)
-
-        if isinstance(q, unicode):
-            q = q.encode('utf-8')
-        if not q:
-            return None
-
-        search_url += quote_plus(q)
+        ozonid = identifiers.get('ozon', None)
+
+        unk = unicode(_('Unknown')).upper()
+        if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
+            qItems = set([isbn, title])
+            if authors:
+                qItems |= frozenset(authors)
+            qItems.discard(None)
+            qItems.discard('')
+            qItems = map(_quoteString, qItems)
+
+            q = u' '.join(qItems).strip()
+            log.info(u'search string: ' + q)
+
+            if isinstance(q, unicode):
+                q = q.encode('utf-8')
+            if not q:
+                return None
+
+            search_url += quote_plus(q)
+        else:
+            search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid

         log.debug(u'search url: %r'%search_url)
         return search_url
     # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,
-            identifiers={}, timeout=30): # {{{
+            identifiers={}, timeout=60): # {{{
         from lxml import etree
         from calibre.ebooks.chardet import xml_to_unicode
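The reworked create_query() above only builds a free-text query when it actually has something to search for (title, authors or an ISBN); if all it has is an ozon identifier, it asks the ItemDetail web service for that record directly. A rough standalone sketch of that decision follows; OZON_URL, SEARCH_URL and build_search_url are illustrative names, not the plugin's own code:

# Rough standalone sketch (not the plugin's code) of the new create_query()
# decision: build a text query from ISBN/title/authors when possible, and
# fall back to the ItemDetail web service when only the ozon ID is known.
try:
    from urllib.parse import quote_plus   # Python 3
except ImportError:
    from urllib import quote_plus         # Python 2

OZON_URL = 'http://www.ozon.ru'
SEARCH_URL = OZON_URL + '/?context=search&text='   # illustrative search endpoint

def build_search_url(title=None, authors=None, isbn=None, ozonid=None):
    unk = 'UNKNOWN'  # the plugin compares against the localized 'Unknown'
    known = (title and title.upper() != unk) or \
            (authors and [a.upper() for a in authors] != [unk]) or isbn
    if known or not ozonid:
        items = set([isbn, title]) | set(authors or [])
        items.discard(None)
        items.discard('')
        q = u' '.join(sorted(items)).strip()
        if not q:
            return None
        return SEARCH_URL + quote_plus(q.encode('utf-8'))
    # only an ozon ID is available: query the ItemDetail service directly
    return OZON_URL + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid

print(build_search_url(title=u'Metro 2033', authors=[u'Dmitry Glukhovsky']))
print(build_search_url(ozonid='4957328'))  # made-up ID: ItemDetail fallback

The point of the new else branch is that a record previously fetched from ozon.ru can be re-identified by its ID alone, even when its title and authors are still 'Unknown'.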
@@ -99,7 +104,7 @@ class Ozon(Source):
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
             feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
-            entries = feed.xpath('//*[local-name() = "SearchItems"]')
+            entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
             if entries:
                 metadata = self.get_metadata(log, entries, title, authors, identifiers)
                 self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
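Matching on local-name() keeps the lookup independent of whatever namespace the web service response declares, and the widened expression now accepts both SearchItems (text search) and ItemDetail (direct ID lookup) elements. A minimal, self-contained illustration with made-up XML (the namespace URI is invented):

# Minimal illustration (made-up XML, invented namespace URI) of why the
# plugin matches elements by local-name(): the test works whether or not the
# response declares a default namespace, and it now accepts both node kinds.
from lxml import etree

raw = b'''<SearchWebServiceResult xmlns="http://example.invalid/ozon">
  <SearchItems><Name>Book one</Name></SearchItems>
  <ItemDetail><Name>Book two</Name></ItemDetail>
</SearchWebServiceResult>'''

feed = etree.fromstring(raw)
entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
for entry in entries:
    # child lookup is namespace-agnostic for the same reason
    print(entry.xpath('normalize-space(./*[local-name()="Name"]/text())'))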
@@ -112,8 +117,8 @@ class Ozon(Source):
     def get_metadata(self, log, entries, title, authors, identifiers): # {{{
         # some book titles have extra characters like this
         # TODO: make a tweak
-        reRemoveFromTitle = None
-        #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
+        #reRemoveFromTitle = None
+        reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')

         title = unicode(title).upper() if title else ''
         if reRemoveFromTitle:
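The previously commented-out regex is switched on here, so punctuation is stripped from titles before the result titles are compared against the query. A hedged sketch of that normalization; normalize_title is an illustrative helper, not a function from the plugin:

# Hedged sketch of the title normalization get_metadata() now performs before
# comparing titles; normalize_title is an illustrative helper, not plugin API.
import re

reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')

def normalize_title(title):
    t = (title or u'').upper()
    t = reRemoveFromTitle.sub(u'', t)   # drop punctuation
    return u' '.join(t.split())         # collapse whitespace

print(normalize_title(u'Metro 2033: Gates of Moscow!'))  # METRO 2033 GATES OF MOSCOW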
@@ -163,7 +168,7 @@ class Ozon(Source):
                 metadata.append(mi)
                 #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
             else:
-                log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
+                log.debug(u'skipped metadata %s %s. (does not match the query)'%(unicode(mi.title), mi.authors))
         return metadata
     # }}}

@@ -301,7 +306,7 @@ class Ozon(Source):
         if series:
             metadata.series = series

-        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
+        xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])'
         isbn_str = doc.xpath(xpt)
         if isbn_str:
             all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
@@ -326,7 +331,7 @@ class Ozon(Source):

         # can be set before from xml search response
         if not metadata.pubdate:
-            xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
+            xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
             yearIn = doc.xpath(xpt)
             if yearIn:
                 matcher = re.search(r'\d{4}', yearIn)
@@ -334,17 +339,20 @@
                     metadata.pubdate = toPubdate(log, matcher.group(0))

         # overwrite comments from HTML if any
-        xpt = u'//table[@id="detail_description"]//tr/td'
+        xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'
+        from lxml.etree import ElementBase
         comment_elem = doc.xpath(xpt)
         if comment_elem:
-            comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
-            if comments:
-                # cleanup root tag, TODO: remove tags like object/embedded
-                comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
-                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
-                    metadata.comments = comments
-                else:
-                    log.debug('HTML book description skipped in favour of search service xml response')
+            comments = u''
+            for node in comment_elem:
+                if isinstance(node, ElementBase):
+                    comments += unicode(etree.tostring(node, encoding=unicode))
+                elif isinstance(node, basestring) and node.strip():
+                    comments += unicode(node) + u'\n'
+            if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
+                metadata.comments = comments
+            else:
+                log.debug('HTML book description skipped in favour of search service xml response')
         else:
             log.debug('No book description found in HTML')
     # }}}
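The new XPath collects every node that follows the "От производителя" ("From the publisher") heading inside the description block, which yields a mix of element nodes and bare text nodes; the loop therefore serializes elements and appends stripped text separately. A small self-contained demonstration with made-up HTML and a simplified XPath (not the plugin's exact expression):

# Self-contained demonstration (made-up HTML, simplified XPath) of collecting
# a mixed node-set: lxml returns element objects for element nodes and "smart
# strings" for text nodes, so the two kinds are concatenated differently.
from lxml import etree, html

doc = html.fromstring(u'''<div id="detail_description">
  <h3>От производителя</h3>
  Первый абзац описания.
  <p>Второй абзац <b>описания</b>.</p>
</div>''')

# everything after the heading, skipping comments and <br> tags
nodes = doc.xpath(u'//*[@id="detail_description"]/node()'
                  u'[preceding-sibling::h3][not(self::comment())][not(self::br)]')

comments = u''
for node in nodes:
    if etree.iselement(node):      # element node: keep its markup
        comments += etree.tostring(node, encoding='unicode')
    elif node.strip():             # text node with real content
        comments += node.strip() + u'\n'
print(comments)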
@@ -430,7 +438,8 @@ def _translageLanguageToCode(displayLang): # {{{
                 u'Китайский': 'zh',
                 u'Японский': 'ja',
                 u'Финский' : 'fi',
-                u'Польский' : 'pl',}
+                u'Польский' : 'pl',
+                u'Украинский' : 'uk',}
     return langTbl.get(displayLang, None)
 # }}}

@@ -454,7 +463,7 @@ def toPubdate(log, yearAsString): # {{{
     res = None
     if yearAsString:
         try:
-            res = parse_only_date(yearAsString)
+            res = parse_only_date(u"01.01." + yearAsString)
         except:
             log.error('cannot parse to date %s'%yearAsString)
     return res

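toPubdate() only ever receives a bare year scraped from the page, and prefixing "01.01." pins the parsed date to 1 January of that year instead of leaving the day and month to the parser. A hedged illustration using datetime.strptime as a stand-in for calibre's parse_only_date; the to_pubdate name is illustrative:

# Hedged illustration of the toPubdate() change; datetime.strptime stands in
# for calibre's parse_only_date, and to_pubdate is an illustrative name.
from datetime import datetime

def to_pubdate(year_as_string):
    res = None
    if year_as_string:
        try:
            # "2011" -> "01.01.2011" -> 1 January 2011
            res = datetime.strptime(u'01.01.' + year_as_string, '%d.%m.%Y')
        except ValueError:
            print('cannot parse to date %s' % year_as_string)
    return res

print(to_pubdate(u'2011'))  # 2011-01-01 00:00:00
print(to_pubdate(u'n/a'))   # error branch, returns None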
@@ -46,30 +46,37 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
            d.set_tags(self.config.get('tags', ''))
            d.exec_()

-    def search(self, query, max_results=10, timeout=60):
+    def search(self, query, max_results=15, timeout=60):
         search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
             'searchText=%s&searchContext=ebook' % urllib2.quote(query)
+        search_urls = [ search_url ]
+
+        ## add this as the first try if it looks like ozon ID
+        if re.match("^\d{6,9}$", query):
+            ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
+            search_urls.insert(0, ozon_detail)

         xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

         counter = max_results
         br = browser()
-        with closing(br.open(search_url, timeout=timeout)) as f:
-            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
-            doc = etree.fromstring(raw)
-            for data in doc.xpath('//*[local-name() = "SearchItems"]'):
-                if counter <= 0:
-                    break
-                counter -= 1
-
-                s = SearchResult()
-                s.detail_item = data.xpath(xp_template.format('ID'))
-                s.title = data.xpath(xp_template.format('Name'))
-                s.author = data.xpath(xp_template.format('Author'))
-                s.price = data.xpath(xp_template.format('Price'))
-                s.cover_url = data.xpath(xp_template.format('Picture'))
-                s.price = format_price_in_RUR(s.price)
-                yield s
+        for url in search_urls:
+            with closing(br.open(url, timeout=timeout)) as f:
+                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
+                doc = etree.fromstring(raw)
+                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
+                    if counter <= 0:
+                        break
+                    counter -= 1
+
+                    s = SearchResult()
+                    s.detail_item = data.xpath(xp_template.format('ID'))
+                    s.title = data.xpath(xp_template.format('Name'))
+                    s.author = data.xpath(xp_template.format('Author'))
+                    s.price = data.xpath(xp_template.format('Price'))
+                    s.cover_url = data.xpath(xp_template.format('Picture'))
+                    s.price = format_price_in_RUR(s.price)
+                    yield s

     def get_details(self, search_result, timeout=60):
         url = self.shop_url + '/context/detail/id/' + urllib2.quote(search_result.detail_item)
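In the Get Books store plugin (class OzonRUStore), search() now builds a list of URLs to try: if the query looks like a bare ozon catalogue ID (six to nine digits), the ItemDetail web service is queried first, and the ordinary text search is kept as the fallback. A rough sketch of that ordering; SHOP_URL and build_search_urls are illustrative stand-ins, not plugin API:

# Rough sketch (illustrative names, not the plugin's code) of the new request
# ordering in search(): an ItemDetail lookup is tried first when the query
# looks like a bare ozon catalogue ID, with the text search kept as fallback.
import re
try:
    from urllib.parse import quote   # Python 3
except ImportError:
    from urllib import quote         # Python 2

SHOP_URL = 'http://www.ozon.ru'

def build_search_urls(query):
    urls = [SHOP_URL + '/webservice/webservice.asmx/SearchWebService?'
            'searchText=%s&searchContext=ebook' % quote(query)]
    if re.match(r'^\d{6,9}$', query):
        # looks like an ozon ID: ask for that item directly, before searching
        urls.insert(0, SHOP_URL + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query)
    return urls

print(build_search_urls('4957328'))     # made-up ID: ItemDetail first
print(build_search_urls('metro 2033'))  # plain text search only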
@@ -97,6 +104,16 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
                 search_result.formats = ', '.join(_parse_ebook_formats(formats))
             # unfortunately no direct links to download books (only buy link)
             # search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item)
+
+            #<p class="main-cost"><span class="main">215</span><span class="submain">00</span> руб.</p>
+            #<span itemprop="price" class="hidden">215.00</span>
+            #<meta itemprop="priceCurrency" content="RUR " />
+
+            # if the price is not in the search result (the ID search case)
+            if not search_result.price:
+                price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
+                search_result.price = format_price_in_RUR(price)
+
         return result

 def format_price_in_RUR(price):
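When a result comes from the ItemDetail lookup, the price is apparently missing from the search response, so get_details() now falls back to scraping the schema.org itemprop="price" value from the product page, as hinted by the HTML snippets quoted in the comments above. A self-contained sketch with made-up HTML; format_price_in_rur is an illustrative stand-in for the plugin's format_price_in_RUR helper:

# Self-contained sketch (made-up HTML) of the fallback price scrape: read the
# hidden schema.org itemprop="price" value from the product page.
from lxml import html

page = html.fromstring(u'''<div>
  <p class="main-cost"><span class="main">215</span><span class="submain">00</span> руб.</p>
  <span itemprop="price" class="hidden">215.00</span>
  <meta itemprop="priceCurrency" content="RUR " />
</div>''')

def format_price_in_rur(price):
    # illustrative stand-in: append the rouble abbreviation if a price was found
    return (price + u' руб.') if price else None

price = page.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
print(format_price_in_rur(price))  # 215.00 руб.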