Get Books: Fix ozon.ru

This commit is contained in:
Kovid Goyal 2012-07-02 10:17:54 +05:30
parent 5a2848bacb
commit 343d8f448c
2 changed files with 80 additions and 54 deletions

View File

@ -54,30 +54,35 @@ class Ozon(Source):
# for ozon.ru search we have to format ISBN with '-'
isbn = _format_isbn(log, identifiers.get('isbn', None))
# TODO: format isbn!
qItems = set([isbn, title])
if authors:
qItems |= frozenset(authors)
qItems.discard(None)
qItems.discard('')
qItems = map(_quoteString, qItems)
q = u' '.join(qItems).strip()
log.info(u'search string: ' + q)
if isinstance(q, unicode):
q = q.encode('utf-8')
if not q:
return None
search_url += quote_plus(q)
ozonid = identifiers.get('ozon', None)
unk = unicode(_('Unknown')).upper()
if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
qItems = set([isbn, title])
if authors:
qItems |= frozenset(authors)
qItems.discard(None)
qItems.discard('')
qItems = map(_quoteString, qItems)
q = u' '.join(qItems).strip()
log.info(u'search string: ' + q)
if isinstance(q, unicode):
q = q.encode('utf-8')
if not q:
return None
search_url += quote_plus(q)
else:
search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid
log.debug(u'search url: %r'%search_url)
return search_url
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=30): # {{{
identifiers={}, timeout=60): # {{{
from lxml import etree
from calibre.ebooks.chardet import xml_to_unicode
@ -99,7 +104,7 @@ class Ozon(Source):
try:
parser = etree.XMLParser(recover=True, no_network=True)
feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
entries = feed.xpath('//*[local-name() = "SearchItems"]')
entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
if entries:
metadata = self.get_metadata(log, entries, title, authors, identifiers)
self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
@ -112,8 +117,8 @@ class Ozon(Source):
def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra characters like this
# TODO: make a tweak
reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
#reRemoveFromTitle = None
reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
title = unicode(title).upper() if title else ''
if reRemoveFromTitle:
@ -163,7 +168,7 @@ class Ozon(Source):
metadata.append(mi)
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
else:
log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors))
log.debug(u'skipped metadata %s %s. (does not match the query)'%(unicode(mi.title), mi.authors))
return metadata
# }}}
@ -301,7 +306,7 @@ class Ozon(Source):
if series:
metadata.series = series
xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])'
isbn_str = doc.xpath(xpt)
if isbn_str:
all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]
@ -326,7 +331,7 @@ class Ozon(Source):
# may already have been set from the XML search response
if not metadata.pubdate:
xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
yearIn = doc.xpath(xpt)
if yearIn:
matcher = re.search(r'\d{4}', yearIn)
@ -334,17 +339,20 @@ class Ozon(Source):
metadata.pubdate = toPubdate(log, matcher.group(0))
# overwrite comments from HTML if any
xpt = u'//table[@id="detail_description"]//tr/td'
xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'
from lxml.etree import ElementBase
comment_elem = doc.xpath(xpt)
if comment_elem:
comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
if comments:
# cleanup root tag, TODO: remove tags like object/embeded
comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
metadata.comments = comments
else:
log.debug('HTML book description skipped in favour of search service xml responce')
comments = u''
for node in comment_elem:
if isinstance(node, ElementBase):
comments += unicode(etree.tostring(node, encoding=unicode))
elif isinstance(node, basestring) and node.strip():
comments += unicode(node) + u'\n'
if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
metadata.comments = comments
else:
log.debug('HTML book description skipped in favour of search service xml responce')
else:
log.debug('No book description found in HTML')
# }}}
@ -430,7 +438,8 @@ def _translageLanguageToCode(displayLang): # {{{
u'Китайский': 'zh',
u'Японский': 'ja',
u'Финский' : 'fi',
u'Польский' : 'pl',}
u'Польский' : 'pl',
u'Украинский' : 'uk',}
return langTbl.get(displayLang, None)
# }}}
@ -454,7 +463,7 @@ def toPubdate(log, yearAsString): # {{{
res = None
if yearAsString:
try:
res = parse_only_date(yearAsString)
res = parse_only_date(u"01.01." + yearAsString)
except:
log.error('cannot parse to date %s'%yearAsString)
return res

View File

@ -46,30 +46,37 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
d.set_tags(self.config.get('tags', ''))
d.exec_()
def search(self, query, max_results=10, timeout=60):
def search(self, query, max_results=15, timeout=60):
search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
'searchText=%s&searchContext=ebook' % urllib2.quote(query)
search_urls = [ search_url ]
## add this as the first try if it looks like an ozon ID
if re.match("^\d{6,9}$", query):
ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
search_urls.insert(0, ozon_detail)
xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
counter = max_results
br = browser()
with closing(br.open(search_url, timeout=timeout)) as f:
raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
doc = etree.fromstring(raw)
for data in doc.xpath('//*[local-name() = "SearchItems"]'):
if counter <= 0:
break
counter -= 1
for url in search_urls:
with closing(br.open(url, timeout=timeout)) as f:
raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
doc = etree.fromstring(raw)
for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
if counter <= 0:
break
counter -= 1
s = SearchResult()
s.detail_item = data.xpath(xp_template.format('ID'))
s.title = data.xpath(xp_template.format('Name'))
s.author = data.xpath(xp_template.format('Author'))
s.price = data.xpath(xp_template.format('Price'))
s.cover_url = data.xpath(xp_template.format('Picture'))
s.price = format_price_in_RUR(s.price)
yield s
s = SearchResult()
s.detail_item = data.xpath(xp_template.format('ID'))
s.title = data.xpath(xp_template.format('Name'))
s.author = data.xpath(xp_template.format('Author'))
s.price = data.xpath(xp_template.format('Price'))
s.cover_url = data.xpath(xp_template.format('Picture'))
s.price = format_price_in_RUR(s.price)
yield s
def get_details(self, search_result, timeout=60):
url = self.shop_url + '/context/detail/id/' + urllib2.quote(search_result.detail_item)
@ -97,6 +104,16 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
search_result.formats = ', '.join(_parse_ebook_formats(formats))
# unfortunately no direct links to download books (only buy link)
# search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item)
#<p class="main-cost"><span class="main">215</span><span class="submain">00</span> руб.</p>
#<span itemprop="price" class="hidden">215.00</span>
#<meta itemprop="priceCurrency" content="RUR " />
# if the price is not in the search result (the ID search case)
if not search_result.price:
price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
search_result.price = format_price_in_RUR(price)
return result
def format_price_in_RUR(price):