Get Books: Fix ozon.ru

Kovid Goyal 2012-07-02 10:17:54 +05:30
parent 5a2848bacb
commit 343d8f448c
2 changed files with 80 additions and 54 deletions


@@ -54,30 +54,35 @@ class Ozon(Source):
         # for ozon.ru search we have to format ISBN with '-'
         isbn = _format_isbn(log, identifiers.get('isbn', None))
-        # TODO: format isbn!
-        qItems = set([isbn, title])
-        if authors:
-            qItems |= frozenset(authors)
-        qItems.discard(None)
-        qItems.discard('')
-        qItems = map(_quoteString, qItems)
-
-        q = u' '.join(qItems).strip()
-        log.info(u'search string: ' + q)
-
-        if isinstance(q, unicode):
-            q = q.encode('utf-8')
-        if not q:
-            return None
-
-        search_url += quote_plus(q)
+        ozonid = identifiers.get('ozon', None)
+
+        unk = unicode(_('Unknown')).upper()
+        if (title and title != unk) or (authors and authors != [unk]) or isbn or not ozonid:
+            qItems = set([isbn, title])
+            if authors:
+                qItems |= frozenset(authors)
+            qItems.discard(None)
+            qItems.discard('')
+            qItems = map(_quoteString, qItems)
+
+            q = u' '.join(qItems).strip()
+            log.info(u'search string: ' + q)
+
+            if isinstance(q, unicode):
+                q = q.encode('utf-8')
+            if not q:
+                return None
+
+            search_url += quote_plus(q)
+        else:
+            search_url = self.ozon_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid
         log.debug(u'search url: %r'%search_url)
         return search_url
     # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,
-            identifiers={}, timeout=30): # {{{
+            identifiers={}, timeout=60): # {{{
         from lxml import etree
         from calibre.ebooks.chardet import xml_to_unicode

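The core of the fix is in the hunk above: when the only usable identifier is an ozon ID (title and authors unknown, no ISBN), the plugin now queries the ItemDetail web service directly instead of running a free-text search. Below is a minimal standalone sketch of that selection logic, not calibre code: the search endpoint shown is borrowed from the store plugin further down (the metadata source builds its own search_url outside this hunk), the localized 'Unknown' handling is omitted, and the sample ozon ID is made up.

    # Standalone sketch (Python 2) of the URL-selection branch added above.
    from urllib import quote_plus

    OZON_URL = 'http://www.ozon.ru'  # stands in for self.ozon_url

    def build_search_url(title, authors, identifiers):
        isbn = identifiers.get('isbn')
        ozonid = identifiers.get('ozon')
        if title or authors or isbn or not ozonid:
            # regular case: free-text search over whatever is known
            q = u' '.join(filter(None, [isbn, title] + (authors or []))).strip()
            if not q:
                return None
            return OZON_URL + '/webservice/webservice.asmx/SearchWebService?searchText=' + \
                quote_plus(q.encode('utf-8'))
        # nothing to search for except the ozon ID: fetch that record directly
        return OZON_URL + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % ozonid

    print(build_search_url(None, None, {'ozon': '5702331'}))   # ItemDetail URL (made-up ID)
    print(build_search_url(u'Anna Karenina', None, {}))        # SearchWebService URL
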
@@ -99,7 +104,7 @@ class Ozon(Source):
         try:
             parser = etree.XMLParser(recover=True, no_network=True)
             feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
-            entries = feed.xpath('//*[local-name() = "SearchItems"]')
+            entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
             if entries:
                 metadata = self.get_metadata(log, entries, title, authors, identifiers)
                 self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)

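The broadened XPath works because local-name() ignores namespaces, so a single expression matches entries of both response kinds. A small sketch with made-up XML (element names as in the diff; namespace URI and content invented):

    # Sketch only: two invented response fragments, one per web service.
    from lxml import etree

    search_feed = etree.fromstring(
        '<SearchResult xmlns="urn:example"><SearchItems><Name>Book A</Name></SearchItems></SearchResult>')
    detail_feed = etree.fromstring(
        '<Result xmlns="urn:example"><ItemDetail><Name>Book B</Name></ItemDetail></Result>')

    xp = '//*[local-name()="SearchItems" or local-name()="ItemDetail"]'
    for feed in (search_feed, detail_feed):
        for entry in feed.xpath(xp):
            print(entry.xpath('normalize-space(./*[local-name()="Name"]/text())'))
    # prints: Book A, then Book B
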
@ -112,8 +117,8 @@ class Ozon(Source):
def get_metadata(self, log, entries, title, authors, identifiers): # {{{ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
# some book titles have extra characters like this # some book titles have extra characters like this
# TODO: make a twick # TODO: make a twick
reRemoveFromTitle = None #reRemoveFromTitle = None
#reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
title = unicode(title).upper() if title else '' title = unicode(title).upper() if title else ''
if reRemoveFromTitle: if reRemoveFromTitle:
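Re-enabling reRemoveFromTitle means titles get their punctuation stripped before they are compared against search results; the substitution itself presumably sits just below the `if reRemoveFromTitle:` guard, outside this hunk. A quick illustration of what the pattern removes (example title is made up):

    # Illustration only.
    import re

    reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
    title = u'war & peace: vol. 1'.upper()
    print(reRemoveFromTitle.sub('', title))   # WAR  PEACE VOL 1
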
@ -163,7 +168,7 @@ class Ozon(Source):
metadata.append(mi) metadata.append(mi)
#log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) #log.debug(u'added metadata %s %s.'%(mi.title, mi.authors))
else: else:
log.debug(u'skipped metadata %s %s. (does not match the query)'%(mi.title, mi.authors)) log.debug(u'skipped metadata %s %s. (does not match the query)'%(unicode(mi.title), mi.authors))
return metadata return metadata
# }}} # }}}
@@ -301,7 +306,7 @@ class Ozon(Source):
         if series:
             metadata.series = series

-        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
+        xpt = u'normalize-space(//*[@class="product-detail"]//text()[starts-with(., "ISBN")])'
         isbn_str = doc.xpath(xpt)
         if isbn_str:
             all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn)]

@ -326,7 +331,7 @@ class Ozon(Source):
# can be set before from xml search responce # can be set before from xml search responce
if not metadata.pubdate: if not metadata.pubdate:
xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])' xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
yearIn = doc.xpath(xpt) yearIn = doc.xpath(xpt)
if yearIn: if yearIn:
matcher = re.search(r'\d{4}', yearIn) matcher = re.search(r'\d{4}', yearIn)
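The reworked publication-date XPath now reads the product-detail block and discards everything before the first semicolon, so the four-digit-year search below it only sees the trailing part. A sketch against an invented fragment (the real ozon.ru markup may differ):

    # -*- coding: utf-8 -*-
    # Sketch only: invented product-detail markup.
    import re
    from lxml import html

    doc = html.fromstring(u'<html><body>'
                          u'<div class="product-detail">Эксмо; 2010 г.</div>'
                          u'</body></html>')
    xpt = u'normalize-space(substring-after(//div[@class="product-detail"]//text()[contains(., "г.")],";"))'
    yearIn = doc.xpath(xpt)                       # u'2010 г.'
    print(re.search(r'\d{4}', yearIn).group(0))   # 2010
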
@@ -334,17 +339,20 @@ class Ozon(Source):
                 metadata.pubdate = toPubdate(log, matcher.group(0))

         # overwrite comments from HTML if any
-        xpt = u'//table[@id="detail_description"]//tr/td'
+        xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'
+        from lxml.etree import ElementBase
         comment_elem = doc.xpath(xpt)
         if comment_elem:
-            comments = unicode(etree.tostring(comment_elem[0], encoding=unicode))
-            if comments:
-                # cleanup root tag, TODO: remove tags like object/embeded
-                comments = re.sub(ur'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
-                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
-                    metadata.comments = comments
-                else:
-                    log.debug('HTML book description skipped in favour of search service xml responce')
+            comments = u''
+            for node in comment_elem:
+                if isinstance(node, ElementBase):
+                    comments += unicode(etree.tostring(node, encoding=unicode))
+                elif isinstance(node, basestring) and node.strip():
+                    comments += unicode(node) + u'\n'
+            if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
+                metadata.comments = comments
+            else:
+                log.debug('HTML book description skipped in favour of search service xml responce')
         else:
             log.debug('No book description found in HTML')
     # }}}

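The new description scraping selects node() children, so the result list mixes lxml elements with bare text nodes; that is why the loop serialises elements with etree.tostring() and appends text nodes as plain strings. A reduced sketch of that distinction (Python 2, matching the plugin; the fragment and its id are invented):

    # Sketch only.
    from lxml import etree, html
    from lxml.etree import ElementBase

    doc = html.fromstring('<html><body><div id="d">Intro text<b>Bold</b><p>More</p></div></body></html>')
    comments = u''
    for node in doc.xpath('//div[@id="d"]/node()[not(self::comment())]'):
        if isinstance(node, ElementBase):                      # element: keep its markup
            comments += unicode(etree.tostring(node, encoding=unicode))
        elif isinstance(node, basestring) and node.strip():    # text node: keep the text
            comments += unicode(node) + u'\n'
    print(comments)   # Intro text, then <b>Bold</b><p>More</p>
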
@@ -430,7 +438,8 @@ def _translageLanguageToCode(displayLang): # {{{
                u'Китайский': 'zh',
                u'Японский': 'ja',
                u'Финский' : 'fi',
-               u'Польский' : 'pl',}
+               u'Польский' : 'pl',
+               u'Украинский' : 'uk',}

     return langTbl.get(displayLang, None)
 # }}}

@@ -454,7 +463,7 @@ def toPubdate(log, yearAsString): # {{{
     res = None
     if yearAsString:
         try:
-            res = parse_only_date(yearAsString)
+            res = parse_only_date(u"01.01." + yearAsString)
         except:
             log.error('cannot parse to date %s'%yearAsString)
     return res

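Prefixing the year with "01.01." gives the parser an explicit day and month, so a bare year string is pinned to January 1st instead of depending on parser defaults. Roughly, with dateutil (which calibre's date utilities build on; calibre's own default handling may differ in detail):

    # Sketch only: shows the effect of the "01.01." prefix on a generic parser.
    from datetime import datetime
    from dateutil import parser

    default = datetime(2012, 7, 2)   # whatever "today" happens to be
    print(parser.parse('2010', default=default).date())        # 2010-07-02
    print(parser.parse('01.01.2010', default=default).date())  # 2010-01-01
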


@@ -46,30 +46,37 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
         d.set_tags(self.config.get('tags', ''))
         d.exec_()

-    def search(self, query, max_results=15, timeout=60):
+    def search(self, query, max_results=10, timeout=60):
         search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
             'searchText=%s&searchContext=ebook' % urllib2.quote(query)
+        search_urls = [ search_url ]

-        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
-
+        ## add this as the fist try if it looks like ozon ID
+        if re.match("^\d{6,9}$", query):
+            ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
+            search_urls.insert(0, ozon_detail)
+
+        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
         counter = max_results
         br = browser()
-        with closing(br.open(search_url, timeout=timeout)) as f:
-            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
-            doc = etree.fromstring(raw)
-            for data in doc.xpath('//*[local-name() = "SearchItems"]'):
-                if counter <= 0:
-                    break
-                counter -= 1

-                s = SearchResult()
-                s.detail_item = data.xpath(xp_template.format('ID'))
-                s.title = data.xpath(xp_template.format('Name'))
-                s.author = data.xpath(xp_template.format('Author'))
-                s.price = data.xpath(xp_template.format('Price'))
-                s.cover_url = data.xpath(xp_template.format('Picture'))
-                s.price = format_price_in_RUR(s.price)
-                yield s
+        for url in search_urls:
+            with closing(br.open(url, timeout=timeout)) as f:
+                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
+                doc = etree.fromstring(raw)
+                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
+                    if counter <= 0:
+                        break
+                    counter -= 1
+
+                    s = SearchResult()
+                    s.detail_item = data.xpath(xp_template.format('ID'))
+                    s.title = data.xpath(xp_template.format('Name'))
+                    s.author = data.xpath(xp_template.format('Author'))
+                    s.price = data.xpath(xp_template.format('Price'))
+                    s.cover_url = data.xpath(xp_template.format('Picture'))
+                    s.price = format_price_in_RUR(s.price)
+                    yield s

     def get_details(self, search_result, timeout=60):
         url = self.shop_url + '/context/detail/id/' + urllib2.quote(search_result.detail_item)

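On the store side the same idea appears as a pre-flight check: a query that is just 6-9 digits is treated as a probable ozon ID, and the ItemDetail URL is tried before the normal text search. A condensed sketch of the search_urls construction (the shop_url value is illustrative, URL-quoting of the query is omitted, and the sample queries are made up):

    # Sketch only.
    import re

    shop_url = 'http://www.ozon.ru'

    def build_search_urls(query):
        search_urls = [shop_url + '/webservice/webservice.asmx/SearchWebService?'
                       + 'searchText=%s&searchContext=ebook' % query]
        if re.match(r'^\d{6,9}$', query):   # looks like an ozon ID
            search_urls.insert(0, shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query)
        return search_urls

    print(build_search_urls('1234567'))   # ItemDetail first, then the text search
    print(build_search_urls('tolstoy'))   # text search only
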
@@ -97,6 +104,16 @@ class OzonRUStore(BasicStoreConfig, StorePlugin):
             search_result.formats = ', '.join(_parse_ebook_formats(formats))
             # unfortunately no direct links to download books (only buy link)
             # search_result.downloads['BF2'] = self.shop_url + '/order/digitalorder.aspx?id=' + + urllib2.quote(search_result.detail_item)
+
+            #<p class="main-cost"><span class="main">215</span><span class="submain">00</span> руб.</p>
+            #<span itemprop="price" class="hidden">215.00</span>
+            #<meta itemprop="priceCurrency" content="RUR " />
+
+            # if the price not in the search result (the ID search case)
+            if not search_result.price:
+                price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
+                search_result.price = format_price_in_RUR(price)
+
         return result

 def format_price_in_RUR(price):

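When the ID-based ItemDetail search supplies no price, get_details now falls back to the itemprop="price" value embedded in the product page and then runs it through format_price_in_RUR() as before. A sketch of just that extraction, using markup shaped like the snippets quoted in the added comments (the page fragment itself is invented):

    # Sketch only.
    from lxml import html

    doc = html.fromstring('<html><body>'
                          '<span itemprop="price" class="hidden">215.00</span>'
                          '<meta itemprop="priceCurrency" content="RUR "/>'
                          '</body></html>')
    price = doc.xpath(u'normalize-space(//*[@itemprop="price"]/text())')
    print(price)   # 215.00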