Empik Plugin: improve hidden formats detection

This commit is contained in:
Tomasz Długosz 2015-02-21 03:02:20 +01:00
parent 9355da4576
commit 7c0036a7c6

View File

@@ -45,7 +45,7 @@ class EmpikStore(BasicStoreConfig, StorePlugin):
d.exec_() d.exec_()
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = 'http://www.empik.com/szukaj/produkt?c=ebooki-ebooki&q=' + urllib.quote(query) + '&qtype=basicForm&start=1&catalogType=pl&searchCategory=3501&resultsPP=' + str(max_results) url = 'http://www.empik.com/szukaj/produkt?c=ebooki-ebooki&q=' + urllib.quote(query) + '&qtype=basicForm&start=1&catalogType=pl&searchCategory=3501&format=epub&format=mobi&format=pdf&resultsPP=' + str(max_results)
br = browser() br = browser()
@@ -68,13 +68,19 @@ class EmpikStore(BasicStoreConfig, StorePlugin):
formats = ''.join(data.xpath('.//div[@class="productBox-450Type"]/text()')) formats = ''.join(data.xpath('.//div[@class="productBox-450Type"]/text()'))
formats = re.sub(r'Ebook *,? *','', formats) formats = re.sub(r'Ebook *,? *','', formats)
formats = re.sub(r'\(.*\)','', formats) formats = re.sub(r'\(.*\)','', formats)
with closing(br.open('http://empik.com' + id.strip(), timeout=timeout/4)) as nf:
idata = html.fromstring(nf.read())
crawled = idata.xpath('.//td[(@class="connectedInfo") or (@class="connectedInfo connectedBordered")]/a/text()')
formats_more = ','.join([ re.sub('ebook, ','', x) for x in crawled if 'ebook' in x])
if formats_more:
formats += ', ' + formats_more
drm = data.xpath('boolean(.//div[@class="productBox-450Type" and contains(text(), "ADE")])') drm = data.xpath('boolean(.//div[@class="productBox-450Type" and contains(text(), "ADE")])')
counter -= 1 counter -= 1
s = SearchResult() s = SearchResult()
s.cover_url = cover_url s.cover_url = cover_url
s.title = title.strip() + ' ' + formats s.title = title.strip()
s.author = author.strip() s.author = author.strip()
s.price = price s.price = price
s.detail_item = 'http://empik.com' + id.strip() s.detail_item = 'http://empik.com' + id.strip()