Update Amazon Get books plugin for markup change

This commit is contained in:
Kovid Goyal 2019-03-08 08:40:30 +05:30
parent 7667b177d8
commit d205255cb7
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -4,12 +4,12 @@
from __future__ import (unicode_literals, division, absolute_import, from __future__ import (unicode_literals, division, absolute_import,
print_function) print_function)
store_version = 14 # Needed for dynamic plugin loading store_version = 16 # Needed for dynamic plugin loading
from contextlib import closing from contextlib import closing
import urllib import urllib
from lxml import html from lxml import html, etree
from PyQt5.Qt import QUrl from PyQt5.Qt import QUrl
@ -19,7 +19,7 @@ from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.search_result import SearchResult
SEARCH_BASE_URL = 'https://www.amazon.com/s/' SEARCH_BASE_URL = 'https://www.amazon.com/s/'
SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} SEARCH_BASE_QUERY = {'i': 'digital-text'}
DETAILS_URL = 'https://amazon.com/dp/' DETAILS_URL = 'https://amazon.com/dp/'
STORE_LINK = 'https://www.amazon.com/Kindle-eBooks' STORE_LINK = 'https://www.amazon.com/Kindle-eBooks'
DRM_SEARCH_TEXT = 'Simultaneous Device Usage' DRM_SEARCH_TEXT = 'Simultaneous Device Usage'
@ -34,7 +34,7 @@ def search_amazon(query, max_results=10, timeout=60,
write_html_to=None, write_html_to=None,
base_url=SEARCH_BASE_URL, base_url=SEARCH_BASE_URL,
base_query=SEARCH_BASE_QUERY, base_query=SEARCH_BASE_QUERY,
field_keywords='field-keywords' field_keywords='k'
): ):
uquery = base_query.copy() uquery = base_query.copy()
uquery[field_keywords] = query uquery[field_keywords] = query
@ -54,89 +54,25 @@ def search_amazon(query, max_results=10, timeout=60,
with open(write_html_to, 'wb') as f: with open(write_html_to, 'wb') as f:
f.write(raw) f.write(raw)
doc = html.fromstring(raw) doc = html.fromstring(raw)
try: for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
results = doc.xpath('//div[@id="atfResults" and @class]')[0] kformat = ''.join(result.xpath('.//a[contains(text(), "Kindle Edition")]//text()'))
except IndexError:
return
if 's-result-list-parent-container' in results.get('class', ''):
data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
format_xpath = './/a[contains(text(), "Kindle Edition")]//text()'
asin_xpath = '@data-asin'
cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()'
price_xpath = 'descendant::span[contains(@class, "sx-price")]/../@aria-label'
elif 'grid' in results.get('class', ''):
data_xpath = '//div[contains(@class, "prod")]'
format_xpath = (
'.//ul[contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name'
cover_xpath = './/img[contains(@class, "productImage")]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
price_xpath = (
'.//ul[contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
elif 'ilresults' in results.get('class', ''):
data_xpath = '//li[(@class="ilo")]'
format_xpath = (
'.//ul[contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name'
cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
# Results can be in a grid (table) or a column
price_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
elif 'list' in results.get('class', ''):
data_xpath = '//div[contains(@class, "prod")]'
format_xpath = (
'.//ul[contains(@class, "rsltL")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name'
cover_xpath = './/img[contains(@class, "productImage")]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
price_xpath = (
'.//ul[contains(@class, "rsltL")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
else:
return
for data in doc.xpath(data_xpath):
if counter <= 0:
break
# Even though we are searching digital-text only Amazon will still # Even though we are searching digital-text only Amazon will still
# put in results for non Kindle books (author pages). So we need # put in results for non Kindle books (author pages). So we need
# to explicitly check if the item is a Kindle book and ignore it # to explicitly check if the item is a Kindle book and ignore it
# if it isn't. # if it isn't.
format = ''.join(data.xpath(format_xpath)) if 'kindle' not in kformat.lower():
if 'kindle' not in format.lower(): continue
asin = result.get('data-asin')
if not asin:
continue continue
# We must have an asin otherwise we can't easily reference the cover_url = ''.join(result.xpath('.//img/@src'))
# book later. title = etree.tostring(result.xpath('.//h5')[0], method='text', encoding='unicode')
asin = data.xpath(asin_xpath) adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
if asin: aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
asin = asin[0] idx = aparts.index('|')
else: author = ' '.join(aparts[1:idx])
continue price = ''.join(result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]/text()'))
cover_url = ''.join(data.xpath(cover_xpath))
title = ''.join(data.xpath(title_xpath))
author = ''.join(data.xpath(author_xpath))
try:
author = author.split('by ', 1)[1].split(" (")[0]
except:
pass
price = ''.join(data.xpath(price_xpath))
counter -= 1 counter -= 1
@ -144,8 +80,8 @@ def search_amazon(query, max_results=10, timeout=60,
s.cover_url = cover_url.strip() s.cover_url = cover_url.strip()
s.title = title.strip() s.title = title.strip()
s.author = author.strip() s.author = author.strip()
s.price = price.strip()
s.detail_item = asin.strip() s.detail_item = asin.strip()
s.price = price.strip()
s.formats = 'Kindle' s.formats = 'Kindle'
yield s yield s