def CSSSelect(expr):
    """Translate the CSS selector ``expr`` into an XPath expression string.

    The translation is done by cssselect's ``HTMLTranslator.css_to_xpath``;
    cssselect is imported at call time rather than at module import.
    """
    from cssselect import HTMLTranslator
    return HTMLTranslator().css_to_xpath(expr)


def _amazon_search_url(query, search_url):
    """Return ``search_url`` with ``query`` appended as an escaped keyword string.

    The query is encoded as UTF-8 and escaped with ``quote_plus`` so that
    spaces become ``+`` and every reserved or non-ASCII character (``%``,
    ``&``, ``#``, ``+``, accented letters, CJK, ...) is percent-encoded.
    """
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2
    if not isinstance(query, bytes):
        query = query.encode('utf-8')
    return search_url + quote_plus(query)


def _result_xpaths(css_class):
    """Map the class attribute of the results container to XPath expressions.

    Returns a dict with the keys ``data``, ``format``, ``asin``, ``cover``,
    ``title``, ``author`` and ``price``, or ``None`` when ``css_class``
    matches none of the known result layouts.
    """
    # This test must come first: the container class name itself contains
    # the substring "list", which would otherwise match the last branch.
    if 's-result-list-parent-container' in css_class:
        return {
            'data': CSSSelect('li.s-result-item'),
            'format': './/a[@title="Kindle Edition"]//h3/text()',
            'asin': '@data-asin',
            'cover': CSSSelect('img.s-access-image') + '/@src',
            'title': CSSSelect('h2.s-access-title') + '//text()',
            'author': ('.//a[not(@title) and starts-with(@href, "/") and'
                       ' contains(@class, "a-link-normal") and'
                       ' contains(@href, "ref=sr_ntt_srch_lnk")]//text()'),
            'price': '(.//span[contains(@class, " s-price ")])[last()]//text()',
        }

    # Pieces shared by the three remaining layouts.
    title = './/h3[@class="newaps"]/a//text()'
    author = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
    plain_span = '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
    bold_span = '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
    product_cover = './/img[contains(@class, "productImage")]/@src'

    if 'grid' in css_class:
        return {
            'data': '//div[contains(@class, "prod")]',
            'format': './/ul[contains(@class, "rsltGridList")]' + plain_span,
            'asin': '@name',
            'cover': product_cover,
            'title': title,
            'author': author,
            'price': './/ul[contains(@class, "rsltGridList")]' + bold_span,
        }
    if 'ilresults' in css_class:
        return {
            'data': '//li[(@class="ilo")]',
            'format': './/ul[contains(@class, "rsltGridList")]' + plain_span,
            'asin': '@name',
            'cover': './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src',
            'title': title,
            'author': author,
            # Results can be in a grid (table) or a column
            'price': ('.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' +
                      bold_span),
        }
    if 'list' in css_class:
        return {
            'data': '//div[contains(@class, "prod")]',
            'format': './/ul[contains(@class, "rsltL")]' + plain_span,
            'asin': '@name',
            'cover': product_cover,
            'title': title,
            'author': author,
            'price': './/ul[contains(@class, "rsltL")]' + bold_span,
        }
    return None


def _strip_author_decoration(author):
    """Keep only the text between the first ``'by '`` and the next ``' ('``.

    Text without ``'by '`` is returned unchanged.
    """
    # NOTE(review): 'by ' is matched anywhere in the text, so a name such as
    # "Kirby Larson" is cut down to "Larson". Kept as-is to preserve the
    # existing behaviour; confirm whether anchoring at the start is wanted.
    _head, found, tail = author.partition('by ')
    if not found:
        return author
    return tail.split(' (')[0]


def search_amazon(query, max_results=10, timeout=60,
                  write_html_to=None,
                  search_url='http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='):
    """Search the Amazon Kindle store and yield ``SearchResult`` objects.

    :param query: Search keywords.
    :param max_results: Maximum number of results to yield.
    :param timeout: Timeout passed to the browser when opening the URL.
    :param write_html_to: Optional path; when not ``None`` the raw response
        body is written to this file (useful when debugging the scraper).
    :param search_url: URL prefix to which the escaped query is appended.

    Nothing is yielded when the page has no ``div#atfResults`` element with
    a class attribute, or when that class matches no known layout. Each
    yielded result has ``cover_url``, ``title``, ``author``, ``price``,
    ``detail_item`` (the ASIN) and ``formats`` (always ``'Kindle'``) set.
    """
    url = _amazon_search_url(query, search_url)
    br = browser()

    # Read the whole response up front so the connection is released before
    # any parsing starts and is not held open while the generator is idle.
    with closing(br.open(url, timeout=timeout)) as response:
        raw = response.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as dump:
            dump.write(raw)

    doc = html.fromstring(raw)
    try:
        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
    except IndexError:
        return

    xpaths = _result_xpaths(results.get('class', ''))
    if xpaths is None:
        return

    counter = max_results
    for data in doc.xpath(xpaths['data']):
        if counter <= 0:
            break

        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        book_format = ''.join(data.xpath(xpaths['format']))
        if 'kindle' not in book_format.lower():
            continue

        # We must have an asin otherwise we can't easily reference the
        # book later.
        asin = data.xpath(xpaths['asin'])
        if not asin:
            continue
        asin = asin[0]

        cover_url = ''.join(data.xpath(xpaths['cover']))
        title = ''.join(data.xpath(xpaths['title']))
        author = _strip_author_decoration(''.join(data.xpath(xpaths['author'])))
        price = ''.join(data.xpath(xpaths['price']))

        counter -= 1

        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.price = price.strip()
        s.detail_item = asin.strip()
        s.formats = 'Kindle'

        yield s
def search(self, query, max_results=10, timeout=60):
    """Yield search results for ``query`` from the Amazon Kindle store.

    Thin generator wrapper: ``max_results`` and ``timeout`` are forwarded
    unchanged to the module-level ``search_amazon`` function and every
    item it produces is re-yielded in order.
    """
    hits = search_amazon(query, max_results=max_results, timeout=timeout)
    for hit in hits:
        yield hit