From 61b24aef56ff19a0187b2f7d58d84e4bd1b5229e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 2 Apr 2022 15:39:24 +0530 Subject: [PATCH] Get books: Update English language Amazon plugins for website changes --- src/calibre/gui2/store/amazon_base.py | 115 +++++++++++++ .../gui2/store/stores/amazon_au_plugin.py | 153 ++--------------- .../gui2/store/stores/amazon_ca_plugin.py | 153 ++--------------- .../gui2/store/stores/amazon_in_plugin.py | 155 ++---------------- .../gui2/store/stores/amazon_plugin.py | 128 ++------------- .../gui2/store/stores/amazon_uk_plugin.py | 153 ++--------------- 6 files changed, 186 insertions(+), 671 deletions(-) create mode 100644 src/calibre/gui2/store/amazon_base.py diff --git a/src/calibre/gui2/store/amazon_base.py b/src/calibre/gui2/store/amazon_base.py new file mode 100644 index 0000000000..a08d3d40b0 --- /dev/null +++ b/src/calibre/gui2/store/amazon_base.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +# License: GPL v3 Copyright: 2022, Kovid Goyal + +from lxml import etree, html +from qt.core import QUrl +from urllib.parse import urlencode + +from calibre.gui2 import open_url +from calibre.gui2.store.search_result import SearchResult +from calibre.scraper.simple import read_url + + +class AmazonStore: + + minimum_calibre_version = (5, 40, 1) + SEARCH_BASE_URL = 'https://www.amazon.com/s/' + SEARCH_BASE_QUERY = {'i': 'digital-text'} + BY = 'by' + KINDLE_EDITION = 'Kindle Edition' + DETAILS_URL = 'https://amazon.com/dp/' + STORE_LINK = 'https://www.amazon.com/Kindle-eBooks' + DRM_SEARCH_TEXT = 'Simultaneous Device Usage' + DRM_FREE_TEXT = 'Unlimited' + FIELD_KEYWORDS = 'k' + + def search_amazon(self, query, max_results=10, timeout=60, write_html_to=None): + field_keywords = self.FIELD_KEYWORDS + uquery = self.SEARCH_BASE_QUERY.copy() + uquery[field_keywords] = query + + def asbytes(x): + if isinstance(x, type('')): + x = x.encode('utf-8') + return x + uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} + url = self.SEARCH_BASE_URL + '?' + urlencode(uquery) + + counter = max_results + raw = read_url(self.scraper_storage, url, timeout=timeout) + if write_html_to is not None: + with open(write_html_to, 'w') as f: + f.write(raw) + doc = html.fromstring(raw) + for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'): + kformat = ''.join(result.xpath('.//a[contains(text(), "{}")]//text()'.format(self.KINDLE_EDITION))) + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (author pages). So we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + if 'kindle' not in kformat.lower(): + continue + asin = result.get('data-asin') + if not asin: + continue + + cover_url = ''.join(result.xpath('.//img/@src')) + title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode') + adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0] + aparts = etree.tostring(adiv, method='text', encoding='unicode').split() + idx = aparts.index(self.BY) + author = ' '.join(aparts[idx+1:]).split('|')[0].strip() + price = '' + for span in result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'): + q = ''.join(span.xpath('./text()')) + if q: + price = q + break + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = author.strip() + s.detail_item = asin.strip() + s.price = price.strip() + s.formats = 'Kindle' + + yield s + + def get_details_amazon(self, search_result, timeout): + url = self.DETAILS_URL + search_result.detail_item + raw = read_url(self.scraper_storage, url, timeout=timeout) + idata = html.fromstring(raw) + return self.parse_details_amazon(idata, search_result) + + def parse_details_amazon(self, idata, search_result): + if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + self.DRM_SEARCH_TEXT + '")])'): + if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + + self.DRM_FREE_TEXT + '") and contains(b, "' + + self.DRM_SEARCH_TEXT + '")])'): + search_result.drm = SearchResult.DRM_UNLOCKED + else: + search_result.drm = SearchResult.DRM_UNKNOWN + else: + search_result.drm = SearchResult.DRM_LOCKED + return True + + def open(self, parent=None, detail_item=None, external=False): + store_link = (self.DETAILS_URL + detail_item) if detail_item else self.STORE_LINK + open_url(QUrl(store_link)) + + def search(self, query, max_results=10, timeout=60): + for result in self.search_amazon(query, max_results=max_results, timeout=timeout): + yield result + + def get_details(self, search_result, timeout): + return self.get_details_amazon(search_result, timeout) + + def develop_plugin(self): + import sys + for result in self.search_amazon(' '.join(sys.argv[1:]), write_html_to='/t/amazon.html'): + print(result) diff --git a/src/calibre/gui2/store/stores/amazon_au_plugin.py b/src/calibre/gui2/store/stores/amazon_au_plugin.py index 05c14c6199..347e6e5fd3 100644 --- a/src/calibre/gui2/store/stores/amazon_au_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_au_plugin.py @@ -3,150 +3,27 @@ # License: GPLv3 Copyright: 2015, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals -store_version = 5 # Needed for dynamic plugin loading +store_version = 20 # Needed for dynamic plugin loading -from contextlib import closing -try: - from urllib.parse import urlencode -except ImportError: - from urllib import urlencode - -from lxml import html - -from qt.core import QUrl - -from calibre import browser -from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin -from calibre.gui2.store.search_result import SearchResult - -SEARCH_BASE_URL = 'https://www.amazon.com.au/s/' -SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} -DETAILS_URL = 'https://amazon.com.au/dp/' -STORE_LINK = 'https://www.amazon.com.au' -DRM_SEARCH_TEXT = 'Simultaneous Device Usage' -DRM_FREE_TEXT = 'Unlimited' +try: + from calibre.gui2.store.amazon_base import AmazonStore +except ImportError: + class AmazonStore: + minimum_calibre_version = 9999, 0, 0 -def get_user_agent(): - return 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko' +class Base(AmazonStore): + scraper_storage = [] + SEARCH_BASE_URL = 'https://www.amazon.com.au/s/' + SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} + DETAILS_URL = 'https://amazon.com.au/dp/' + STORE_LINK = 'https://www.amazon.com.au' -def search_amazon(query, max_results=10, timeout=60, - write_html_to=None, - base_url=SEARCH_BASE_URL, - base_query=SEARCH_BASE_QUERY, - field_keywords='field-keywords' - ): - uquery = base_query.copy() - uquery[field_keywords] = query - - def asbytes(x): - if isinstance(x, type('')): - x = x.encode('utf-8') - return x - uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} - url = base_url + '?' + urlencode(uquery) - br = browser(user_agent=get_user_agent()) - - counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - try: - results = doc.xpath('//div[@id="atfResults" and @class]')[0] - except IndexError: - return - - if 's-result-list-parent-container' in results.get('class', ''): - data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" - format_xpath = './/a[@title="Kindle Edition"]/@title' - asin_xpath = '@data-asin' - cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" - title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" - author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()' - price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' - ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') - else: - return - - for data in doc.xpath(data_xpath): - if counter <= 0: - break - - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format.lower(): - continue - - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue - - cover_url = ''.join(data.xpath(cover_xpath)) - - title = ''.join(data.xpath(title_xpath)) - author = ''.join(data.xpath(author_xpath)) - try: - author = author.split('by ', 1)[1].split(" (")[0] - except: - pass - - price = ''.join(data.xpath(price_xpath)) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.formats = 'Kindle' - - yield s - - -class AmazonKindleStore(StorePlugin): - - def open(self, parent=None, detail_item=None, external=False): - store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) - - def search(self, query, max_results=10, timeout=60): - for result in search_amazon(query, max_results=max_results, timeout=timeout): - yield result - - def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + - DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN - else: - search_result.drm = SearchResult.DRM_LOCKED - return True +class AmazonKindleStore(Base, StorePlugin): + pass if __name__ == '__main__': - import sys - for result in search_amazon(' '.join(sys.argv[1:]), write_html_to='/t/amazon.html'): - print(result) + Base().develop_plugin() diff --git a/src/calibre/gui2/store/stores/amazon_ca_plugin.py b/src/calibre/gui2/store/stores/amazon_ca_plugin.py index cac37eb7e5..1bacb9342c 100644 --- a/src/calibre/gui2/store/stores/amazon_ca_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_ca_plugin.py @@ -3,150 +3,27 @@ # License: GPLv3 Copyright: 2015, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals -store_version = 7 # Needed for dynamic plugin loading +store_version = 20 # Needed for dynamic plugin loading -from contextlib import closing -try: - from urllib.parse import urlencode -except ImportError: - from urllib import urlencode - -from lxml import html - -from qt.core import QUrl - -from calibre import browser -from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin -from calibre.gui2.store.search_result import SearchResult - -SEARCH_BASE_URL = 'https://www.amazon.ca/s/' -SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} -DETAILS_URL = 'https://amazon.ca/dp/' -STORE_LINK = 'https://www.amazon.ca' -DRM_SEARCH_TEXT = 'Simultaneous Device Usage' -DRM_FREE_TEXT = 'Unlimited' +try: + from calibre.gui2.store.amazon_base import AmazonStore +except ImportError: + class AmazonStore: + minimum_calibre_version = 9999, 0, 0 -def get_user_agent(): - return 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko' +class Base(AmazonStore): + scraper_storage = [] + SEARCH_BASE_URL = 'https://www.amazon.ca/s/' + SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} + DETAILS_URL = 'https://amazon.ca/dp/' + STORE_LINK = 'https://www.amazon.ca' -def search_amazon(query, max_results=10, timeout=60, - write_html_to=None, - base_url=SEARCH_BASE_URL, - base_query=SEARCH_BASE_QUERY, - field_keywords='field-keywords' - ): - uquery = base_query.copy() - uquery[field_keywords] = query - - def asbytes(x): - if isinstance(x, type('')): - x = x.encode('utf-8') - return x - uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} - url = base_url + '?' + urlencode(uquery) - br = browser(user_agent=get_user_agent()) - - counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - try: - results = doc.xpath('//div[@id="atfResults" and @class]')[0] - except IndexError: - return - - if 's-result-list-parent-container' in results.get('class', ''): - data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" - format_xpath = './/a[@title="Kindle Edition"]/@title' - asin_xpath = '@data-asin' - cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" - title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" - author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()' - price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' - ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') - else: - return - - for data in doc.xpath(data_xpath): - if counter <= 0: - break - - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format.lower(): - continue - - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue - - cover_url = ''.join(data.xpath(cover_xpath)) - - title = ''.join(data.xpath(title_xpath)) - author = ''.join(data.xpath(author_xpath)) - try: - author = author.split('by ', 1)[1].split(" (")[0] - except: - pass - - price = ''.join(data.xpath(price_xpath)) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.formats = 'Kindle' - - yield s - - -class AmazonKindleStore(StorePlugin): - - def open(self, parent=None, detail_item=None, external=False): - store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) - - def search(self, query, max_results=10, timeout=60): - for result in search_amazon(query, max_results=max_results, timeout=timeout): - yield result - - def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + - DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN - else: - search_result.drm = SearchResult.DRM_LOCKED - return True +class AmazonKindleStore(Base, StorePlugin): + pass if __name__ == '__main__': - import sys - for result in search_amazon(' '.join(sys.argv[1:]), write_html_to='/t/amazon.html'): - print(result) + Base().develop_plugin() diff --git a/src/calibre/gui2/store/stores/amazon_in_plugin.py b/src/calibre/gui2/store/stores/amazon_in_plugin.py index ca53510bd8..18fcd72c59 100644 --- a/src/calibre/gui2/store/stores/amazon_in_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_in_plugin.py @@ -3,152 +3,27 @@ # License: GPLv3 Copyright: 2015, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals -store_version = 5 # Needed for dynamic plugin loading +store_version = 20 # Needed for dynamic plugin loading -from contextlib import closing -try: - from urllib.parse import urlencode -except ImportError: - from urllib import urlencode - -from lxml import html - -from qt.core import QUrl - -from calibre import browser -from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin -from calibre.gui2.store.search_result import SearchResult - -SEARCH_BASE_URL = 'https://www.amazon.in/s/' -SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} -DETAILS_URL = 'https://amazon.in/dp/' -STORE_LINK = 'https://www.amazon.in' -DRM_SEARCH_TEXT = 'Simultaneous Device Usage' -DRM_FREE_TEXT = 'Unlimited' +try: + from calibre.gui2.store.amazon_base import AmazonStore +except ImportError: + class AmazonStore: + minimum_calibre_version = 9999, 0, 0 -def get_user_agent(): - return 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko' +class Base(AmazonStore): + scraper_storage = [] + SEARCH_BASE_URL = 'https://www.amazon.in/s/' + SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} + DETAILS_URL = 'https://amazon.in/dp/' + STORE_LINK = 'https://www.amazon.in' -def search_amazon(query, max_results=10, timeout=60, - write_html_to=None, - base_url=SEARCH_BASE_URL, - base_query=SEARCH_BASE_QUERY, - field_keywords='field-keywords' - ): - uquery = base_query.copy() - uquery[field_keywords] = query - - def asbytes(x): - if isinstance(x, type('')): - x = x.encode('utf-8') - return x - uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} - url = base_url + '?' + urlencode(uquery) - br = browser(user_agent=get_user_agent()) - - counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - try: - results = doc.xpath('//div[@id="atfResults" and @class]')[0] - except IndexError: - return - - if 's-result-list-parent-container' in results.get('class', ''): - data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" - format_xpath = './/a[@title="Kindle Edition"]/@title' - asin_xpath = '@data-asin' - cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" - title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" - author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()' - price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' - ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') - else: - return - - for data in doc.xpath(data_xpath): - if counter <= 0: - break - - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format.lower(): - continue - - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue - - cover_url = ''.join(data.xpath(cover_xpath)) - - title = ''.join(data.xpath(title_xpath)) - author = ''.join(data.xpath(author_xpath)) - try: - author = author.split('by ', 1)[1].split(" (")[0] - except: - pass - - price = ''.join(data.xpath(price_xpath)) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - if s.price: - s.price = '₹ ' + s.price - s.detail_item = asin.strip() - s.formats = 'Kindle' - - yield s - - -class AmazonKindleStore(StorePlugin): - - def open(self, parent=None, detail_item=None, external=False): - store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) - - def search(self, query, max_results=10, timeout=60): - for result in search_amazon(query, max_results=max_results, timeout=timeout): - yield result - - def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + - DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN - else: - search_result.drm = SearchResult.DRM_LOCKED - return True +class AmazonKindleStore(Base, StorePlugin): + pass if __name__ == '__main__': - import sys - for result in search_amazon(' '.join(sys.argv[1:]), write_html_to='/t/amazon.html'): - print(result) + Base().develop_plugin() diff --git a/src/calibre/gui2/store/stores/amazon_plugin.py b/src/calibre/gui2/store/stores/amazon_plugin.py index a3bfb694a7..605efb3b44 100644 --- a/src/calibre/gui2/store/stores/amazon_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_plugin.py @@ -3,129 +3,23 @@ # License: GPLv3 Copyright: 2015, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals -store_version = 19 # Needed for dynamic plugin loading +store_version = 20 # Needed for dynamic plugin loading -from contextlib import closing -try: - from urllib.parse import urlencode -except ImportError: - from urllib import urlencode - -from lxml import html, etree - -from qt.core import QUrl - -from calibre import browser -from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin -from calibre.gui2.store.search_result import SearchResult - -SEARCH_BASE_URL = 'https://www.amazon.com/s/' -SEARCH_BASE_QUERY = {'i': 'digital-text'} -BY = 'by' -KINDLE_EDITION = 'Kindle Edition' -DETAILS_URL = 'https://amazon.com/dp/' -STORE_LINK = 'https://www.amazon.com/Kindle-eBooks' -DRM_SEARCH_TEXT = 'Simultaneous Device Usage' -DRM_FREE_TEXT = 'Unlimited' +try: + from calibre.gui2.store.amazon_base import AmazonStore +except ImportError: + class AmazonStore: + minimum_calibre_version = 9999, 0, 0 -def get_user_agent(): - return 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko' +class Base(AmazonStore): + scraper_storage = [] -def search_amazon(query, max_results=10, timeout=60, - write_html_to=None, - base_url=SEARCH_BASE_URL, - base_query=SEARCH_BASE_QUERY, - field_keywords='k' - ): - uquery = base_query.copy() - uquery[field_keywords] = query - - def asbytes(x): - if isinstance(x, type('')): - x = x.encode('utf-8') - return x - uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} - url = base_url + '?' + urlencode(uquery) - br = browser(user_agent=get_user_agent()) - - counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'): - kformat = ''.join(result.xpath('.//a[contains(text(), "{}")]//text()'.format(KINDLE_EDITION))) - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - if 'kindle' not in kformat.lower(): - continue - asin = result.get('data-asin') - if not asin: - continue - - cover_url = ''.join(result.xpath('.//img/@src')) - title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode') - adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0] - aparts = etree.tostring(adiv, method='text', encoding='unicode').split() - idx = aparts.index(BY) - author = ' '.join(aparts[idx+1:]).split('|')[0].strip() - price = '' - for span in result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'): - q = ''.join(span.xpath('./text()')) - if q: - price = q - break - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.detail_item = asin.strip() - s.price = price.strip() - s.formats = 'Kindle' - - yield s - - -class AmazonKindleStore(StorePlugin): - - def open(self, parent=None, detail_item=None, external=False): - store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) - - def search(self, query, max_results=10, timeout=60): - for result in search_amazon(query, max_results=max_results, timeout=timeout): - yield result - - def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + - DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN - else: - search_result.drm = SearchResult.DRM_LOCKED - return True +class AmazonKindleStore(Base, StorePlugin): + pass if __name__ == '__main__': - import sys - for result in search_amazon(' '.join(sys.argv[1:]), write_html_to='/t/amazon.html'): - print(result) + Base().develop_plugin() diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index 4d464d793c..76a9615902 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -3,150 +3,27 @@ # License: GPLv3 Copyright: 2015, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals -store_version = 15 # Needed for dynamic plugin loading +store_version = 20 # Needed for dynamic plugin loading -from contextlib import closing -try: - from urllib.parse import urlencode -except ImportError: - from urllib import urlencode - -from lxml import html - -from qt.core import QUrl - -from calibre import browser -from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin -from calibre.gui2.store.search_result import SearchResult - -SEARCH_BASE_URL = 'https://www.amazon.co.uk/s/' -SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} -DETAILS_URL = 'https://amazon.co.uk/dp/' -STORE_LINK = 'https://www.amazon.co.uk' -DRM_SEARCH_TEXT = 'Simultaneous Device Usage' -DRM_FREE_TEXT = 'Unlimited' +try: + from calibre.gui2.store.amazon_base import AmazonStore +except ImportError: + class AmazonStore: + minimum_calibre_version = 9999, 0, 0 -def get_user_agent(): - return 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko' +class Base(AmazonStore): + scraper_storage = [] + SEARCH_BASE_URL = 'https://www.amazon.co.uk/s/' + SEARCH_BASE_QUERY = {'url': 'search-alias=digital-text'} + DETAILS_URL = 'https://amazon.co.uk/dp/' + STORE_LINK = 'https://www.amazon.co.uk' -def search_amazon(query, max_results=10, timeout=60, - write_html_to=None, - base_url=SEARCH_BASE_URL, - base_query=SEARCH_BASE_QUERY, - field_keywords='field-keywords' - ): - uquery = base_query.copy() - uquery[field_keywords] = query - - def asbytes(x): - if isinstance(x, type('')): - x = x.encode('utf-8') - return x - uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} - url = base_url + '?' + urlencode(uquery) - br = browser(user_agent=get_user_agent()) - - counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - try: - results = doc.xpath('//div[@id="atfResults" and @class]')[0] - except IndexError: - return - - if 's-result-list-parent-container' in results.get('class', ''): - data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" - format_xpath = './/a[contains(text(), "Kindle Edition")]//text()' - asin_xpath = '@data-asin' - cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" - title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" - author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()' - price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' - ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') - else: - return - - for data in doc.xpath(data_xpath): - if counter <= 0: - break - - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format.lower(): - continue - - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue - - cover_url = ''.join(data.xpath(cover_xpath)) - - title = ''.join(data.xpath(title_xpath)) - author = ''.join(data.xpath(author_xpath)) - try: - author = author.split('by ', 1)[1].split(" (")[0] - except: - pass - - price = ''.join(data.xpath(price_xpath)) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.formats = 'Kindle' - - yield s - - -class AmazonKindleStore(StorePlugin): - - def open(self, parent=None, detail_item=None, external=False): - store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) - - def search(self, query, max_results=10, timeout=60): - for result in search_amazon(query, max_results=max_results, timeout=timeout): - yield result - - def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + - DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN - else: - search_result.drm = SearchResult.DRM_LOCKED - return True +class AmazonKindleStore(Base, StorePlugin): + pass if __name__ == '__main__': - import sys - for result in search_amazon(' '.join(sys.argv[1:]), write_html_to='/t/amazon.html'): - print(result) + Base().develop_plugin()