diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 33a089b369..ad7450923f 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -1090,7 +1090,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 3, 11)
+    version = (1, 3, 12)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
 
@@ -1588,12 +1588,10 @@ class Amazon(Source):
 
             purl = urlparse(result.url)
             if '/dp/' in purl.path and site in purl.netloc:
-                url = result.cached_url
-                if url is None:
-                    url = se.get_cached_url(result.url, br, timeout=timeout)
-                if url is None:
-                    log('Failed to find cached page for:', result.url)
-                    continue
+                # We cannot use a cached URL as the Wayback Machine no longer
+                # caches Amazon, and the Google and Bing web caches are no
+                # longer accessible.
+                url = result.url
                 if url not in matches:
                     matches.append(url)
                 if len(matches) >= 3:
@@ -1778,6 +1776,14 @@ def manual_tests(domain, **kw):  # {{{
     from calibre.ebooks.metadata.sources.test import authors_test, comments_test, isbn_test, series_test, test_identify_plugin, title_test
     all_tests = {}
     all_tests['com'] = [ # {{{
+        ( # # in title
+            {'title': 'Expert C# 2008 Business Objects',
+             'authors': ['Lhotka']},
+            [title_test('Expert C#'),
+             authors_test(['Rockford Lhotka'])
+             ]
+        ),
+
         ( # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
             [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]
         ),
@@ -1805,14 +1811,6 @@
              ]
         ),
 
-        ( # # in title
-            {'title': 'Expert C# 2008 Business Objects',
-             'authors': ['Lhotka']},
-            [title_test('Expert C#'),
-             authors_test(['Rockford Lhotka'])
-             ]
-        ),
-
         ( # New search results page markup (Dec 2024)
             {'title': 'Come si scrive un articolo medico-scientifico'},
             [title_test('Come si scrive un articolo medico-scientifico', exact=True)]
diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 0b1403abb1..dbbfdef934 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -31,7 +31,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 2, 12)
+current_version = (1, 2, 13)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -221,13 +221,6 @@ def bing_url_processor(url):
     return url
 
 
-def bing_cached_url(url, br=None, log=prints, timeout=60):
-    # See https://support.microsoft.com/en-gb/topic/advanced-search-keywords-ea595928-5d63-4a0b-9c6b-0b769865e78a for operators
-    results, search_url = bing_search(['url:' + url], br=br, log=log, timeout=timeout)
-    for result in results:
-        return result.cached_url
-
-
 def resolve_bing_wrapper_page(url, br, log):
     raw = br.open_novisit(url).read().decode('utf-8', 'replace')
     m = re.search(r'var u = "(.+)"', raw)
@@ -238,6 +231,9 @@
     return m.group(1)
 
 
+bing_scraper_storage = []
+
+
 def bing_search(
     terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None,
     timeout=60, show_user_agent=False, result_url_is_ok=lambda x: True
@@ -249,20 +245,8 @@
     q = '+'.join(terms)
     url = 'https://www.bing.com/search?q={q}'.format(q=q)
     log('Making bing query: ' + url)
-    if br is None:
-        br = browser()
-    else:
-        br = br.clone_browser()
-    br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
-    ua = ''
-    from calibre.utils.random_ua import random_common_chrome_user_agent
-    while not ua:
-        ua = random_common_chrome_user_agent()
-    if show_user_agent:
-        print('User-agent:', ua)
-    br.addheaders.append(('User-agent', ua))
-
-    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    from calibre.scraper.simple import read_url
+    root = query(br, url, 'bing', dump_raw, timeout=timeout, simple_scraper=partial(read_url, bing_scraper_storage))
     ans = []
     result_items = root.xpath('//*[@id="b_results"]/li[@class="b_algo"]')
     if not result_items:
@@ -272,19 +256,11 @@
         a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class="b_algoheader"]/a[@href]')
         a = a[0]
         title = tostring(a)
-        try:
-            div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
-        except IndexError:
-            log('Ignoring {!r} as it has no cached page'.format(title))
-            continue
-        d, w = div.get('u').split('|')[-2:]
-        cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
-            q=q, d=d, w=w)
         ans_url = a.get('href')
         if ans_url.startswith('https://www.bing.com/'):
             ans_url = resolve_bing_wrapper_page(ans_url, br, log)
         if result_url_is_ok(ans_url):
-            ans.append(Result(ans_url, title, cached_url))
+            ans.append(Result(ans_url, title, None))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
         log('Failed to find any results on results page, with title:', title)
@@ -469,7 +445,6 @@ def get_cached_url(url, br=None, log=prints, timeout=60):
 
     threads = []
     threads.append(Thread(target=doit, args=(wayback_machine_cached_url,), daemon=True).start())
-    threads.append(Thread(target=doit, args=(bing_cached_url,), daemon=True).start())
     while threads:
         x = q.get()
         if x is not None:
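
To exercise just the relocated '# in title' test, a minimal driver along the following lines can be used. This is a sketch, not part of the patch: it assumes a calibre development environment (run it with calibre-debug so the calibre package is importable), and it makes live network requests to Amazon, so the outcome depends on what Amazon currently serves.

# sharp_title_test.py (hypothetical file name): runs only the '# in title'
# case from manual_tests('com') above, exercising the new code path in which
# search result URLs are used directly, with no cached-page lookup.
from calibre.ebooks.metadata.sources.test import authors_test, test_identify_plugin, title_test

test_identify_plugin(
    'Amazon.com',  # must match Amazon.name as declared in the plugin class above
    [
        ( # the '#' must survive query construction and title parsing
            {'title': 'Expert C# 2008 Business Objects', 'authors': ['Lhotka']},
            [title_test('Expert C#'), authors_test(['Rockford Lhotka'])],
        ),
    ],
)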