Microsoft removed the Bing webcache

The Amazon plugin no longer uses a webcache, as the Wayback Machine doesn't
cache amazon anymore and the Google and Bing webcaches are no longer
available. Presumably Amazon will start CAPTCHAing us soon as well, at which
point look into using a webengine scraper for it. Sigh.
Kovid Goyal 2024-12-27 13:51:34 +05:30
parent 62bd116cdb
commit c79818240e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 20 additions and 47 deletions
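Note: the commit message mentions eventually needing a webengine scraper for
Amazon; the Bing search path in this commit already switches to calibre's
webengine-backed simple scraper. A minimal sketch of that call pattern, with
the storage-list-first signature inferred from the partial() usage in the
diff below (the exact read_url signature is an assumption):

from functools import partial

from calibre.scraper.simple import read_url

# A persistent list the scraper uses to keep its state between calls
# (mirrors bing_scraper_storage in the diff below).
scraper_storage = []
fetch = partial(read_url, scraper_storage)

# Assumed to return the rendered page source as text, fetched via a
# headless webengine rather than a plain HTTP client.
html = fetch('https://www.bing.com/search?q=calibre', timeout=60)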

src/calibre/ebooks/metadata/sources/amazon.py

@@ -1090,7 +1090,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 3, 11)
+    version = (1, 3, 12)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
@@ -1588,12 +1588,10 @@ class Amazon(Source):
             purl = urlparse(result.url)
             if '/dp/' in purl.path and site in purl.netloc:
-                url = result.cached_url
-                if url is None:
-                    url = se.get_cached_url(result.url, br, timeout=timeout)
-                if url is None:
-                    log('Failed to find cached page for:', result.url)
-                    continue
+                # We cannot use the cached URL as the Wayback Machine no
+                # longer caches amazon and the Google and Bing web caches
+                # are no longer accessible.
+                url = result.url
                 if url not in matches:
                     matches.append(url)
                 if len(matches) >= 3:
@@ -1778,6 +1776,14 @@ def manual_tests(domain, **kw):  # {{{
     from calibre.ebooks.metadata.sources.test import authors_test, comments_test, isbn_test, series_test, test_identify_plugin, title_test
     all_tests = {}
     all_tests['com'] = [  # {{{
+        (  # # in title
+            {'title': 'Expert C# 2008 Business Objects',
+             'authors': ['Lhotka']},
+            [title_test('Expert C#'),
+             authors_test(['Rockford Lhotka'])
+             ]
+        ),
+
         (  # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
             [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]
@@ -1805,14 +1811,6 @@ def manual_tests(domain, **kw):  # {{{
             ]
         ),
 
-        (  # # in title
-            {'title': 'Expert C# 2008 Business Objects',
-             'authors': ['Lhotka']},
-            [title_test('Expert C#'),
-             authors_test(['Rockford Lhotka'])
-             ]
-        ),
-
         (  # New search results page markup (Dec 2024)
             {'title': 'Come si scrive un articolo medico-scientifico'},
             [title_test('Come si scrive un articolo medico-scientifico', exact=True)]

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -31,7 +31,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 2, 12)
+current_version = (1, 2, 13)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -221,13 +221,6 @@ def bing_url_processor(url):
     return url
 
 
-def bing_cached_url(url, br=None, log=prints, timeout=60):
-    # See https://support.microsoft.com/en-gb/topic/advanced-search-keywords-ea595928-5d63-4a0b-9c6b-0b769865e78a for operators
-    results, search_url = bing_search(['url:' + url], br=br, log=log, timeout=timeout)
-    for result in results:
-        return result.cached_url
-
-
 def resolve_bing_wrapper_page(url, br, log):
     raw = br.open_novisit(url).read().decode('utf-8', 'replace')
     m = re.search(r'var u = "(.+)"', raw)
@@ -238,6 +231,9 @@ def resolve_bing_wrapper_page(url, br, log):
     return m.group(1)
 
 
+bing_scraper_storage = []
+
+
 def bing_search(
     terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60,
     show_user_agent=False, result_url_is_ok=lambda x: True
@@ -249,20 +245,8 @@ def bing_search(
     q = '+'.join(terms)
     url = 'https://www.bing.com/search?q={q}'.format(q=q)
     log('Making bing query: ' + url)
-    if br is None:
-        br = browser()
-    else:
-        br = br.clone_browser()
-    br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
-    ua = ''
-    from calibre.utils.random_ua import random_common_chrome_user_agent
-    while not ua:
-        ua = random_common_chrome_user_agent()
-    if show_user_agent:
-        print('User-agent:', ua)
-    br.addheaders.append(('User-agent', ua))
-    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    from calibre.scraper.simple import read_url
+    root = query(br, url, 'bing', dump_raw, timeout=timeout, simple_scraper=partial(read_url, bing_scraper_storage))
     ans = []
     result_items = root.xpath('//*[@id="b_results"]/li[@class="b_algo"]')
     if not result_items:
@@ -272,19 +256,11 @@ def bing_search(
         a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class="b_algoheader"]/a[@href]')
         a = a[0]
         title = tostring(a)
-        try:
-            div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
-        except IndexError:
-            log('Ignoring {!r} as it has no cached page'.format(title))
-            continue
-        d, w = div.get('u').split('|')[-2:]
-        cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
-            q=q, d=d, w=w)
         ans_url = a.get('href')
         if ans_url.startswith('https://www.bing.com/'):
             ans_url = resolve_bing_wrapper_page(ans_url, br, log)
         if result_url_is_ok(ans_url):
-            ans.append(Result(ans_url, title, cached_url))
+            ans.append(Result(ans_url, title, None))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
         log('Failed to find any results on results page, with title:', title)
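With the b_attribution parsing gone, Bing results no longer carry a cached
copy. A hypothetical usage sketch of bing_search after this change; the
(results, search_url) return shape and the Result fields are taken from the
surrounding code:

from calibre.ebooks.metadata.sources.search_engines import bing_search

results, search_url = bing_search(['site:amazon.com', 'expert', 'c#'])
for result in results:
    # result.cached_url is now always None; consumers must fetch
    # result.url directly.
    print(result.title, result.url)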
@@ -469,7 +445,6 @@ def get_cached_url(url, br=None, log=prints, timeout=60):
     threads = []
     threads.append(Thread(target=doit, args=(wayback_machine_cached_url,), daemon=True).start())
-    threads.append(Thread(target=doit, args=(bing_cached_url,), daemon=True).start())
     while threads:
         x = q.get()
         if x is not None:
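get_cached_url is thus left with the Wayback Machine as its only backend. For
illustration, a hypothetical standalone lookup against the public Wayback
availability API; this is not calibre's wayback_machine_cached_url, just a
sketch of the same idea:

import json
from urllib.parse import quote
from urllib.request import urlopen

def wayback_cached_url(url, timeout=60):
    # Ask the Wayback Machine for its closest archived snapshot of url.
    q = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
    with urlopen(q, timeout=timeout) as f:
        data = json.loads(f.read())
    closest = data.get('archived_snapshots', {}).get('closest') or {}
    return closest.get('url')  # None when nothing is archived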