Microsoft removed the Bing webcache

The Amazon plugin no longer uses a webcache, as the Wayback Machine doesn't
cache amazon anymore and the Google and Bing webcaches are no longer
available. Presumably Amazon will start CAPTCHAing us soon as well, at which
point look into using a webengine scraper for it. Sigh.
Kovid Goyal 2024-12-27 13:51:34 +05:30
parent 62bd116cdb
commit c79818240e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 20 additions and 47 deletions
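Note: the commit message mentions eventually needing a webengine scraper for
Amazon; the Bing search path in this commit already switches to calibre's
webengine-backed simple scraper. A minimal sketch of that call pattern, with
the storage-list-first signature inferred from the partial() usage in the
diff below (the exact read_url signature is an assumption):

from functools import partial

from calibre.scraper.simple import read_url

# A persistent list the scraper uses to keep its state between calls
# (mirrors bing_scraper_storage in the diff below).
scraper_storage = []
fetch = partial(read_url, scraper_storage)

# Assumed to return the rendered page source as text, fetched via a
# headless webengine rather than a plain HTTP client.
html = fetch('https://www.bing.com/search?q=calibre', timeout=60)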

src/calibre/ebooks/metadata/sources/amazon.py

@@ -1090,7 +1090,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 3, 11)
+    version = (1, 3, 12)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
@@ -1588,12 +1588,10 @@ class Amazon(Source):
             purl = urlparse(result.url)
             if '/dp/' in purl.path and site in purl.netloc:
-                url = result.cached_url
-                if url is None:
-                    url = se.get_cached_url(result.url, br, timeout=timeout)
-                if url is None:
-                    log('Failed to find cached page for:', result.url)
-                    continue
+                # We cannot use the cached URL as the Wayback Machine no
+                # longer caches amazon and the Google and Bing web caches
+                # are no longer accessible.
+                url = result.url
                 if url not in matches:
                     matches.append(url)
                 if len(matches) >= 3:
@@ -1778,6 +1776,14 @@ def manual_tests(domain, **kw):  # {{{
     from calibre.ebooks.metadata.sources.test import authors_test, comments_test, isbn_test, series_test, test_identify_plugin, title_test
     all_tests = {}
     all_tests['com'] = [  # {{{
+        (  # # in title
+            {'title': 'Expert C# 2008 Business Objects',
+             'authors': ['Lhotka']},
+            [title_test('Expert C#'),
+             authors_test(['Rockford Lhotka'])
+             ]
+        ),
+
         (  # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
             [title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]
@@ -1805,14 +1811,6 @@ def manual_tests(domain, **kw):  # {{{
             ]
         ),
 
-        (  # # in title
-            {'title': 'Expert C# 2008 Business Objects',
-             'authors': ['Lhotka']},
-            [title_test('Expert C#'),
-             authors_test(['Rockford Lhotka'])
-             ]
-        ),
-
         (  # New search results page markup (Dec 2024)
             {'title': 'Come si scrive un articolo medico-scientifico'},
             [title_test('Come si scrive un articolo medico-scientifico', exact=True)]

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -31,7 +31,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 2, 12)
+current_version = (1, 2, 13)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -221,13 +221,6 @@ def bing_url_processor(url):
     return url
 
 
-def bing_cached_url(url, br=None, log=prints, timeout=60):
-    # See https://support.microsoft.com/en-gb/topic/advanced-search-keywords-ea595928-5d63-4a0b-9c6b-0b769865e78a for operators
-    results, search_url = bing_search(['url:' + url], br=br, log=log, timeout=timeout)
-    for result in results:
-        return result.cached_url
-
-
 def resolve_bing_wrapper_page(url, br, log):
     raw = br.open_novisit(url).read().decode('utf-8', 'replace')
     m = re.search(r'var u = "(.+)"', raw)
@@ -238,6 +231,9 @@ def resolve_bing_wrapper_page(url, br, log):
     return m.group(1)
 
 
+bing_scraper_storage = []
+
+
 def bing_search(
     terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60,
     show_user_agent=False, result_url_is_ok=lambda x: True
@@ -249,20 +245,8 @@ def bing_search(
     q = '+'.join(terms)
     url = 'https://www.bing.com/search?q={q}'.format(q=q)
     log('Making bing query: ' + url)
-    if br is None:
-        br = browser()
-    else:
-        br = br.clone_browser()
-    br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
-    ua = ''
-    from calibre.utils.random_ua import random_common_chrome_user_agent
-    while not ua:
-        ua = random_common_chrome_user_agent()
-    if show_user_agent:
-        print('User-agent:', ua)
-    br.addheaders.append(('User-agent', ua))
-    root = query(br, url, 'bing', dump_raw, timeout=timeout)
+    from calibre.scraper.simple import read_url
+    root = query(br, url, 'bing', dump_raw, timeout=timeout, simple_scraper=partial(read_url, bing_scraper_storage))
     ans = []
     result_items = root.xpath('//*[@id="b_results"]/li[@class="b_algo"]')
     if not result_items:
@@ -272,19 +256,11 @@ def bing_search(
         a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class="b_algoheader"]/a[@href]')
         a = a[0]
         title = tostring(a)
-        try:
-            div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
-        except IndexError:
-            log('Ignoring {!r} as it has no cached page'.format(title))
-            continue
-        d, w = div.get('u').split('|')[-2:]
-        cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
-            q=q, d=d, w=w)
         ans_url = a.get('href')
         if ans_url.startswith('https://www.bing.com/'):
             ans_url = resolve_bing_wrapper_page(ans_url, br, log)
         if result_url_is_ok(ans_url):
-            ans.append(Result(ans_url, title, cached_url))
+            ans.append(Result(ans_url, title, None))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
         log('Failed to find any results on results page, with title:', title)
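With the b_attribution parsing gone, Bing results no longer carry a cached
copy. A hypothetical usage sketch of bing_search after this change; the
(results, search_url) return shape and the Result fields are taken from the
surrounding code:

from calibre.ebooks.metadata.sources.search_engines import bing_search

results, search_url = bing_search(['site:amazon.com', 'expert', 'c#'])
for result in results:
    # result.cached_url is now always None; consumers must fetch
    # result.url directly.
    print(result.title, result.url)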
@@ -469,7 +445,6 @@ def get_cached_url(url, br=None, log=prints, timeout=60):
     threads = []
     threads.append(Thread(target=doit, args=(wayback_machine_cached_url,), daemon=True).start())
-    threads.append(Thread(target=doit, args=(bing_cached_url,), daemon=True).start())
     while threads:
         x = q.get()
         if x is not None:
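get_cached_url is thus left with the Wayback Machine as its only backend. For
illustration, a hypothetical standalone lookup against the public Wayback
availability API; this is not calibre's wayback_machine_cached_url, just a
sketch of the same idea:

import json
from urllib.parse import quote
from urllib.request import urlopen

def wayback_cached_url(url, timeout=60):
    # Ask the Wayback Machine for its closest archived snapshot of url.
    q = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
    with urlopen(q, timeout=timeout) as f:
        data = json.loads(f.read())
    closest = data.get('archived_snapshots', {}).get('closest') or {}
    return closest.get('url')  # None when nothing is archived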