mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Microsoft removed the Bing webcache
Amazon plugin no longer uses a webcache as Wayback machine doesnt cache amazon anymore and Google and Bing webcaches are no longer available. Presumably Amazon will start CAPTCHAing us soon as well at which point look into using a webengine scraper for it. Sigh.
This commit is contained in:
parent
62bd116cdb
commit
c79818240e
@ -1090,7 +1090,7 @@ class Worker(Thread): # Get details {{{
|
||||
class Amazon(Source):
|
||||
|
||||
name = 'Amazon.com'
|
||||
version = (1, 3, 11)
|
||||
version = (1, 3, 12)
|
||||
minimum_calibre_version = (2, 82, 0)
|
||||
description = _('Downloads metadata and covers from Amazon')
|
||||
|
||||
@ -1588,12 +1588,10 @@ class Amazon(Source):
|
||||
|
||||
purl = urlparse(result.url)
|
||||
if '/dp/' in purl.path and site in purl.netloc:
|
||||
url = result.cached_url
|
||||
if url is None:
|
||||
url = se.get_cached_url(result.url, br, timeout=timeout)
|
||||
if url is None:
|
||||
log('Failed to find cached page for:', result.url)
|
||||
continue
|
||||
# We cannot use cached URL as wayback machine no longer caches
|
||||
# amazon and Google and Bing web caches are no longer
|
||||
# accessible.
|
||||
url = result.url
|
||||
if url not in matches:
|
||||
matches.append(url)
|
||||
if len(matches) >= 3:
|
||||
@ -1778,6 +1776,14 @@ def manual_tests(domain, **kw): # {{{
|
||||
from calibre.ebooks.metadata.sources.test import authors_test, comments_test, isbn_test, series_test, test_identify_plugin, title_test
|
||||
all_tests = {}
|
||||
all_tests['com'] = [ # {{{
|
||||
( # # in title
|
||||
{'title': 'Expert C# 2008 Business Objects',
|
||||
'authors': ['Lhotka']},
|
||||
[title_test('Expert C#'),
|
||||
authors_test(['Rockford Lhotka'])
|
||||
]
|
||||
),
|
||||
|
||||
( # Paperback with series
|
||||
{'identifiers': {'amazon': '1423146786'}},
|
||||
[title_test('Heroes of Olympus', exact=False), series_test('The Heroes of Olympus', 5)]
|
||||
@ -1805,14 +1811,6 @@ def manual_tests(domain, **kw): # {{{
|
||||
]
|
||||
),
|
||||
|
||||
( # # in title
|
||||
{'title': 'Expert C# 2008 Business Objects',
|
||||
'authors': ['Lhotka']},
|
||||
[title_test('Expert C#'),
|
||||
authors_test(['Rockford Lhotka'])
|
||||
]
|
||||
),
|
||||
|
||||
( # New search results page markup (Dec 2024)
|
||||
{'title': 'Come si scrive un articolo medico-scientifico'},
|
||||
[title_test('Come si scrive un articolo medico-scientifico', exact=True)]
|
||||
|
@ -31,7 +31,7 @@ from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.lock import ExclusiveFile
|
||||
from calibre.utils.random_ua import accept_header_for_ua
|
||||
|
||||
current_version = (1, 2, 12)
|
||||
current_version = (1, 2, 13)
|
||||
minimum_calibre_version = (2, 80, 0)
|
||||
webcache = {}
|
||||
webcache_lock = Lock()
|
||||
@ -221,13 +221,6 @@ def bing_url_processor(url):
|
||||
return url
|
||||
|
||||
|
||||
def bing_cached_url(url, br=None, log=prints, timeout=60):
|
||||
# See https://support.microsoft.com/en-gb/topic/advanced-search-keywords-ea595928-5d63-4a0b-9c6b-0b769865e78a for operators
|
||||
results, search_url = bing_search(['url:' + url], br=br, log=log, timeout=timeout)
|
||||
for result in results:
|
||||
return result.cached_url
|
||||
|
||||
|
||||
def resolve_bing_wrapper_page(url, br, log):
|
||||
raw = br.open_novisit(url).read().decode('utf-8', 'replace')
|
||||
m = re.search(r'var u = "(.+)"', raw)
|
||||
@ -238,6 +231,9 @@ def resolve_bing_wrapper_page(url, br, log):
|
||||
return m.group(1)
|
||||
|
||||
|
||||
bing_scraper_storage = []
|
||||
|
||||
|
||||
def bing_search(
|
||||
terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60,
|
||||
show_user_agent=False, result_url_is_ok=lambda x: True
|
||||
@ -249,20 +245,8 @@ def bing_search(
|
||||
q = '+'.join(terms)
|
||||
url = 'https://www.bing.com/search?q={q}'.format(q=q)
|
||||
log('Making bing query: ' + url)
|
||||
if br is None:
|
||||
br = browser()
|
||||
else:
|
||||
br = br.clone_browser()
|
||||
br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
|
||||
ua = ''
|
||||
from calibre.utils.random_ua import random_common_chrome_user_agent
|
||||
while not ua:
|
||||
ua = random_common_chrome_user_agent()
|
||||
if show_user_agent:
|
||||
print('User-agent:', ua)
|
||||
br.addheaders.append(('User-agent', ua))
|
||||
|
||||
root = query(br, url, 'bing', dump_raw, timeout=timeout)
|
||||
from calibre.scraper.simple import read_url
|
||||
root = query(br, url, 'bing', dump_raw, timeout=timeout, simple_scraper=partial(read_url, bing_scraper_storage))
|
||||
ans = []
|
||||
result_items = root.xpath('//*[@id="b_results"]/li[@class="b_algo"]')
|
||||
if not result_items:
|
||||
@ -272,19 +256,11 @@ def bing_search(
|
||||
a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class="b_algoheader"]/a[@href]')
|
||||
a = a[0]
|
||||
title = tostring(a)
|
||||
try:
|
||||
div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
|
||||
except IndexError:
|
||||
log('Ignoring {!r} as it has no cached page'.format(title))
|
||||
continue
|
||||
d, w = div.get('u').split('|')[-2:]
|
||||
cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
|
||||
q=q, d=d, w=w)
|
||||
ans_url = a.get('href')
|
||||
if ans_url.startswith('https://www.bing.com/'):
|
||||
ans_url = resolve_bing_wrapper_page(ans_url, br, log)
|
||||
if result_url_is_ok(ans_url):
|
||||
ans.append(Result(ans_url, title, cached_url))
|
||||
ans.append(Result(ans_url, title, None))
|
||||
if not ans:
|
||||
title = ' '.join(root.xpath('//title/text()'))
|
||||
log('Failed to find any results on results page, with title:', title)
|
||||
@ -469,7 +445,6 @@ def get_cached_url(url, br=None, log=prints, timeout=60):
|
||||
|
||||
threads = []
|
||||
threads.append(Thread(target=doit, args=(wayback_machine_cached_url,), daemon=True).start())
|
||||
threads.append(Thread(target=doit, args=(bing_cached_url,), daemon=True).start())
|
||||
while threads:
|
||||
x = q.get()
|
||||
if x is not None:
|
||||
|
Loading…
x
Reference in New Issue
Block a user