From 9fba8f7bae01db12b0e60061dde2e21bee4207ae Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 14 Jun 2025 20:42:23 +0530 Subject: [PATCH] Detect nytimes captcha pages --- src/calibre/web/site_parsers/nytimes.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 32724b2f0c..c4321c29ef 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 12 # needed for live updates +module_version = 13 # needed for live updates pprint @@ -284,7 +284,12 @@ def extract_html(soup, url): 'This is an interactive article, which is supposed to be read in a browser.' '

' ) - script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] + candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x) + if not candidates: + if soup.find('script', src='https://ct.captcha-delivery.com/c.js'): + raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com') + raise ValueError('NYTimes returned HTML without preloaded data') + script = candidates[0] script = str(script) raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';') return json_to_html(raw)