From 9fba8f7bae01db12b0e60061dde2e21bee4207ae Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 14 Jun 2025 20:42:23 +0530
Subject: [PATCH] Detect nytimes captcha pages

---
 src/calibre/web/site_parsers/nytimes.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
index 32724b2f0c..c4321c29ef 100644
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
 
 from calibre.utils.iso8601 import parse_iso8601
 
-module_version = 12  # needed for live updates
+module_version = 13  # needed for live updates
 pprint
 
 
@@ -284,7 +284,12 @@ def extract_html(soup, url):
             'This is an interactive article, which is supposed to be read in a browser.'
             '</p></em></body></html>'
         )
-    script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+    candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)
+    if not candidates:
+        if soup.find('script', src='https://ct.captcha-delivery.com/c.js'):
+            raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com')
+        raise ValueError('NYTimes returned HTML without preloaded data')
+    script = candidates[0]
     script = str(script)
     raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')
     return json_to_html(raw)