Detect nytimes captcha pages

2025-07-09 03:04:10 -04:00 · 2025-06-14 20:42:23 +05:30 · 2025-06-14 20:42:23 +05:30 · 9fba8f7bae
commit 9fba8f7bae
parent 4be0c12647
1 changed file with 7 additions and 2 deletions
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr

 from calibre.utils.iso8601 import parse_iso8601

-module_version = 12  # needed for live updates
+module_version = 13  # needed for live updates
 pprint


@@ -284,7 +284,12 @@ def extract_html(soup, url):
            'This is an interactive article, which is supposed to be read in a browser.'
            '</p></em></body></html>'
        )
-    script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+    candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)
+    if not candidates:
+        if soup.find('script', src='https://ct.captcha-delivery.com/c.js'):
+            raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com')
+        raise ValueError('NYTimes returned HTML without preloaded data')
+    script = candidates[0]
    script = str(script)
    raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')
    return json_to_html(raw)