From 5c1f96269d7ee1c6dcfe92243cbb8bee720634bb Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 15 Sep 2025 17:05:38 +0530
Subject: [PATCH] Fix NYTimes

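Avoid double cleaning of JSON: the recipes no longer run
clean_js_json() over the raw HTML in preprocess_raw_html(), since
json_to_html() already calls clean_js_json() on its input.

Also:
- decode with json.JSONDecoder(strict=False).raw_decode(), which
  permits control characters inside strings and ignores any data
  trailing the JSON value
- use find_all(string=...) in place of findAll(text=...)
- pass a URL to extract_html() in the __main__ test harness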
---
 recipes/nytimes.recipe                  | 3 +--
 recipes/nytimes_sub.recipe              | 3 +--
 src/calibre/web/site_parsers/nytimes.py | 9 +++++----
 3 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index 9cd65b207a..97491bca46 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
-        cleaned = self.nyt_parser.clean_js_json(raw_html)
-        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
+        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)
 
     articles_are_obfuscated = use_wayback_machine
 
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 1b953f4a26..05120ae68d 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
-        cleaned = self.nyt_parser.clean_js_json(raw_html)
-        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
+        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)
 
     articles_are_obfuscated = use_wayback_machine
 
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
index 870368f474..388f947196 100644
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
 
 from calibre.utils.iso8601 import parse_iso8601
 
-module_version = 16  # needed for live updates
+module_version = 17  # needed for live updates
 pprint
 
 
@@ -213,7 +213,8 @@ def clean_js_json(text):
 
 
 def json_to_html(raw):
-    data = json.loads(clean_js_json(raw))
+    cleaned = clean_js_json(raw)
+    data = json.JSONDecoder(strict=False).raw_decode(cleaned)[0]
     # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
     try:
         data = data['initialData']['data']
@@ -290,7 +291,7 @@ def extract_html(soup, url):
             'This is an interactive article, which is supposed to be read in a browser.'
             '</p></em></body></html>'
         )
-    candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)
+    candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
     if not candidates:
         if soup.find('script', src='https://ct.captcha-delivery.com/c.js'):
             raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com')
@@ -333,6 +334,6 @@ if __name__ == '__main__':
         from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
         soup = BeautifulSoup(raw)
-        print(extract_html(soup))
+        print(extract_html(soup, 'moose'))
     else:
         print(json_to_html(raw))