Fix NYTimes

Avoid double cleaning of JSON
2026-03-03 15:40:02 -05:00 · 2025-09-15 17:05:38 +05:30 · 2025-09-15 17:05:38 +05:30 · 5c1f96269d
commit 5c1f96269d
parent bb605226e7
3 changed files with 7 additions and 8 deletions
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe):
        return self.index_to_soup(url, raw=True)

    def preprocess_raw_html(self, raw_html, url):
-        cleaned = self.nyt_parser.clean_js_json(raw_html)
-        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
+        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)

    articles_are_obfuscated = use_wayback_machine

--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe):
        return self.index_to_soup(url, raw=True)

    def preprocess_raw_html(self, raw_html, url):
-        cleaned = self.nyt_parser.clean_js_json(raw_html)
-        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
+        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)

    articles_are_obfuscated = use_wayback_machine

--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr

 from calibre.utils.iso8601 import parse_iso8601

-module_version = 16  # needed for live updates
+module_version = 17  # needed for live updates
 pprint


@@ -213,7 +213,8 @@ def clean_js_json(text):


 def json_to_html(raw):
-    data = json.loads(clean_js_json(raw))
+    cleaned = clean_js_json(raw)
+    data = json.JSONDecoder(strict=False).raw_decode(cleaned)[0]
    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
    try:
        data = data['initialData']['data']
@@ -290,7 +291,7 @@ def extract_html(soup, url):
            'This is an interactive article, which is supposed to be read in a browser.'
            '</p></em></body></html>'
        )
-    candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)
+    candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
    if not candidates:
        if soup.find('script', src='https://ct.captcha-delivery.com/c.js'):
            raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com')
@@ -333,6 +334,6 @@ if __name__ == '__main__':
        from calibre.ebooks.BeautifulSoup import BeautifulSoup

        soup = BeautifulSoup(raw)
-        print(extract_html(soup))
+        print(extract_html(soup, 'moose'))
    else:
        print(json_to_html(raw))