From 5c1f96269d7ee1c6dcfe92243cbb8bee720634bb Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 15 Sep 2025 17:05:38 +0530 Subject: [PATCH] Fix NYTimes Avoid double cleaning of JSON --- recipes/nytimes.recipe | 3 +-- recipes/nytimes_sub.recipe | 3 +-- src/calibre/web/site_parsers/nytimes.py | 9 +++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index 9cd65b207a..97491bca46 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe): return self.index_to_soup(url, raw=True) def preprocess_raw_html(self, raw_html, url): - cleaned = self.nyt_parser.clean_js_json(raw_html) - return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url) + return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url) articles_are_obfuscated = use_wayback_machine diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 1b953f4a26..05120ae68d 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe): return self.index_to_soup(url, raw=True) def preprocess_raw_html(self, raw_html, url): - cleaned = self.nyt_parser.clean_js_json(raw_html) - return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url) + return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url) articles_are_obfuscated = use_wayback_machine diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 870368f474..388f947196 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 16 # needed for live updates +module_version = 17 # needed for live updates pprint @@ -213,7 +213,8 @@ def clean_js_json(text): def json_to_html(raw): - data = json.loads(clean_js_json(raw)) + cleaned = clean_js_json(raw) + data = json.JSONDecoder(strict=False).raw_decode(cleaned)[0] # open('/t/raw.json', 'w').write(json.dumps(data, indent=2)) try: data = data['initialData']['data'] @@ -290,7 +291,7 @@ def extract_html(soup, url): 'This is an interactive article, which is supposed to be read in a browser.' '

' ) - candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x) + candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x) if not candidates: if soup.find('script', src='https://ct.captcha-delivery.com/c.js'): raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com') @@ -333,6 +334,6 @@ if __name__ == '__main__': from calibre.ebooks.BeautifulSoup import BeautifulSoup soup = BeautifulSoup(raw) - print(extract_html(soup)) + print(extract_html(soup, 'moose')) else: print(json_to_html(raw))