From 07ac145a538298476f64daf818f236cf1a943efb Mon Sep 17 00:00:00 2001
From: ByteTheBubble
Date: Fri, 30 May 2025 13:16:37 +0200
Subject: [PATCH 1/2] Update nytimes.recipe to fix JSON parsing error

Fix JSON parsing error by replacing `undefined` with `null` and stripping JS functions
---
Review note: clean_js_json rewrites `undefined` only where it stands as a
bare value and removes function members together with one adjacent comma.
Hunk offsets and the diffstat are recomputed for this revision; the blob
ids on the index line are those of the original submission.

 recipes/nytimes.recipe | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index bb16ad34a2..2550a51c7e 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -82,6 +82,24 @@ def new_tag(soup, name, attrs=()):
     return Tag(soup, name, attrs=attrs or None)
 
 
+def clean_js_json(text):
+    # Make the JS object literal embedded in the page parseable by json.loads.
+    # Rewrite `undefined` only where it stands as a bare value (after `:`, `[`
+    # or `,` and before `,`, `}` or `]`) rather than every occurrence of the word.
+    text = re.sub(r'([:\[,]\s*)undefined(?=\s*[,}\]])', r'\1null', text)
+    # Drop members whose value is a JS function, taking exactly one adjacent
+    # comma with them so that a leading member does not leave `{,` behind.
+    # Assumes the function body contains no nested braces.
+    member = r'"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}'
+    text = re.sub(
+        r',\s*' + member + r'|' + member + r'\s*,?',
+        '',
+        text,
+        flags=re.DOTALL
+    )
+    return text
+
+
 class NewYorkTimes(BasicNewsRecipe):
     if is_web_edition:
         title = 'The New York Times (Web)'
@@ -127,7 +145,8 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
-        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)
+        cleaned = clean_js_json(raw_html)
+        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 
     articles_are_obfuscated = use_wayback_machine
 
@@ -212,8 +231,9 @@ class NewYorkTimes(BasicNewsRecipe):
         self.nytimes_publication_date = pdate
         script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
         script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
+        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
+        clean_json = clean_js_json(raw_json)
+        self.nytimes_graphql_config = json.loads(clean_json)['config']
         return soup
 
     def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
From d8c324e47be57422201ab43c5d7d457584eb104a Mon Sep 17 00:00:00 2001
From: ByteTheBubble
Date: Fri, 30 May 2025 13:19:44 +0200
Subject: [PATCH 2/2] Update nytimes_sub.recipe to fix the same json decoding error

Fix JSON parsing error by replacing `undefined` with `null` and stripping JS functions for nytimes_sub.recipe
---
Review note: clean_js_json is kept identical to the copy added to
nytimes.recipe in patch 1/2. Hunk offsets and the diffstat are recomputed
for this revision; the blob ids on the index line are those of the
original submission.

 recipes/nytimes_sub.recipe | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 82ba43a8a1..0f7e52b9ea 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -82,6 +82,24 @@ def new_tag(soup, name, attrs=()):
     return Tag(soup, name, attrs=attrs or None)
 
 
+def clean_js_json(text):
+    # Make the JS object literal embedded in the page parseable by json.loads.
+    # Rewrite `undefined` only where it stands as a bare value (after `:`, `[`
+    # or `,` and before `,`, `}` or `]`) rather than every occurrence of the word.
+    text = re.sub(r'([:\[,]\s*)undefined(?=\s*[,}\]])', r'\1null', text)
+    # Drop members whose value is a JS function, taking exactly one adjacent
+    # comma with them so that a leading member does not leave `{,` behind.
+    # Assumes the function body contains no nested braces.
+    member = r'"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}'
+    text = re.sub(
+        r',\s*' + member + r'|' + member + r'\s*,?',
+        '',
+        text,
+        flags=re.DOTALL
+    )
+    return text
+
+
 class NewYorkTimes(BasicNewsRecipe):
     if is_web_edition:
         title = 'The New York Times (Web)'
@@ -127,7 +145,8 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
-        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)
+        cleaned = clean_js_json(raw_html)
+        return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 
     articles_are_obfuscated = use_wayback_machine
 
@@ -212,8 +231,9 @@ class NewYorkTimes(BasicNewsRecipe):
         self.nytimes_publication_date = pdate
         script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
         script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
+        raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
+        clean_json = clean_js_json(raw_json)
+        self.nytimes_graphql_config = json.loads(clean_json)['config']
         return soup
 
     def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):