From 26c10fd4d8cae80fdbb01cc66b43d39a95f7592b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 30 May 2025 17:12:07 +0530
Subject: [PATCH] Cleanup previous PR

---
 recipes/nytimes.recipe                  | 15 ++-------------
 recipes/nytimes_sub.recipe              | 17 ++---------------
 recipes/nytimesbook.recipe              |  2 +-
 src/calibre/web/site_parsers/nytimes.py | 15 +++++++++++++--
 4 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index 2550a51c7e..bb87ca3fa1 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -82,17 +82,6 @@ def new_tag(soup, name, attrs=()):
     return Tag(soup, name, attrs=attrs or None)
 
 
-def clean_js_json(text):
-    text = text.replace('undefined', 'null')
-    text = re.sub(
-        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}', 
-        '',
-        text,
-        flags=re.DOTALL
-    )
-    return text
-
-
 class NewYorkTimes(BasicNewsRecipe):
     if is_web_edition:
         title = 'The New York Times (Web)'
@@ -138,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
-        cleaned = clean_js_json(raw_html)
+        cleaned = self.nyt_parser.clean_js_json(raw_html)
         return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 
     articles_are_obfuscated = use_wayback_machine
@@ -225,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
         script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
         script = type(u'')(script)
         raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        clean_json = clean_js_json(raw_json)
+        clean_json = self.nyt_parser.clean_js_json(raw_json)
         self.nytimes_graphql_config = json.loads(clean_json)['config']
         return soup
 
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 0f7e52b9ea..c4b1adad3c 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -82,19 +82,6 @@ def new_tag(soup, name, attrs=()):
     return Tag(soup, name, attrs=attrs or None)
 
 
-def clean_js_json(text):
-    text = text.replace('undefined', 'null')
-
-    # drop any JS function definitions
-    text = re.sub(
-        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}', 
-        '',
-        text,
-        flags=re.DOTALL
-    )
-    return text
-
-
 class NewYorkTimes(BasicNewsRecipe):
     if is_web_edition:
         title = 'The New York Times (Web)'
@@ -140,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
-        cleaned = clean_js_json(raw_html)
+        cleaned = self.nyt_parser.clean_js_json(raw_html)
         return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
 
     articles_are_obfuscated = use_wayback_machine
@@ -227,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
         script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
         script = type(u'')(script)
         raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        clean_json = clean_js_json(raw_json)
+        clean_json = self.nyt_parser.clean_js_json(raw_json)
         self.nytimes_graphql_config = json.loads(clean_json)['config']
         return soup
 
diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe
index 932815d794..f35ddcda2a 100644
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@@ -99,7 +99,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
         script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
         script = type(u'')(script)
         json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
+        self.nytimes_graphql_config = json.loads(self.nyt_parser.clean_js_json(json_data))['config']
         return soup
 
     def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
index 3b1a951fa2..32724b2f0c 100644
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
 
 from calibre.utils.iso8601 import parse_iso8601
 
-module_version = 11  # needed for live updates
+module_version = 12  # needed for live updates
 pprint
 
 
@@ -195,8 +195,19 @@ def article_parse(data):
     yield '</body></html>'
 
 
+def clean_js_json(text):
+    '''Return text with bare JS ``undefined`` values turned into ``null`` and
+    ``"key": function(...) {...}`` members removed, for use with json.loads().'''
+    # Match undefined only in value position (after : [ or ,) so that the
+    # word inside article prose such as "undefined behavior" is left alone.
+    text = re.sub(r'([:\[,]\s*)undefined(?=\s*[,\]}])', r'\1null', text)
+    # The body match is non-greedy: it ends at the first closing brace.
+    pat = r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}'
+    return re.sub(pat, '', text, flags=re.DOTALL)
+
+
 def json_to_html(raw):
-    data = json.loads(raw.replace(':undefined', ':null'))
+    data = json.loads(clean_js_json(raw))
     # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
     try:
         data = data['initialData']['data']