From 26c10fd4d8cae80fdbb01cc66b43d39a95f7592b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 30 May 2025 17:12:07 +0530 Subject: [PATCH] Cleanup previous PR --- recipes/nytimes.recipe | 15 ++------------- recipes/nytimes_sub.recipe | 17 ++--------------- recipes/nytimesbook.recipe | 2 +- src/calibre/web/site_parsers/nytimes.py | 15 +++++++++++++-- 4 files changed, 18 insertions(+), 31 deletions(-) diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index 2550a51c7e..bb87ca3fa1 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -82,17 +82,6 @@ def new_tag(soup, name, attrs=()): return Tag(soup, name, attrs=attrs or None) -def clean_js_json(text): - text = text.replace('undefined', 'null') - text = re.sub( - r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}', - '', - text, - flags=re.DOTALL - ) - return text - - class NewYorkTimes(BasicNewsRecipe): if is_web_edition: title = 'The New York Times (Web)' @@ -138,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe): return self.index_to_soup(url, raw=True) def preprocess_raw_html(self, raw_html, url): - cleaned = clean_js_json(raw_html) + cleaned = self.nyt_parser.clean_js_json(raw_html) return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url) articles_are_obfuscated = use_wayback_machine @@ -225,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe): script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = type(u'')(script) raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }} - clean_json = clean_js_json(raw_json) + clean_json = self.nyt_parser.clean_js_json(raw_json) self.nytimes_graphql_config = json.loads(clean_json)['config'] return soup diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 0f7e52b9ea..c4b1adad3c 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -82,19 +82,6 @@ def new_tag(soup, name, attrs=()): return Tag(soup, name, attrs=attrs or None) -def clean_js_json(text): - text = text.replace('undefined', 'null') - - # drop any JS function definitions - text = re.sub( - r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}', - '', - text, - flags=re.DOTALL - ) - return text - - class NewYorkTimes(BasicNewsRecipe): if is_web_edition: title = 'The New York Times (Web)' @@ -140,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe): return self.index_to_soup(url, raw=True) def preprocess_raw_html(self, raw_html, url): - cleaned = clean_js_json(raw_html) + cleaned = self.nyt_parser.clean_js_json(raw_html) return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url) articles_are_obfuscated = use_wayback_machine @@ -227,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe): script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = type(u'')(script) raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }} - clean_json = clean_js_json(raw_json) + clean_json = self.nyt_parser.clean_js_json(raw_json) self.nytimes_graphql_config = json.loads(clean_json)['config'] return soup diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe index 932815d794..f35ddcda2a 100644 --- a/recipes/nytimesbook.recipe +++ b/recipes/nytimesbook.recipe @@ -99,7 +99,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe): script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = type(u'')(script) json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }} - self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config'] + self.nytimes_graphql_config = json.loads(self.nyt_parser.clean_js_json(json_data))['config'] return soup def nyt_graphql_query(self, qid, operationName='CollectionsQuery'): diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index 3b1a951fa2..32724b2f0c 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 -module_version = 11 # needed for live updates +module_version = 12 # needed for live updates pprint @@ -195,8 +195,19 @@ def article_parse(data): yield '' +def clean_js_json(text): + text = re.sub(r'\bundefined\b', 'null', text) + text = re.sub( + r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}', + '', + text, + flags=re.DOTALL + ) + return text + + def json_to_html(raw): - data = json.loads(raw.replace(':undefined', ':null')) + data = json.loads(clean_js_json(raw)) # open('/t/raw.json', 'w').write(json.dumps(data, indent=2)) try: data = data['initialData']['data']