Cleanup previous PR

This commit is contained in:
Kovid Goyal 2025-05-30 17:12:07 +05:30
parent 4fd2a30f78
commit 26c10fd4d8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 18 additions and 31 deletions

View File

@ -82,17 +82,6 @@ def new_tag(soup, name, attrs=()):
return Tag(soup, name, attrs=attrs or None) return Tag(soup, name, attrs=attrs or None)
def clean_js_json(text):
    """Rewrite JavaScript-flavoured JSON text so a JSON parser can accept it.

    Two JavaScript-only constructs are handled:

    * ``undefined`` used as a value is replaced with ``null``.
    * ``"key": function (...) {...}`` members are removed.

    :param text: text containing a JavaScript object literal
    :return: the text with the above constructs rewritten
    """
    # Replace ``undefined`` only where it sits in a value position (right
    # after ``:``, ``[`` or ``,`` and right before ``,``, ``}`` or ``]``).
    # A blanket substring replace would also corrupt ordinary text that
    # merely contains the word, e.g. inside a quoted string. The trailing
    # delimiter is a lookahead so that adjacent values such as
    # ``[undefined,undefined]`` are all matched.
    text = re.sub(r'([:\[,]\s*)undefined(?=\s*[,}\]])', r'\1null', text)
    # Drop function-valued members. The body match is non-greedy, so it
    # ends at the first ``}`` after the opening brace.
    text = re.sub(
        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}',
        '',
        text,
        flags=re.DOTALL
    )
    return text
class NewYorkTimes(BasicNewsRecipe): class NewYorkTimes(BasicNewsRecipe):
if is_web_edition: if is_web_edition:
title = 'The New York Times (Web)' title = 'The New York Times (Web)'
@ -138,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
return self.index_to_soup(url, raw=True) return self.index_to_soup(url, raw=True)
def preprocess_raw_html(self, raw_html, url): def preprocess_raw_html(self, raw_html, url):
cleaned = clean_js_json(raw_html) cleaned = self.nyt_parser.clean_js_json(raw_html)
return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url) return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
articles_are_obfuscated = use_wayback_machine articles_are_obfuscated = use_wayback_machine
@ -225,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
script = type(u'')(script) script = type(u'')(script)
raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }} raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
clean_json = clean_js_json(raw_json) clean_json = self.nyt_parser.clean_js_json(raw_json)
self.nytimes_graphql_config = json.loads(clean_json)['config'] self.nytimes_graphql_config = json.loads(clean_json)['config']
return soup return soup

View File

@ -82,19 +82,6 @@ def new_tag(soup, name, attrs=()):
return Tag(soup, name, attrs=attrs or None) return Tag(soup, name, attrs=attrs or None)
def clean_js_json(text):
    """Rewrite JavaScript-flavoured JSON text so a JSON parser can accept it.

    ``undefined`` values become ``null`` and function-valued members
    (``"key": function (...) {...}``) are removed.

    :param text: text containing a JavaScript object literal
    :return: the rewritten text
    """
    # Only touch ``undefined`` when it is a value: preceded by ``:``, ``[``
    # or ``,`` and followed by ``,``, ``}`` or ``]``. Replacing every
    # occurrence of the substring would also alter quoted text and longer
    # words that happen to contain it. The closing delimiter is matched
    # with a lookahead so consecutive values are each replaced.
    text = re.sub(r'([:\[,]\s*)undefined(?=\s*[,}\]])', r'\1null', text)
    # drop any JS function definitions (non-greedy: the match stops at the
    # first ``}`` following the opening brace)
    text = re.sub(
        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}',
        '',
        text,
        flags=re.DOTALL
    )
    return text
class NewYorkTimes(BasicNewsRecipe): class NewYorkTimes(BasicNewsRecipe):
if is_web_edition: if is_web_edition:
title = 'The New York Times (Web)' title = 'The New York Times (Web)'
@ -140,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
return self.index_to_soup(url, raw=True) return self.index_to_soup(url, raw=True)
def preprocess_raw_html(self, raw_html, url): def preprocess_raw_html(self, raw_html, url):
cleaned = clean_js_json(raw_html) cleaned = self.nyt_parser.clean_js_json(raw_html)
return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url) return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
articles_are_obfuscated = use_wayback_machine articles_are_obfuscated = use_wayback_machine
@ -227,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
script = type(u'')(script) script = type(u'')(script)
raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }} raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
clean_json = clean_js_json(raw_json) clean_json = self.nyt_parser.clean_js_json(raw_json)
self.nytimes_graphql_config = json.loads(clean_json)['config'] self.nytimes_graphql_config = json.loads(clean_json)['config']
return soup return soup

View File

@ -99,7 +99,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
script = type(u'')(script) script = type(u'')(script)
json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }} json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config'] self.nytimes_graphql_config = json.loads(self.nyt_parser.clean_js_json(json_data))['config']
return soup return soup
def nyt_graphql_query(self, qid, operationName='CollectionsQuery'): def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):

View File

@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
from calibre.utils.iso8601 import parse_iso8601 from calibre.utils.iso8601 import parse_iso8601
module_version = 11 # needed for live updates module_version = 12 # needed for live updates
pprint pprint
@ -195,8 +195,19 @@ def article_parse(data):
yield '</body></html>' yield '</body></html>'
def clean_js_json(text):
    """Rewrite JavaScript-flavoured JSON text so ``json.loads`` can parse it.

    Two JavaScript-only constructs are handled:

    * ``undefined`` used as a value is replaced with ``null``.
    * ``"key": function (...) {...}`` members are removed.

    :param text: text containing a JavaScript object literal
    :return: the text with the above constructs rewritten
    """
    # Replace ``undefined`` only in value position (after ``:``, ``[`` or
    # ``,`` and before ``,``, ``}`` or ``]``). Matching on word boundaries
    # alone would also rewrite the word inside string values, e.g.
    # "this is undefined" would become "this is null". The closing
    # delimiter is a lookahead so that ``[undefined,undefined]`` has both
    # entries replaced.
    text = re.sub(r'([:\[,]\s*)undefined(?=\s*[,}\]])', r'\1null', text)
    # Drop function-valued members. The body match is non-greedy, so it
    # ends at the first ``}`` after the opening brace.
    text = re.sub(
        r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}',
        '',
        text,
        flags=re.DOTALL
    )
    return text
def json_to_html(raw): def json_to_html(raw):
data = json.loads(raw.replace(':undefined', ':null')) data = json.loads(clean_js_json(raw))
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2)) # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
try: try:
data = data['initialData']['data'] data = data['initialData']['data']