mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Cleanup previous PR
This commit is contained in:
parent
4fd2a30f78
commit
26c10fd4d8
@ -82,17 +82,6 @@ def new_tag(soup, name, attrs=()):
|
|||||||
return Tag(soup, name, attrs=attrs or None)
|
return Tag(soup, name, attrs=attrs or None)
|
||||||
|
|
||||||
|
|
||||||
def clean_js_json(text):
|
|
||||||
text = text.replace('undefined', 'null')
|
|
||||||
text = re.sub(
|
|
||||||
r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}',
|
|
||||||
'',
|
|
||||||
text,
|
|
||||||
flags=re.DOTALL
|
|
||||||
)
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
class NewYorkTimes(BasicNewsRecipe):
|
class NewYorkTimes(BasicNewsRecipe):
|
||||||
if is_web_edition:
|
if is_web_edition:
|
||||||
title = 'The New York Times (Web)'
|
title = 'The New York Times (Web)'
|
||||||
@ -138,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
return self.index_to_soup(url, raw=True)
|
return self.index_to_soup(url, raw=True)
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
cleaned = clean_js_json(raw_html)
|
cleaned = self.nyt_parser.clean_js_json(raw_html)
|
||||||
return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
|
return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
|
||||||
|
|
||||||
articles_are_obfuscated = use_wayback_machine
|
articles_are_obfuscated = use_wayback_machine
|
||||||
@ -225,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
||||||
script = type(u'')(script)
|
script = type(u'')(script)
|
||||||
raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
|
raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
|
||||||
clean_json = clean_js_json(raw_json)
|
clean_json = self.nyt_parser.clean_js_json(raw_json)
|
||||||
self.nytimes_graphql_config = json.loads(clean_json)['config']
|
self.nytimes_graphql_config = json.loads(clean_json)['config']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
@ -82,19 +82,6 @@ def new_tag(soup, name, attrs=()):
|
|||||||
return Tag(soup, name, attrs=attrs or None)
|
return Tag(soup, name, attrs=attrs or None)
|
||||||
|
|
||||||
|
|
||||||
def clean_js_json(text):
|
|
||||||
text = text.replace('undefined', 'null')
|
|
||||||
|
|
||||||
# drop any JS function definitions
|
|
||||||
text = re.sub(
|
|
||||||
r',?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}',
|
|
||||||
'',
|
|
||||||
text,
|
|
||||||
flags=re.DOTALL
|
|
||||||
)
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
class NewYorkTimes(BasicNewsRecipe):
|
class NewYorkTimes(BasicNewsRecipe):
|
||||||
if is_web_edition:
|
if is_web_edition:
|
||||||
title = 'The New York Times (Web)'
|
title = 'The New York Times (Web)'
|
||||||
@ -140,7 +127,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
return self.index_to_soup(url, raw=True)
|
return self.index_to_soup(url, raw=True)
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
cleaned = clean_js_json(raw_html)
|
cleaned = self.nyt_parser.clean_js_json(raw_html)
|
||||||
return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
|
return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
|
||||||
|
|
||||||
articles_are_obfuscated = use_wayback_machine
|
articles_are_obfuscated = use_wayback_machine
|
||||||
@ -227,7 +214,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
||||||
script = type(u'')(script)
|
script = type(u'')(script)
|
||||||
raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
|
raw_json = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
|
||||||
clean_json = clean_js_json(raw_json)
|
clean_json = self.nyt_parser.clean_js_json(raw_json)
|
||||||
self.nytimes_graphql_config = json.loads(clean_json)['config']
|
self.nytimes_graphql_config = json.loads(clean_json)['config']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
@ -99,7 +99,7 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
|
|||||||
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
||||||
script = type(u'')(script)
|
script = type(u'')(script)
|
||||||
json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
|
json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';') # }}
|
||||||
self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
|
self.nytimes_graphql_config = json.loads(self.nyt_parser.clean_js_json(json_data))['config']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
|
def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
|
||||||
|
@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
|
|||||||
|
|
||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
|
|
||||||
module_version = 11 # needed for live updates
|
module_version = 12 # needed for live updates
|
||||||
pprint
|
pprint
|
||||||
|
|
||||||
|
|
||||||
@ -195,8 +195,19 @@ def article_parse(data):
|
|||||||
yield '</body></html>'
|
yield '</body></html>'
|
||||||
|
|
||||||
|
|
||||||
|
# Either a complete double-quoted string literal (backslash escapes honoured)
# or a bare `undefined` token. Strings are matched first so that their
# contents are consumed and skipped instead of being rewritten.
_STRING_OR_UNDEFINED = re.compile(r'"(?:\\.|[^"\\])*"|\bundefined\b', re.DOTALL)

# A `"key": function (...) {...}` property plus the commas on either side.
# NOTE: the body match is non-greedy and ends at the first `}`, so a function
# body that itself contains braces is only partially removed.
_FUNCTION_PROPERTY = re.compile(
    r'(?P<lead>,)?\s*"[^"]+"\s*:\s*function\s*\([^)]*\)\s*\{.*?\}(?P<trail>\s*,)?',
    re.DOTALL
)


def clean_js_json(text):
    '''
    Turn a JavaScript object literal into text that json.loads() can parse.

    Two JavaScript-only constructs are rewritten:

    * bare ``undefined`` values become ``null``. The word is left alone when
      it occurs inside a double-quoted string, so prose such as
      ``"an undefined term"`` is preserved.
    * properties whose value is a ``function (...) {...}`` expression are
      dropped, along with exactly one adjacent comma, so no dangling comma is
      left behind when the property is the first member of its object.

    String detection pairs up double quotes from the start of ``text``; text
    that has unbalanced quotes ahead of the JSON (for example a whole HTML
    page) can shift those boundaries.

    :param text: JavaScript/JSON source text
    :return: the cleaned text
    '''
    def replace_undefined(match):
        token = match.group()
        # String literals pass through untouched; only the bare token changes.
        return token if token.startswith('"') else 'null'

    def drop_function(match):
        # With members on both sides, one comma must survive to separate them.
        if match.group('lead') and match.group('trail'):
            return ','
        return ''

    text = _STRING_OR_UNDEFINED.sub(replace_undefined, text)
    return _FUNCTION_PROPERTY.sub(drop_function, text)
|
||||||
|
|
||||||
|
|
||||||
def json_to_html(raw):
|
def json_to_html(raw):
|
||||||
data = json.loads(raw.replace(':undefined', ':null'))
|
data = json.loads(clean_js_json(raw))
|
||||||
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
|
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
|
||||||
try:
|
try:
|
||||||
data = data['initialData']['data']
|
data = data['initialData']['data']
|
||||||
|
Loading…
x
Reference in New Issue
Block a user