mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-02-23 11:40:06 -05:00
Fix NYTimes
Avoid double cleaning of JSON
This commit is contained in:
parent
bb605226e7
commit
5c1f96269d
@@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
||||
return self.index_to_soup(url, raw=True)
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
- cleaned = self.nyt_parser.clean_js_json(raw_html)
|
||||
- return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
|
||||
+ return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)
|
||||
|
||||
articles_are_obfuscated = use_wayback_machine
|
||||
|
||||
|
||||
@@ -128,8 +128,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
||||
return self.index_to_soup(url, raw=True)
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
- cleaned = self.nyt_parser.clean_js_json(raw_html)
|
||||
- return self.nyt_parser.extract_html(self.index_to_soup(cleaned), url)
|
||||
+ return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)
|
||||
|
||||
articles_are_obfuscated = use_wayback_machine
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
|
||||
|
||||
from calibre.utils.iso8601 import parse_iso8601
|
||||
|
||||
- module_version = 16 # needed for live updates
|
||||
+ module_version = 17 # needed for live updates
|
||||
pprint
|
||||
|
||||
|
||||
@@ -213,7 +213,8 @@ def clean_js_json(text):
|
||||
|
||||
|
||||
def json_to_html(raw):
|
||||
- data = json.loads(clean_js_json(raw))
|
||||
+ cleaned = clean_js_json(raw)
|
||||
+ data = json.JSONDecoder(strict=False).raw_decode(cleaned)[0]
|
||||
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
|
||||
try:
|
||||
data = data['initialData']['data']
|
||||
@@ -290,7 +291,7 @@ def extract_html(soup, url):
|
||||
'This is an interactive article, which is supposed to be read in a browser.'
|
||||
'</p></em></body></html>'
|
||||
)
|
||||
- candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)
|
||||
+ candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
|
||||
if not candidates:
|
||||
if soup.find('script', src='https://ct.captcha-delivery.com/c.js'):
|
||||
raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com')
|
||||
@@ -333,6 +334,6 @@ if __name__ == '__main__':
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(raw)
|
||||
- print(extract_html(soup))
|
||||
+ print(extract_html(soup, 'moose'))
|
||||
else:
|
||||
print(json_to_html(raw))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user