mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Detect nytimes captcha pages
This commit is contained in:
parent
4be0c12647
commit
9fba8f7bae
@ -9,7 +9,7 @@ from xml.sax.saxutils import escape, quoteattr
|
|||||||
|
|
||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
|
|
||||||
module_version = 12 # needed for live updates
|
module_version = 13 # needed for live updates
|
||||||
pprint
|
pprint
|
||||||
|
|
||||||
|
|
||||||
@ -284,7 +284,12 @@ def extract_html(soup, url):
|
|||||||
'This is an interactive article, which is supposed to be read in a browser.'
|
'This is an interactive article, which is supposed to be read in a browser.'
|
||||||
'</p></em></body></html>'
|
'</p></em></body></html>'
|
||||||
)
|
)
|
||||||
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
candidates = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)
|
||||||
|
if not candidates:
|
||||||
|
if soup.find('script', src='https://ct.captcha-delivery.com/c.js'):
|
||||||
|
raise ValueError('NYTimes returned a CAPTCHA page from captcha-delivery.com')
|
||||||
|
raise ValueError('NYTimes returned HTML without preloaded data')
|
||||||
|
script = candidates[0]
|
||||||
script = str(script)
|
script = str(script)
|
||||||
raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')
|
raw = script[script.find('{') : script.rfind(';')].strip().rstrip(';')
|
||||||
return json_to_html(raw)
|
return json_to_html(raw)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user