Use wayback for nytimes pages

This commit is contained in:
Kovid Goyal 2022-09-11 21:30:27 +05:30
parent 1b53ec7fdd
commit 692fa6d4fc
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 31 additions and 0 deletions

View File

@ -28,6 +28,19 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
no_javascript = True
ignore_duplicate_articles = {'title', 'url'}
encoding = 'utf-8'
articles_are_obfuscated = True
delay = 1
def get_obfuscated_article(self, url):
    """Download *url* and return the path of a temporary HTML file holding it.

    The nytimes site parser module is loaded on first use and kept on
    ``self.nyt_parser`` so later calls reuse it. The download itself is
    delegated to that module's ``download_url()``, which is handed this
    recipe's ``cloned_browser``.
    """
    if hasattr(self, 'nyt_parser'):
        parser = self.nyt_parser
    else:
        from calibre.live import load_module
        parser = self.nyt_parser = load_module('calibre.web.site_parsers.nytimes')
    html = parser.download_url(url, self.cloned_browser)
    from calibre.ptempfile import PersistentTemporaryFile
    with PersistentTemporaryFile(suffix='.html') as tmp_file:
        tmp_file.write(html)
        return tmp_file.name
def preprocess_raw_html(self, raw_html, url):
if not hasattr(self, 'nyt_parser'):

View File

@ -9,6 +9,9 @@ from xml.sax.saxutils import escape, quoteattr
from calibre.utils.iso8601 import parse_iso8601
module_version = 1 # needed for live updates
def is_heading(tn):
    """Return True when *tn* is one of the four heading block type names."""
    heading_block_names = (
        'Heading1Block',
        'Heading2Block',
        'Heading3Block',
        'Heading4Block',
    )
    return tn in heading_block_names
@ -116,3 +119,18 @@ def extract_html(soup):
script = type(u'')(script)
raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
return json_to_html(raw)
def download_url(url, br):
    """Fetch the wayback machine copy of *url* and return it as bytes.

    *br* is the browser object used to look up the cached URL and, when the
    search engines module has no data for that URL, to download it directly.
    """
    # NYT has put its article pages behind captcha protection, so they are
    # read from the wayback machine instead. The wayback machine is flaky
    # though, so it is unclear how well this will hold up under load
    from calibre.ebooks.metadata.sources.update import search_engines_module
    engines = search_engines_module()
    cached_url = engines.wayback_machine_cached_url(url, br)
    data = engines.get_data_for_cached_url(cached_url)
    if data is None:
        # Nothing available for the cached URL, so fetch it with the browser
        data = br.open_novisit(cached_url).read()
    return data if isinstance(data, bytes) else data.encode('utf-8')