From 692fa6d4fcc75cc5c0d82cb30f32e6b2896b49a9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 11 Sep 2022 21:30:27 +0530
Subject: [PATCH] Use wayback for nytimes pages

---
Review notes (this area is ignored by `git am`; not part of the commit message):

* recipes/nytimesbook.recipe: sets articles_are_obfuscated = True and
  delay = 1, and adds get_obfuscated_article(). That method loads the
  calibre.web.site_parsers.nytimes module through calibre.live.load_module
  on first use (cached on self.nyt_parser), calls its download_url() with
  self.cloned_browser, writes the returned data to a
  PersistentTemporaryFile with an '.html' suffix and returns the file name.
* src/calibre/web/site_parsers/nytimes.py: adds module_version and
  download_url(url, br). download_url() obtains a Wayback Machine cached
  URL from the search engines module, reads the data for that URL (falling
  back to br.open_novisit() when get_data_for_cached_url() returns None)
  and always returns bytes; non-bytes data is encoded as UTF-8.
* NOTE(review): this copy of the patch had lost its line structure. Line
  breaks, blank lines and indentation were reconstructed from the hunk
  header counts and Python syntax; confirm against the original commit
  before applying.

 recipes/nytimesbook.recipe              | 13 +++++++++++++
 src/calibre/web/site_parsers/nytimes.py | 18 ++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe
index 475e55b05b..dfb62ed986 100644
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@@ -28,6 +28,19 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
     no_javascript = True
     ignore_duplicate_articles = {'title', 'url'}
     encoding = 'utf-8'
+    articles_are_obfuscated = True
+    delay = 1
+
+    def get_obfuscated_article(self, url):
+        if not hasattr(self, 'nyt_parser'):
+            from calibre.live import load_module
+            m = load_module('calibre.web.site_parsers.nytimes')
+            self.nyt_parser = m
+        raw = self.nyt_parser.download_url(url, self.cloned_browser)
+        from calibre.ptempfile import PersistentTemporaryFile
+        with PersistentTemporaryFile(suffix='.html') as pt:
+            pt.write(raw)
+        return pt.name
 
     def preprocess_raw_html(self, raw_html, url):
         if not hasattr(self, 'nyt_parser'):
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
index 4e49724234..1e1e6b881c 100644
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -9,6 +9,9 @@ from xml.sax.saxutils import escape, quoteattr
 from calibre.utils.iso8601 import parse_iso8601
 
 
+module_version = 1  # needed for live updates
+
+
 def is_heading(tn):
     return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
 
@@ -116,3 +119,18 @@ def extract_html(soup):
     script = type(u'')(script)
     raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
     return json_to_html(raw)
+
+
+def download_url(url, br):
+    # NYT has implemented captcha protection for its article pages, so get
+    # them from the Wayback Machine instead. NOTE: the Wayback Machine is
+    # flaky, so it is unknown how well this will hold up under load.
+    from calibre.ebooks.metadata.sources.update import search_engines_module
+    m = search_engines_module()
+    cu = m.wayback_machine_cached_url(url, br)
+    raw = m.get_data_for_cached_url(cu)
+    if raw is None:
+        raw = br.open_novisit(cu).read()
+    if not isinstance(raw, bytes):
+        raw = raw.encode('utf-8')
+    return raw