Use wayback for nytimes pages

2025-07-09 03:04:10 -04:00 · 2022-09-11 21:30:27 +05:30 · 2022-09-11 21:30:27 +05:30 · 692fa6d4fc
commit 692fa6d4fc
parent 1b53ec7fdd
2 changed files with 31 additions and 0 deletions
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@@ -28,6 +28,19 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
    no_javascript = True
    ignore_duplicate_articles = {'title', 'url'}
    encoding = 'utf-8'
+    articles_are_obfuscated = True
+    delay = 1
+
+    def get_obfuscated_article(self, url):
+        '''Download url via the NYT parser; return the path of a temp .html copy.'''
+        if not hasattr(self, 'nyt_parser'):  # load the parser module on first use
+            from calibre.live import load_module
+            self.nyt_parser = load_module('calibre.web.site_parsers.nytimes')
+        from calibre.ptempfile import PersistentTemporaryFile
+        page = self.nyt_parser.download_url(url, self.cloned_browser)
+        with PersistentTemporaryFile(suffix='.html') as tmp:
+            tmp.write(page)
+        return tmp.name

    def preprocess_raw_html(self, raw_html, url):
        if not hasattr(self, 'nyt_parser'):
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -9,6 +9,9 @@ from xml.sax.saxutils import escape, quoteattr
 from calibre.utils.iso8601 import parse_iso8601


+module_version = 1  # needed for live updates
+
+
 def is_heading(tn):  # True when tn is one of the four Heading<N>Block names below
    return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')

@@ -116,3 +119,18 @@ def extract_html(soup):
    script = type(u'')(script)
    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
    return json_to_html(raw)
+
+
+def download_url(url, br):
+    """Return the Wayback Machine copy of ``url`` as bytes, fetched with ``br``.
+
+    NYT article pages sit behind captcha protection, hence the archive. The
+    Wayback Machine is flaky, so this may not hold up well under load.
+    """
+    from calibre.ebooks.metadata.sources.update import search_engines_module
+    engines = search_engines_module()
+    cached_url = engines.wayback_machine_cached_url(url, br)  # NOTE(review): can this be None? confirm
+    data = engines.get_data_for_cached_url(cached_url)
+    if data is None:  # nothing returned for the cached URL; fetch it with br
+        data = br.open_novisit(cached_url).read()
+    return data if isinstance(data, bytes) else data.encode('utf-8')