Use wayback for nytimes pages

2025-07-09 03:04:10 -04:00 · 2022-09-11 21:30:27 +05:30 · 2022-09-11 21:30:27 +05:30 · 692fa6d4fc
commit 692fa6d4fc
parent 1b53ec7fdd
2 changed files with 31 additions and 0 deletions
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@ -28,6 +28,19 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
    no_javascript = True
    ignore_duplicate_articles = {'title', 'url'}
    encoding = 'utf-8'
    articles_are_obfuscated = True
    delay = 1
    def get_obfuscated_article(self, url):
        """Download the article at ``url`` and save it to a temporary file.

        The download is delegated to the ``download_url`` function of the
        ``calibre.web.site_parsers.nytimes`` module, which is loaded on first
        use and then kept on the recipe instance as ``self.nyt_parser``.

        Returns the ``name`` of a persistent temporary ``.html`` file that
        contains the downloaded data.
        """
        try:
            parser = self.nyt_parser
        except AttributeError:
            # First call on this instance: load the parser module once and
            # remember it for subsequent articles.
            from calibre.live import load_module
            parser = self.nyt_parser = load_module('calibre.web.site_parsers.nytimes')
        article_data = parser.download_url(url, self.cloned_browser)
        from calibre.ptempfile import PersistentTemporaryFile
        with PersistentTemporaryFile(suffix='.html') as tmp:
            tmp.write(article_data)
        # The file object is closed on leaving the with block; only its name
        # is handed back to the caller.
        return tmp.name
    def preprocess_raw_html(self, raw_html, url):
        if not hasattr(self, 'nyt_parser'):
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@ -9,6 +9,9 @@ from xml.sax.saxutils import escape, quoteattr
 from calibre.utils.iso8601 import parse_iso8601
 module_version = 1  # needed for live updates
 def is_heading(tn):
    """Return True if ``tn`` is one of the four heading block type names."""
    heading_block_names = (
        'Heading1Block',
        'Heading2Block',
        'Heading3Block',
        'Heading4Block',
    )
    return tn in heading_block_names
@ -116,3 +119,18 @@ def extract_html(soup):
    script = type(u'')(script)
    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
    return json_to_html(raw)
 def download_url(url, br):
    """Return the data for ``url`` as bytes, fetched via the Wayback Machine.

    ``url`` is the original article URL and ``br`` is the browser object used
    for the network requests. The result is always a bytes object; text data
    is encoded as UTF-8.
    """
    # NYT has put captcha protection on its article pages, so the pages are
    # fetched from the Wayback Machine instead. The Wayback Machine is flaky,
    # so it is unclear how well this will hold up under load.
    from calibre.ebooks.metadata.sources.update import search_engines_module
    engines = search_engines_module()
    cached_url = engines.wayback_machine_cached_url(url, br)
    # NOTE(review): if no archived copy exists, cached_url may not be a usable
    # URL -- confirm what wayback_machine_cached_url returns in that case.
    data = engines.get_data_for_cached_url(cached_url)
    if data is None:
        # The search engines module had no data for this URL, so fetch it
        # directly with the supplied browser.
        data = br.open_novisit(cached_url).read()
    return data if isinstance(data, bytes) else data.encode('utf-8')