Update NYTimes

2025-08-11 09:13:57 -04:00 · 2022-09-15 12:24:41 +05:30 · 2022-09-15 12:24:41 +05:30 · 2367d3464c
commit 2367d3464c
parent b37186d3a1
4 changed files with 86 additions and 76 deletions
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@ -13,8 +13,9 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
-is_web_edition = True
+is_web_edition = False
 oldest_web_edition_article = 7  # days
 use_wayback_machine = False
 # The sections to download when downloading the web edition, comment out
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
    remove_attributes = ['style']
    conversion_options = {'flow_size': 0}
-    def preprocess_raw_html(self, raw_html, url):
+    @property
-        if not hasattr(self, 'nyt_parser'):
+    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
-            m = load_module('calibre.web.site_parsers.nytimes')
+            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
-            self.nyt_parser = m
+        return ans
    def get_nyt_page(self, url):
        """Return the raw content fetched for *url*.

        When the file-level ``use_wayback_machine`` flag is falsy the page is
        read through ``self.browser.open_novisit``. Otherwise the request is
        delegated to ``self.nyt_parser.download_url`` together with a newly
        created calibre browser.
        """
        if not use_wayback_machine:
            return self.browser.open_novisit(url).read()
        from calibre import browser
        # Look the downloader up first, then build the browser it will use.
        fetch = self.nyt_parser.download_url
        return fetch(url, browser())
    def preprocess_raw_html(self, raw_html, url):
        """Convert the downloaded *raw_html* via the NYT parser module.

        The raw markup is turned into a soup with ``self.index_to_soup`` and
        that soup is handed to ``self.nyt_parser.extract_html``; whatever
        that call returns is the result. *url* is accepted but not used here.
        """
        extract = self.nyt_parser.extract_html
        soup = self.index_to_soup(raw_html)
        return extract(soup)
    articles_are_obfuscated = use_wayback_machine
    # The method below is only defined on the class when the file-level
    # use_wayback_machine flag is truthy.
    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            """Save the page for *url* to a persistent temp file; return its path.

            The content comes from ``self.get_nyt_page(url)`` and is written
            into a calibre ``PersistentTemporaryFile``.
            """
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as out:
                page = self.get_nyt_page(url)
                out.write(page)
            # The name is read after the ``with`` block has exited.
            return out.name
    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
-        try:
+        return self.index_to_soup(self.get_nyt_page(INDEX))
            soup = self.index_to_soup(INDEX)
        except Exception as err:
            if getattr(err, 'code', None) == 404:
                try:
                    soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
                except Exception as err:
                    if getattr(err, 'code', None) == 404:
                        dt = datetime.datetime.today() - datetime.timedelta(days=1)
                        soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
                    else:
                        raise
            else:
                raise
        return soup
    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(self.get_nyt_page(url))
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@ -15,6 +15,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 is_web_edition = False
 oldest_web_edition_article = 7  # days
 use_wayback_machine = False
 # The sections to download when downloading the web edition, comment out
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
    remove_attributes = ['style']
    conversion_options = {'flow_size': 0}
-    def preprocess_raw_html(self, raw_html, url):
+    @property
-        if not hasattr(self, 'nyt_parser'):
+    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
-            m = load_module('calibre.web.site_parsers.nytimes')
+            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
-            self.nyt_parser = m
+        return ans
    def get_nyt_page(self, url):
        """Fetch *url* and return what the underlying reader yields.

        With the file-level ``use_wayback_machine`` flag set, the download is
        performed by ``self.nyt_parser.download_url`` using a fresh calibre
        browser; without it, ``self.browser.open_novisit(url).read()`` is
        used directly.
        """
        if not use_wayback_machine:
            response = self.browser.open_novisit(url)
            return response.read()
        from calibre import browser
        # Resolve the downloader before constructing the browser argument.
        downloader = self.nyt_parser.download_url
        return downloader(url, browser())
    def preprocess_raw_html(self, raw_html, url):
        """Run *raw_html* through the NYT parser and return its output.

        ``self.index_to_soup`` parses the raw markup and the resulting soup
        is passed to ``self.nyt_parser.extract_html``. The *url* argument is
        not consulted by this method.
        """
        to_html = self.nyt_parser.extract_html
        parsed = self.index_to_soup(raw_html)
        return to_html(parsed)
    articles_are_obfuscated = use_wayback_machine
    # Defined conditionally: the class only gains this method when the
    # file-level use_wayback_machine flag is truthy.
    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            """Write the page for *url* into a persistent temp file.

            Returns the file's name. The data written is whatever
            ``self.get_nyt_page(url)`` returns.
            """
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as tmp:
                data = self.get_nyt_page(url)
                tmp.write(data)
            # Returned once the ``with`` block has finished.
            return tmp.name
    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
-        try:
+        return self.index_to_soup(self.get_nyt_page(INDEX))
            soup = self.index_to_soup(INDEX)
        except Exception as err:
            if getattr(err, 'code', None) == 404:
                try:
                    soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
                except Exception as err:
                    if getattr(err, 'code', None) == 404:
                        dt = datetime.datetime.today() - datetime.timedelta(days=1)
                        soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
                    else:
                        raise
            else:
                raise
        return soup
    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(self.get_nyt_page(url))
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@ -2,14 +2,9 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
 from __future__ import unicode_literals
 from calibre.web.feeds.news import BasicNewsRecipe
-
+use_wayback_machine = False
 def classes(classes):
    """Build an ``attrs`` matcher for a space-separated list of class names.

    Returns ``{'attrs': {'class': predicate}}``. The predicate returns its
    argument unchanged when that argument is falsy (``None`` or ``''``);
    otherwise it returns the frozenset of whitespace-separated names in the
    argument that also appear in *classes*.
    """
    # NOTE(review): presumably unpacked into a soup find/findAll call - the
    # call sites are not visible here.
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}
 def absolutize(url):
@ -28,32 +23,38 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
    no_javascript = True
    ignore_duplicate_articles = {'title', 'url'}
    encoding = 'utf-8'
    articles_are_obfuscated = True
    delay = 1
-    def get_obfuscated_article(self, url):
+    articles_are_obfuscated = use_wayback_machine
-        if not hasattr(self, 'nyt_parser'):
+
    # This method exists on the class only when the file-level
    # use_wayback_machine flag is truthy.
    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            """Store the page for *url* in a persistent temp file.

            The bytes come from ``self.get_nyt_page(url)``; the return value
            is the temp file's name.
            """
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as handle:
                content = self.get_nyt_page(url)
                handle.write(content)
            # The path is handed back after the ``with`` block exits.
            return handle.name
    @property
    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
-            m = load_module('calibre.web.site_parsers.nytimes')
+            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
-            self.nyt_parser = m
+        return ans
-        raw = self.nyt_parser.download_url(url, self.cloned_browser)
+
-        from calibre.ptempfile import PersistentTemporaryFile
+    def get_nyt_page(self, url):
-        with PersistentTemporaryFile(suffix='.html') as pt:
+        if use_wayback_machine:
-            pt.write(raw)
+            from calibre import browser
-        return pt.name
+            return self.nyt_parser.download_url(url, browser())
        return self.browser.open_novisit(url).read()
    def preprocess_raw_html(self, raw_html, url):
        """Convert downloaded *raw_html* using the NYT parser module.

        The markup is parsed with ``self.index_to_soup`` and the soup is
        passed to ``self.nyt_parser.extract_html``; that call's result is
        returned. *url* is accepted but not used.
        """
        # ``nyt_parser`` is a getter-only property on this class that loads
        # the parser module on first access. The former ``hasattr`` guard
        # could never trigger, and its ``self.nyt_parser = m`` assignment
        # would raise AttributeError on a property without a setter, so the
        # property is used directly.
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html
    def parse_index(self):
        # return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
        soup = self.index_to_soup(
-            'https://www.nytimes.com/pages/books/review/index.html')
+            self.get_nyt_page('https://www.nytimes.com/pages/books/review/index.html'))
        # Find TOC
        toc = soup.find('section', id='collection-book-review').find('section').find('ol')
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@ -10,7 +10,7 @@ from pprint import pprint
 from calibre.utils.iso8601 import parse_iso8601
-module_version = 2  # needed for live updates
+module_version = 3  # needed for live updates
 pprint
@ -187,18 +187,15 @@ def extract_html(soup):
 def download_url(url, br):
-    # NYT has implemented captcha protection for its article pages, so get
+    # Get the URL from the Wayback machine
-    # them from the wayback machine instead. However, wayback machine is
+    from mechanize import Request
-    # flaky so god knows how well it will work under load
+    rq = Request(
-    from calibre.ebooks.metadata.sources.update import search_engines_module
+        'http://localhost:8090/nytimes',
-    m = search_engines_module()
+        data=json.dumps({"url": url}),
-    cu = m.wayback_machine_cached_url(url, br)
+        headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}
-    raw = m.get_data_for_cached_url(cu)
+    )
-    if raw is None:
+    br.set_handle_gzip(True)
-        raw = br.open_novisit(cu).read()
+    return br.open_novisit(rq, timeout=3 * 60).read()
    if not isinstance(raw, bytes):
        raw = raw.encode('utf-8')
    return raw
 if __name__ == '__main__':