Update NYTimes

2025-08-11 09:13:57 -04:00 · 2022-09-15 12:24:41 +05:30 · 2022-09-15 12:24:41 +05:30 · 2367d3464c
commit 2367d3464c
parent b37186d3a1
4 changed files with 86 additions and 76 deletions
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@ -13,8 +13,9 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe

-is_web_edition = True
+is_web_edition = False
 oldest_web_edition_article = 7  # days
+use_wayback_machine = False


 # The sections to download when downloading the web edition, comment out
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
    remove_attributes = ['style']
    conversion_options = {'flow_size': 0}

-    def preprocess_raw_html(self, raw_html, url):
-        if not hasattr(self, 'nyt_parser'):
+    @property
+    def nyt_parser(self):
+        ans = getattr(self, '_nyt_parser', None)
+        if ans is None:
            from calibre.live import load_module
-            m = load_module('calibre.web.site_parsers.nytimes')
-            self.nyt_parser = m
+            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
+        return ans
+
+    def get_nyt_page(self, url):
+        if use_wayback_machine:
+            from calibre import browser
+            return self.nyt_parser.download_url(url, browser())
+        return self.browser.open_novisit(url).read()
+
+    def preprocess_raw_html(self, raw_html, url):
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

+    articles_are_obfuscated = use_wayback_machine
+
+    if use_wayback_machine:
+        def get_obfuscated_article(self, url):
+            from calibre.ptempfile import PersistentTemporaryFile
+            with PersistentTemporaryFile() as tf:
+                tf.write(self.get_nyt_page(url))
+            return tf.name
+
    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
-        try:
-            soup = self.index_to_soup(INDEX)
-        except Exception as err:
-            if getattr(err, 'code', None) == 404:
-                try:
-                    soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
-                except Exception as err:
-                    if getattr(err, 'code', None) == 404:
-                        dt = datetime.datetime.today() - datetime.timedelta(days=1)
-                        soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
-                    else:
-                        raise
-            else:
-                raise
-        return soup
+        return self.index_to_soup(self.get_nyt_page(INDEX))

    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(self.get_nyt_page(url))
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@ -15,6 +15,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 is_web_edition = False
 oldest_web_edition_article = 7  # days
+use_wayback_machine = False


 # The sections to download when downloading the web edition, comment out
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
    remove_attributes = ['style']
    conversion_options = {'flow_size': 0}

-    def preprocess_raw_html(self, raw_html, url):
-        if not hasattr(self, 'nyt_parser'):
+    @property
+    def nyt_parser(self):
+        ans = getattr(self, '_nyt_parser', None)
+        if ans is None:
            from calibre.live import load_module
-            m = load_module('calibre.web.site_parsers.nytimes')
-            self.nyt_parser = m
+            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
+        return ans
+
+    def get_nyt_page(self, url):
+        if use_wayback_machine:
+            from calibre import browser
+            return self.nyt_parser.download_url(url, browser())
+        return self.browser.open_novisit(url).read()
+
+    def preprocess_raw_html(self, raw_html, url):
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

+    articles_are_obfuscated = use_wayback_machine
+
+    if use_wayback_machine:
+        def get_obfuscated_article(self, url):
+            from calibre.ptempfile import PersistentTemporaryFile
+            with PersistentTemporaryFile() as tf:
+                tf.write(self.get_nyt_page(url))
+            return tf.name
+
    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
-        try:
-            soup = self.index_to_soup(INDEX)
-        except Exception as err:
-            if getattr(err, 'code', None) == 404:
-                try:
-                    soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
-                except Exception as err:
-                    if getattr(err, 'code', None) == 404:
-                        dt = datetime.datetime.today() - datetime.timedelta(days=1)
-                        soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
-                    else:
-                        raise
-            else:
-                raise
-        return soup
+        return self.index_to_soup(self.get_nyt_page(INDEX))

    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(self.get_nyt_page(url))
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@ -2,14 +2,9 @@
 # vim:fileencoding=utf-8
 # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>

-from __future__ import unicode_literals
-
 from calibre.web.feeds.news import BasicNewsRecipe

-
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
+use_wayback_machine = False


 def absolutize(url):
@ -28,32 +23,38 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
    no_javascript = True
    ignore_duplicate_articles = {'title', 'url'}
    encoding = 'utf-8'
-    articles_are_obfuscated = True
-    delay = 1

-    def get_obfuscated_article(self, url):
-        if not hasattr(self, 'nyt_parser'):
+    articles_are_obfuscated = use_wayback_machine
+
+    if use_wayback_machine:
+        def get_obfuscated_article(self, url):
+            from calibre.ptempfile import PersistentTemporaryFile
+            with PersistentTemporaryFile() as tf:
+                tf.write(self.get_nyt_page(url))
+            return tf.name
+
+    @property
+    def nyt_parser(self):
+        ans = getattr(self, '_nyt_parser', None)
+        if ans is None:
            from calibre.live import load_module
-            m = load_module('calibre.web.site_parsers.nytimes')
-            self.nyt_parser = m
-        raw = self.nyt_parser.download_url(url, self.cloned_browser)
-        from calibre.ptempfile import PersistentTemporaryFile
-        with PersistentTemporaryFile(suffix='.html') as pt:
-            pt.write(raw)
-        return pt.name
+            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
+        return ans
+
+    def get_nyt_page(self, url):
+        if use_wayback_machine:
+            from calibre import browser
+            return self.nyt_parser.download_url(url, browser())
+        return self.browser.open_novisit(url).read()

    def preprocess_raw_html(self, raw_html, url):
-        if not hasattr(self, 'nyt_parser'):
-            from calibre.live import load_module
-            m = load_module('calibre.web.site_parsers.nytimes')
-            self.nyt_parser = m
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

    def parse_index(self):
        # return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
        soup = self.index_to_soup(
-            'https://www.nytimes.com/pages/books/review/index.html')
+            self.get_nyt_page('https://www.nytimes.com/pages/books/review/index.html'))

        # Find TOC
        toc = soup.find('section', id='collection-book-review').find('section').find('ol')
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@ -10,7 +10,7 @@ from pprint import pprint
 from calibre.utils.iso8601 import parse_iso8601


-module_version = 2  # needed for live updates
+module_version = 3  # needed for live updates
 pprint


@ -187,18 +187,15 @@ def extract_html(soup):


 def download_url(url, br):
-    # NYT has implemented captcha protection for its article pages, so get
-    # them from the wayback machine instead. However, wayback machine is
-    # flaky so god knows how well it will work under load
-    from calibre.ebooks.metadata.sources.update import search_engines_module
-    m = search_engines_module()
-    cu = m.wayback_machine_cached_url(url, br)
-    raw = m.get_data_for_cached_url(cu)
-    if raw is None:
-        raw = br.open_novisit(cu).read()
-    if not isinstance(raw, bytes):
-        raw = raw.encode('utf-8')
-    return raw
+    # Get the URL from the Wayback machine
+    from mechanize import Request
+    rq = Request(
+        'http://localhost:8090/nytimes',
+        data=json.dumps({"url": url}),
+        headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}
+    )
+    br.set_handle_gzip(True)
+    return br.open_novisit(rq, timeout=3 * 60).read()


 if __name__ == '__main__':