Mirror of https://github.com/kovidgoyal/calibre.git

Commit ceafc1b05e (parent 692fa6d4fc): Update NYTimes
@@ -3,20 +3,20 @@
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import re
import json
import re
from pprint import pprint  # noqa

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

is_web_edition = True
oldest_web_edition_article = 7  # days


# The sections to download when downloading the web edition; comment out
# the sections you are not interested in
web_sections = [

@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
    remove_attributes = ['style']
    conversion_options = {'flow_size': 0}

    remove_tags = [
        dict(attrs={'aria-label': 'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden': 'true'}),
        dict(attrs={'data-videoid': True}),
        dict(name='button meta link time source'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
    ]

    def preprocess_html(self, soup):
        article = soup.find(id='story')
        if article is None:
            keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
        else:
            # The NYT is apparently A/B testing a new page layout
            has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
            if has_supplemental:
                keep_only_tags = [
                    dict(id='story-header'),
                    classes('story-body-supplemental story-interrupter'),
                ]
            else:
                keep_only_tags = [
                    dict(id='story'),
                ]
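        # Rebuild <body> so it contains only the kept tags; this applies
        # keep_only_tags by hand, since the spec is chosen at runtime above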
        body = new_tag(soup, 'body')
        for spec in keep_only_tags:
            for tag in soup.find('body').findAll(**spec):
                body.insert(len(body.contents), tag)
        soup.find('body').replaceWith(body)

        # Add a space to the dateline
        t = soup.find(**classes('dateline'))
        if t is not None:
            t.insert(0, ' ')

        # Remove empty li tags
        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
            if not li.contents and not li.string:
                li.extract()

        # Ensure the headline is first
        h1 = soup.find('h1', itemprop='headline')
        if h1 is not None:
            h1.extract()
            soup.find('body').contents.insert(0, h1)

        # Find lazy loaded images
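        # (each schema.org ImageObject wrapper carries the real image URL in
        # its itemid attribute; the placeholder <span> is promoted to an <img>)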
        for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
            if div.find('img') is None:
                span = div.find('span')
                if span is not None and self.tag_to_string(span).strip().lower() == 'image':
                    span.name = 'img'
                    span['src'] = div['itemid']

        # Remove live storyline menu
        for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
            span.parent.extract()

        return soup

    def preprocess_raw_html(self, raw_html, url):
        if '/live/' in url:
            self.abort_article("Can't be bothered decoding the JSON for NYT live articles")
        if not hasattr(self, 'nyt_parser'):
            from calibre.live import load_module
            m = load_module('calibre.web.site_parsers.nytimes')
            self.nyt_parser = m
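            # calibre.live.load_module loads calibre's shared nytimes site
            # parser, so the parsing code can be updated over the air without
            # waiting for a full calibre release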
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'

@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()

    # The NYT occasionally returns bogus articles for some reason; just in
    # case it is because of cookies, don't store cookies
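    # Returning self from get_browser() below makes the recipe act as its own
    # browser; open()/open_novisit() then build a fresh browser object for
    # each request, so cookies never persist between requests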
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        from calibre import browser, random_user_agent
        if not hasattr(self, 'rua_stored'):
            self.rua_stored = random_user_agent(allow_ie=False)
        br = browser(user_agent=self.rua_stored)
        response = br.open_novisit(*args, **kwargs)
        # headers = response.info()
        # if headers.get('X-PageType') == 'vi-story':
        #     import tempfile
        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
        #         f.write(response.read())
        #     import time
        #     time.sleep(1)
        #     br = browser()
        #     response = br.open_novisit(*args, **kwargs)
        return response

    open = open_novisit

@@ -3,20 +3,20 @@
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import re
import json
import re
from pprint import pprint  # noqa

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

is_web_edition = False
oldest_web_edition_article = 7  # days


# The sections to download when downloading the web edition; comment out
# the sections you are not interested in
web_sections = [

@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
    remove_attributes = ['style']
    conversion_options = {'flow_size': 0}

    remove_tags = [
        dict(attrs={'aria-label': 'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden': 'true'}),
        dict(attrs={'data-videoid': True}),
        dict(name='button meta link time source'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
    ]

    def preprocess_html(self, soup):
        article = soup.find(id='story')
        if article is None:
            keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
        else:
            # The NYT is apparently A/B testing a new page layout
            has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
            if has_supplemental:
                keep_only_tags = [
                    dict(id='story-header'),
                    classes('story-body-supplemental story-interrupter'),
                ]
            else:
                keep_only_tags = [
                    dict(id='story'),
                ]
        body = new_tag(soup, 'body')
        for spec in keep_only_tags:
            for tag in soup.find('body').findAll(**spec):
                body.insert(len(body.contents), tag)
        soup.find('body').replaceWith(body)

        # Add a space to the dateline
        t = soup.find(**classes('dateline'))
        if t is not None:
            t.insert(0, ' ')

        # Remove empty li tags
        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
            if not li.contents and not li.string:
                li.extract()

        # Ensure the headline is first
        h1 = soup.find('h1', itemprop='headline')
        if h1 is not None:
            h1.extract()
            soup.find('body').contents.insert(0, h1)

        # Find lazy loaded images
        for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
            if div.find('img') is None:
                span = div.find('span')
                if span is not None and self.tag_to_string(span).strip().lower() == 'image':
                    span.name = 'img'
                    span['src'] = div['itemid']

        # Remove live storyline menu
        for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
            span.parent.extract()

        return soup

    def preprocess_raw_html(self, raw_html, url):
        if '/live/' in url:
            self.abort_article("Can't be bothered decoding the JSON for NYT live articles")
        if not hasattr(self, 'nyt_parser'):
            from calibre.live import load_module
            m = load_module('calibre.web.site_parsers.nytimes')
            self.nyt_parser = m
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'

@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()

    # The NYT occasionally returns bogus articles for some reason; just in
    # case it is because of cookies, don't store cookies
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        from calibre import browser, random_user_agent
        if not hasattr(self, 'rua_stored'):
            self.rua_stored = random_user_agent(allow_ie=False)
        br = browser(user_agent=self.rua_stored)
        response = br.open_novisit(*args, **kwargs)
        # headers = response.info()
        # if headers.get('X-PageType') == 'vi-story':
        #     import tempfile
        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
        #         f.write(response.read())
        #     import time
        #     time.sleep(1)
        #     br = browser()
        #     response = br.open_novisit(*args, **kwargs)
        return response

    open = open_novisit