Update NYTimes todays paper recipe for website changes

Kovid Goyal 2025-04-10 15:22:08 +05:30
parent ef7e3df6c2
commit 1501da22db
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -8,10 +8,13 @@ import datetime
 import json
 import re
 
+import mechanize
+
 from calibre import strftime
 from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
+from polyglot.urllib import urlencode
 
 use_wayback_machine = False
@@ -169,82 +172,70 @@ class NewYorkTimes(BasicNewsRecipe):
             self.compress_news_images = True
 
     def read_todays_paper(self):
-        INDEX = 'https://www.nytimes.com/section/todayspaper'
-        # INDEX = 'file:///t/raw.html'
-        d = self.recipe_specific_options.get('date')
-        if d and isinstance(d, str):
-            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
-        return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
+        pdate = self.recipe_specific_options.get('date')
+        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
+        if pdate and isinstance(pdate, str):
+            return pdate, self.index_to_soup(templ.format(pdate))
+        # Can't figure out how to get the date so just try today's and yesterday's dates
+        date = datetime.date.today()
+        pdate = date.strftime('%Y/%m/%d')
+        try:
+            soup = self.index_to_soup(templ.format(pdate))
+        except Exception as e:
+            if getattr(e, 'code', None) == 404:
+                date -= datetime.timedelta(days=1)
+                pdate = date.strftime('%Y/%m/%d')
+                soup = self.index_to_soup(templ.format(pdate))
+            else:
+                raise
+        self.log("Using today's paper from:", pdate)
+        return pdate, soup
 
     def read_nyt_metadata(self):
-        soup = self.read_todays_paper()
-        pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content']
-        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
+        pdate, soup = self.read_todays_paper()
+        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
         self.timefmt = strftime(' [%d %b, %Y]', date)
+        self.nytimes_publication_date = pdate
+        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+        script = type(u'')(script)
+        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
+        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
         return soup
 
     def parse_todays_page(self):
-        soup = self.read_nyt_metadata()
-        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
-        script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
-        data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
-        article_map = {}
-        sections = []
-        for key in data:
-            if 'Article' in key:
-                adata = data[key]
-                if adata.get('__typename') == 'Article':
-                    url = adata.get('url')
-                    summary = adata.get('summary')
-                    headline = adata.get('headline')
-                    if url and headline:
-                        title = headline['default']
-                        article_map[adata['id']] = {
-                            'title': title, 'url': url, 'description': summary or ''}
-            elif 'LegacyCollection:' in key:
-                lc = data[key]
-                if not lc.get('active'):
-                    continue
-                for sdata in lc['groupings']:
-                    tname = sdata.get('__typename')
-                    if tname != 'LegacyCollectionGrouping':
-                        continue
-                    for cont in sdata['containers']:
-                        if cont.get('__typename') == 'LegacyCollectionContainer':
-                            section_name = cont['label@stripHtml']
-                            articles = []
-                            for rel in cont['relations']:
-                                if rel.get('__typename') == 'LegacyCollectionRelation':
-                                    asset = rel['asset']['__ref']
-                                    if asset.startswith('Article:'):
-                                        articles.append(asset.partition(':')[2])
-                            if articles:
-                                sections.append((section_name, articles))
-        feeds = []
-        for section_title, article_ids in sections:
-            articles = []
-            for aid in article_ids:
-                if aid in article_map:
-                    art = article_map[aid]
-                    articles.append(art)
-            if articles:
-                feeds.append((section_title, articles))
-        def skey(x):
-            name = x[0].strip()
-            if name == 'The Front Page':
-                return 0, ''
-            return 1, name.lower()
-        feeds.sort(key=skey)
-        for section, articles in feeds:
-            self.log('\n' + section)
-            for article in articles:
-                self.log(article['title'] + ' - ' + article['url'])
-        # raise SystemExit(1)
-        return feeds
+        self.read_nyt_metadata()
+        query = {
+            'operationName': 'CollectionsQuery',
+            'variables': json.dumps({
+                'id': '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date),
+                'first': 10,
+                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
+                'isFetchMore': False,
+                'isTranslatable': False,
+                'isEspanol': False,
+                'highlightsListUri': 'nyt://per/personalized-list/__null__',
+                'highlightsListFirst': 0,
+                'hasHighlightsList': False,
+            }, separators=',:'),
+            'extensions': json.dumps({
+                'persistedQuery': {
+                    'version': 1,
+                    # This is an Apollo persisted query hash which you can get
+                    # from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
+                    'sha256Hash': '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7',
+                },
+            }, separators=',:'),
+        }
+        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
+        br = self.browser
+        # br.set_debug_http(True)
+        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
+        headers['Accept'] = 'application/json'
+        req = mechanize.Request(url, headers=headers)
+        raw = br.open(req).read()
+        # open('/t/raw.json', 'wb').write(raw)
+        return parse_todays_page(json.loads(raw), self.log)
 
     def parse_article_group(self, container):
         for li in container.findAll('li'):
@@ -372,3 +363,31 @@ class NewYorkTimes(BasicNewsRecipe):
             if not re.search(r'/video/|/athletic/|/card/', url):
                 return url
             self.log('\tSkipping ', url)
+
+
+def parse_todays_page(data, log=print):
+    containers = data['data']['legacyCollection']['groupings'][0]['containers']
+    feeds = []
+    for cont in containers:
+        if cont['__typename'] != 'LegacyCollectionContainer':
+            continue
+        section_name = cont['label'].strip()
+        if not section_name:
+            continue
+        log(section_name)
+        articles = []
+        for rel in cont['relations']:
+            if rel.get('__typename') == 'LegacyCollectionRelation':
+                asset = rel['asset']
+                if asset['__typename'] == 'Article':
+                    title = asset['headline']['default']
+                    articles.append({'title': title, 'url': asset['url'], 'description': asset['summary']})
+                    log(' ', title + ':', asset['url'])
+        if articles:
+            feeds.append((section_name, articles))
+    return feeds
+
+
+if __name__ == '__main__':
+    import sys
+    parse_todays_page(json.loads(open(sys.argv[-1], 'rb').read()))
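
For reference, a minimal sketch of the response shape the new module-level parse_todays_page() walks. The payload below is invented and heavily trimmed: only the keys the function actually reads (data.legacyCollection.groupings[0].containers, __typename, label, relations, asset, headline.default, url, summary) are present, and it assumes parse_todays_page() from this commit is in scope.

# Hypothetical, trimmed-down CollectionsQuery response; all values invented.
sample = {
    'data': {
        'legacyCollection': {
            'groupings': [{
                'containers': [{
                    '__typename': 'LegacyCollectionContainer',
                    'label': 'The Front Page',
                    'relations': [{
                        '__typename': 'LegacyCollectionRelation',
                        'asset': {
                            '__typename': 'Article',
                            'headline': {'default': 'An example headline'},
                            'url': 'https://www.nytimes.com/2025/04/10/example.html',
                            'summary': 'An example summary.',
                        },
                    }],
                }],
            }],
        }
    }
}

# Yields [('The Front Page', [{'title': ..., 'url': ..., 'description': ...}])]
for section, articles in parse_todays_page(sample):
    print(section, '->', [a['title'] for a in articles])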
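The leftover debugging hooks suggest an offline test loop: uncomment the open('/t/raw.json', 'wb').write(raw) line once to capture a live GraphQL response, then iterate on the parser against the saved file through the new __main__ block, along these lines (the path to the recipe file is whatever you have it saved as):

python /path/to/this/recipe /t/raw.json

Since parse_todays_page() logs each section and article title as it parses, a changed response schema surfaces immediately as missing keys, without repeatedly hitting the live endpoint.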