Use JSON rather than HTML data from the NYT today's paper page as it is more complete

Kovid Goyal 2018-11-05 12:48:02 +05:30
parent 6ce808c499
commit e2a0807079
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 134 additions and 91 deletions
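
The change below switches both recipes from scraping the section markup of the today's paper page to decoding the JSON blob the page embeds as window.__preloadedData, whose initialState dictionary lists every article together with the section groupings of the print edition. A minimal standalone sketch of that extraction step, assuming the page source is already in hand as a string and that the embedded payload is plain JSON (as it was at the time of this commit); the helper name extract_preloaded_state is illustrative and not part of the recipes:

import json
import re


def extract_preloaded_state(html):
    # Locate the assignment to window.__preloadedData inside the page source;
    # everything from the first '{' up to the closing </script> tag is a JSON
    # object whose 'initialState' key holds the article and section data.
    m = re.search(r'window\.__preloadedData\s*=\s*', html)
    if m is None:
        raise ValueError('window.__preloadedData not found in page')
    tail = html[m.end():]
    payload = tail[tail.find('{'):tail.find('</script>')]
    return json.loads(payload.strip().rstrip(';'))['initialState']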

View File

@@ -6,6 +6,8 @@ from __future__ import absolute_import, division, print_function, unicode_literals

 import datetime
 import re
+import json
+from pprint import pprint  # noqa

 from calibre import strftime
 from calibre.utils.date import strptime
@@ -137,55 +139,85 @@ class NewYorkTimes(BasicNewsRecipe):
         soup.find('body').contents.insert(0, h1)
         return soup

-    def read_nyt_metadata(self):
+    def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
         # INDEX = 'file:///t/raw.html'
-        soup = self.index_to_soup(INDEX)
+        try:
+            soup = self.index_to_soup(INDEX)
+        except Exception as err:
+            if getattr(err, 'code', None) == 404:
+                try:
+                    soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
+                except Exception as err:
+                    if getattr(err, 'code', None) == 404:
+                        dt = datetime.datetime.today() - datetime.timedelta(days=1)
+                        soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
+                    else:
+                        raise
+            else:
+                raise
+        return soup
+
+    def read_nyt_metadata(self):
+        soup = self.read_todays_paper()
         pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content']
         date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
         self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
         self.timefmt = strftime(' [%d %b, %Y]', date)
         return soup

-    def parse_todays_sections(self, container):
-        for h2 in container.findAll('h2', **classes('headline')):
-            title = self.tag_to_string(h2)
-            a = h2.find('a', href=True)
-            url = a['href']
-            if '?' in url:
-                url = url.split('?')[0]
-            p = h2.findParent(**classes('story-body'))
-            desc = ''
-            if p is not None:
-                s = p.find(**classes('summary'))
-                if s is not None:
-                    desc = self.tag_to_string(s)
-            date = ''
-            d = date_from_url(url)
-            if d is not None:
-                date = format_date(d)
-            self.log('\t', title + date, ': ', url)
-            self.log('\t\t', desc)
-            yield {'title': title, 'url': url, 'description': desc, 'date': date}
-
     def parse_todays_page(self):
         soup = self.read_nyt_metadata()
-        section = soup.find(id=lambda x: x and x.startswith('collection-todays-new-york-times'))
+        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+        script = type(u'')(script)
+        data = json.loads(script[script.find('{'):].strip().rstrip(';'))['initialState']
+        containers, sections = [], {}
+        article_map = {}
+        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
+        for key in data:
+            if 'Article' in key:
+                adata = data[key]
+                if adata.get('__typename') == 'Article':
+                    url = adata.get('url')
+                    summary = adata.get('summary')
+                    headline = adata.get('headline')
+                    if url and headline:
+                        title = data[headline['id']]['default']
+                        article_map[adata['id']] = {
+                            'title': title, 'url': url, 'description': summary or ''}
+            elif 'Legacy' in key:
+                sdata = data[key]
+                tname = sdata.get('__typename')
+                if tname == 'LegacyCollectionContainer':
+                    containers.append(sdata['label'] or sdata['name'])
+                elif tname == 'LegacyCollectionRelation':
+                    m = pat.search(key)
+                    grouping, container, relation = map(int, m.groups())
+                    asset = sdata['asset']
+                    if asset['typename'] == 'Article' and grouping == 0:
+                        if container not in sections:
+                            sections[container] = []
+                        sections[container].append(asset['id'].split(':', 1)[1])
+
         feeds = []
-        for i, h1 in enumerate(section.findAll('h1')):
-            if i == 0:
-                continue
-            section_title = self.tag_to_string(h1)
-            self.log('Found section:', section_title)
-            if i == 1:
-                container = h1.parent
-                articles = list(self.parse_todays_sections(container))
-                articles += list(self.parse_todays_sections(container.findNextSibling('div')))
-            else:
-                articles = list(self.parse_todays_sections(h1.findNextSibling('ol')))
-            if articles:
-                feeds.append((section_title, articles))
+        for i, section_title in enumerate(containers):
+            if i in sections:
+                articles = sections[i]
+                if articles:
+                    self.log('\n' + section_title)
+                    feeds.append((section_title, []))
+                    for artid in articles:
+                        if artid in article_map:
+                            art = article_map[artid]
+                            feeds[-1][1].append(art)
+                            self.log('\t' + art['title'])
+
+        def skey(x):
+            name = x[0].strip()
+            if name == 'The Front Page':
+                return 0, ''
+            return 1, name.lower()
+        feeds.sort(key=skey)
         return feeds

     def parse_highlights(self, container):
@@ -267,8 +299,10 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.get_browser()

     def open_novisit(self, *args, **kwargs):
-        from calibre import browser
-        br = browser()
+        from calibre import browser, random_user_agent
+        if not hasattr(self, 'rua_stored'):
+            self.rua_stored = random_user_agent(allow_ie=False)
+        br = browser(user_agent=self.rua_stored)
         response = br.open_novisit(*args, **kwargs)
         # headers = response.info()
         # if headers.get('X-PageType') == 'vi-story':
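
Both recipes also get the same open_novisit tweak: rather than building a default browser on every call, a random non-IE user agent is picked once, stored on the recipe, and reused for every request, so the whole fetch presents a consistent identity. A rough standalone equivalent using the same calibre helpers the diff relies on (browser() and random_user_agent()); the class and attribute names are illustrative only:

from calibre import browser, random_user_agent


class ConsistentUAFetcher(object):
    # Choose one random (non-IE) user agent up front and keep it for the life
    # of the object, mirroring the rua_stored caching added to open_novisit.
    def __init__(self):
        self.user_agent = random_user_agent(allow_ie=False)

    def open(self, url):
        # Build a browser with the stored user agent and fetch the URL without
        # recording it in the browsing history.
        br = browser(user_agent=self.user_agent)
        return br.open_novisit(url)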

View File

@@ -6,6 +6,8 @@ from __future__ import absolute_import, division, print_function, unicode_literals

 import datetime
 import re
+import json
+from pprint import pprint  # noqa

 from calibre import strftime
 from calibre.utils.date import strptime
@@ -137,7 +139,7 @@ class NewYorkTimes(BasicNewsRecipe):
         soup.find('body').contents.insert(0, h1)
         return soup

-    def read_nyt_metadata(self):
+    def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
         # INDEX = 'file:///t/raw.html'
         try:
@@ -154,63 +156,68 @@ class NewYorkTimes(BasicNewsRecipe):
                         raise
             else:
                 raise
+        return soup
+
+    def read_nyt_metadata(self):
+        soup = self.read_todays_paper()
         pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content']
         date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
         self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
         self.timefmt = strftime(' [%d %b, %Y]', date)
         return soup

-    def parse_todays_sections(self, container):
-        for li in container.findAll('li'):
-            desc = ''
-            h2 = li.find('h2')
-            if h2 is None:
-                a = li.find('a', href=True)
-                title = self.tag_to_string(a)
-            else:
-                title = self.tag_to_string(h2)
-                a = h2.find('a', href=True)
-                if a is None:
-                    a = h2.findParent('a', href=True)
-                    div = a.find('div', recursive=False)
-                    if div is not None:
-                        desc = self.tag_to_string(div)
-            if a is None:
-                continue
-            url = a['href']
-            if '?' in url:
-                url = url.split('?')[0]
-            if url.startswith('/'):
-                url = 'https://www.nytimes.com' + url
-            if not desc:
-                p = li.find('p')
-                if p is not None:
-                    desc = self.tag_to_string(p)
-            date = ''
-            d = date_from_url(url)
-            if d is not None:
-                date = format_date(d)
-            self.log('\t', title + date, ': ', url)
-            self.log('\t\t', desc)
-            yield {'title': title, 'url': url, 'description': desc, 'date': date}
-
     def parse_todays_page(self):
         soup = self.read_nyt_metadata()
-        section = soup.find(id='collection-todays-new-york-times').find('div', recursive=False)
+        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+        script = type(u'')(script)
+        data = json.loads(script[script.find('{'):].strip().rstrip(';'))['initialState']
+        containers, sections = [], {}
+        article_map = {}
+        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
+        for key in data:
+            if 'Article' in key:
+                adata = data[key]
+                if adata.get('__typename') == 'Article':
+                    url = adata.get('url')
+                    summary = adata.get('summary')
+                    headline = adata.get('headline')
+                    if url and headline:
+                        title = data[headline['id']]['default']
+                        article_map[adata['id']] = {
+                            'title': title, 'url': url, 'description': summary or ''}
+            elif 'Legacy' in key:
+                sdata = data[key]
+                tname = sdata.get('__typename')
+                if tname == 'LegacyCollectionContainer':
+                    containers.append(sdata['label'] or sdata['name'])
+                elif tname == 'LegacyCollectionRelation':
+                    m = pat.search(key)
+                    grouping, container, relation = map(int, m.groups())
+                    asset = sdata['asset']
+                    if asset['typename'] == 'Article' and grouping == 0:
+                        if container not in sections:
+                            sections[container] = []
+                        sections[container].append(asset['id'].split(':', 1)[1])
+
         feeds = []
-        for i, section in enumerate(section.findAll('section')):
-            h2 = section.find('h2')
-            section_title = self.tag_to_string(h2)
-            self.log('\nFound section:', section_title)
-            if i == 0:
-                articles = []
-                for div in section.findAll('div', recursive=False):
-                    articles += list(self.parse_todays_sections(div.find('ol')))
-            else:
-                articles = list(self.parse_todays_sections(section.find('ol')))
-            if articles:
-                feeds.append((section_title, articles))
+        for i, section_title in enumerate(containers):
+            if i in sections:
+                articles = sections[i]
+                if articles:
+                    self.log('\n' + section_title)
+                    feeds.append((section_title, []))
+                    for artid in articles:
+                        if artid in article_map:
+                            art = article_map[artid]
+                            feeds[-1][1].append(art)
+                            self.log('\t' + art['title'])
+
+        def skey(x):
+            name = x[0].strip()
+            if name == 'The Front Page':
+                return 0, ''
+            return 1, name.lower()
+        feeds.sort(key=skey)
         return feeds

     def parse_highlights(self, container):
@@ -292,8 +299,10 @@ class NewYorkTimes(BasicNewsRecipe):
         return self.get_browser()

     def open_novisit(self, *args, **kwargs):
-        from calibre import browser
-        br = browser()
+        from calibre import browser, random_user_agent
+        if not hasattr(self, 'rua_stored'):
+            self.rua_stored = random_user_agent(allow_ie=False)
+        br = browser(user_agent=self.rua_stored)
         response = br.open_novisit(*args, **kwargs)
         # headers = response.info()
         # if headers.get('X-PageType') == 'vi-story':