#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import json
import re
from pprint import pprint  # noqa

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe

is_web_edition = True
oldest_web_edition_article = 7  # days

# The sections to download when downloading the web edition; comment out
# the sections you are not interested in.
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    ('Arts', 'arts'),
    ('Books', 'books'),
    ('Movies', 'movies'),
    ('Music', 'arts/music'),
    ('Television', 'arts/television'),
    ('Style', 'style'),
    ('Dining & Wine', 'dining'),
    ('Fashion & Style', 'fashion'),
    ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    ('Education', 'education'),
    ('Multimedia', 'multimedia'),
    ('Obituaries', 'obituaries'),
    ('Sunday Magazine', 'magazine')
]

url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
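

# Create a tag via soup.new_tag() when available (BeautifulSoup 4), falling
# back to the BeautifulSoup 3 style Tag() constructor.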
def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NewYorkTimes(BasicNewsRecipe):

    if is_web_edition:
        title = 'The New York Times (Web)'
        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
    else:
        title = 'The New York Times'
        description = 'Today\'s New York Times'

    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5
    remove_attributes = ['style']
    conversion_options = {'flow_size': 0}

    remove_tags = [
        dict(attrs={'aria-label': 'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden': 'true'}),
        dict(attrs={'data-videoid': True}),
        dict(name='button meta link'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
    ]
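
    # Rebuild the page so that only the story content is kept, handling both
    # markup variants of the A/B tested page layout.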
    def preprocess_html(self, soup):
        article = soup.find(id='story')
        # The NYT is apparently A/B testing a new page layout
        has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
        if has_supplemental:
            keep_only_tags = [
                dict(id='story-header'),
                classes('story-body-supplemental story-interrupter'),
            ]
        else:
            keep_only_tags = [
                dict(id='story'),
            ]
        body = new_tag(soup, 'body')
        for spec in keep_only_tags:
            for tag in soup.find('body').findAll(**spec):
                body.insert(len(body.contents), tag)
        soup.find('body').replaceWith(body)

        # Add a space to the dateline
        t = soup.find(**classes('dateline'))
        if t is not None:
            t.insert(0, ' ')

        # Remove empty li tags
        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
            if not li.contents and not li.string:
                li.extract()

        # Ensure the headline is first
        h1 = soup.find('h1', itemprop='headline')
        if h1 is not None:
            h1.extract()
            soup.find('body').contents.insert(0, h1)

        # Find lazy loaded images
        for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
            if div.find('img') is None:
                span = div.find('span')
                if span is not None and self.tag_to_string(span).strip().lower() == 'image':
                    span.name = 'img'
                    span['src'] = div['itemid']
        return soup
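
    # Fetch the Today's Paper index; if the canonical URL 404s, fall back to
    # the dated issue URL for today and then for the previous day.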
    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
        try:
            soup = self.index_to_soup(INDEX)
        except Exception as err:
            if getattr(err, 'code', None) == 404:
                try:
                    soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
                except Exception as err:
                    if getattr(err, 'code', None) == 404:
                        dt = datetime.datetime.today() - datetime.timedelta(days=1)
                        soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
                    else:
                        raise
            else:
                raise
        return soup
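
    # Read the publication date from the page metadata and use it to set
    # the cover (a scan of the front page) and the issue date string.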
    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
        pdate = soup.find('meta', attrs={'name': 'pdate', 'content': True})['content']
        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
        self.timefmt = strftime(' [%d %b, %Y]', date)
        return soup
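
    # The Today's Paper page embeds its section and article data as a JSON
    # blob assigned to window.__preloadedData; parse that instead of the HTML.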
    def parse_todays_page(self):
        soup = self.read_nyt_metadata()
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
        script = type(u'')(script)
        data = json.loads(script[script.find('{'):script.rfind(';')].strip().rstrip(';'))['initialState']
        containers, sections = {}, {}
        article_map = {}
        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
        for key in data:
            if 'Article' in key:
                adata = data[key]
                if adata.get('__typename') == 'Article':
                    url = adata.get('url')
                    summary = adata.get('summary')
                    headline = adata.get('headline')
                    if url and headline:
                        title = data[headline['id']]['default']
                        article_map[adata['id']] = {
                            'title': title, 'url': url, 'description': summary or ''}
            elif 'Legacy' in key:
                sdata = data[key]
                tname = sdata.get('__typename')
                if tname == 'LegacyCollectionContainer':
                    m = gc_pat.search(key)
                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
                elif tname == 'LegacyCollectionRelation':
                    m = pat.search(key)
                    grouping, container, relation = map(int, m.groups())
                    asset = sdata['asset']
                    if asset and asset['__typename'] == 'Article' and grouping == 0:
                        if container not in sections:
                            sections[container] = []
                        sections[container].append(asset['id'].split(':', 1)[1])

        feeds = []
        for container_num in sorted(containers):
            section_title = containers[container_num]
            if container_num in sections:
                articles = sections[container_num]
                if articles:
                    feeds.append((section_title, []))
                    for artid in articles:
                        if artid in article_map:
                            art = article_map[artid]
                            feeds[-1][1].append(art)

        def skey(x):
            name = x[0].strip()
            if name == 'The Front Page':
                return 0, ''
            return 1, name.lower()
        feeds.sort(key=skey)

        for section, articles in feeds:
            self.log('\n' + section)
            for article in articles:
                self.log(article['title'] + ' - ' + article['url'])
        # raise SystemExit(1)
        return feeds
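
    # Yield one article per <li> in a section listing, skipping web edition
    # articles whose URL date is older than oldest_web_edition_article days.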
    def parse_article_group(self, container):
        for li in container.findAll('li'):
            article = li.find('article')
            h2 = article.find('h2')
            if h2 is not None:
                title = self.tag_to_string(h2)
                a = h2.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.nytimes.com' + url
                    desc = ''
                    p = h2.findNextSibling('p')
                    if p is not None:
                        desc = self.tag_to_string(p)
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
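
    # Find the container of articles on a section page; the dining section
    # is published under the id 'collection-food', hence the remapping.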
    def parse_web_section(self, soup, slug):

        def log(article):
            self.log('\t', article['title'] + article['date'], ':', article['url'])
            if article.get('description'):
                self.log('\t\t', article['description'])

        cid = slug.split('/')[-1]
        if cid == 'dining':
            cid = 'food'
        try:
            container = soup.find(id='collection-{}'.format(cid)).find('section')
        except AttributeError:
            container = None
        if container is None:
            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
        for ol in container.findAll('ol'):
            for article in self.parse_article_group(ol):
                log(article)
                yield article
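
    # Build one feed per entry in web_sections; sections that fail to
    # download are logged and skipped.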
    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
                soup = self.index_to_soup(url)
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
            self.log('Found section:', section_title)
            articles = list(self.parse_web_section(soup, slug))
            if articles:
                feeds.append((section_title, articles))
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()

    # The NYT occasionally returns bogus articles, for some unknown reason.
    # In case this is because of cookies, don't store cookies: a fresh
    # browser instance is created for every request.
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        from calibre import browser, random_user_agent
        if not hasattr(self, 'rua_stored'):
            self.rua_stored = random_user_agent(allow_ie=False)
        br = browser(user_agent=self.rua_stored)
        response = br.open_novisit(*args, **kwargs)
        # headers = response.info()
        # if headers.get('X-PageType') == 'vi-story':
        #     import tempfile
        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
        #         f.write(response.read())
        #     import time
        #     time.sleep(1)
        #     br = browser()
        #     response = br.open_novisit(*args, **kwargs)
        return response

    open = open_novisit
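
# To try out changes to this recipe, it can, like any calibre recipe, be fed
# directly to ebook-convert (a sketch of the usual recipe development
# workflow; the file and output names are illustrative):
#   ebook-convert nytimes.recipe .epub --test -vv --debug-pipeline debug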