#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import json
import re
from pprint import pprint  # noqa

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe
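
# When True, article pages are downloaded via the Wayback Machine (see
# get_obfuscated_article() below) instead of directly from nytimes.com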
use_wayback_machine = False

# The sections to download when downloading the web edition; comment out
# the sections you are not interested in
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    ('Arts', 'arts'),
    # ('Books', 'books'),
    ('Movies', 'movies'),
    ('Music', 'arts/music'),
    ('Television', 'arts/television'),
    ('Style', 'style'),
    ('Dining & Wine', 'dining'),
    ('Fashion & Style', 'fashion'),
    ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    ('Education', 'education'),
    ('Multimedia', 'multimedia'),
    ('Obituaries', 'obituaries'),
    ('Sunday Magazine', 'magazine'),
]
url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NewYorkTimes(BasicNewsRecipe):

    title = 'The New York Times'
    description = (
        'New York Times: Today\'s Paper. '
        'Use the advanced menu to fetch the Web Edition instead.'
    )
    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en_US'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    is_web_edition = False
    oldest_web_edition_article = 7  # days

    extra_css = """
        .byl, .time { font-size:small; color:#202020; }
        .cap { font-size:small; text-align:center; }
        .cred { font-style:italic; font-size:small; }
        em, blockquote { color: #202020; }
        .sc { font-variant: small-caps; }
        .lbl { font-size:small; color:#404040; }
        img { display:block; margin:0 auto; }
    """
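
    # The NYT parsing code is loaded via calibre's live module system, so it
    # can be updated without waiting for a new calibre release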
    @property
    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
        return ans
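
    # Fetch a page, going through the Wayback Machine when use_wayback_machine
    # is set (the Today's Paper index always skips it)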
    def get_nyt_page(self, url, skip_wayback=False):
        if use_wayback_machine and not skip_wayback:
            from calibre import browser
            return self.nyt_parser.download_url(url, browser())
        return self.index_to_soup(url, raw=True)
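
    # Hand each downloaded page to the live nytimes module, which extracts the
    # article content from the raw HTML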
    def preprocess_raw_html(self, raw_html, url):
        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)

    articles_are_obfuscated = use_wayback_machine

    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as tf:
                tf.write(self.get_nyt_page(url))
            return tf.name
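
    # Options the user can set for this recipe in calibre's advanced
    # configuration menu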
    recipe_specific_options = {
        'web': {
            'short': 'Type in yes if you want the Web Edition',
            'default': 'Todays Paper'
        },
        'days': {
            'short': 'Oldest article to download from this news source. In days',
            'long': 'For example, 1 gives you articles from the past 24 hours\n(works only for the Web Edition)',
            'default': str(oldest_web_edition_article)
        },
        'date': {
            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of the NYT newspaper',
            'long': 'For example, 2024/07/16'
        },
        'res': {
            'short': (
                'For hi-res images, select a resolution from the following\noptions: '
                'popup, jumbo, mobileMasterAt3x, superJumbo'
            ),
            'long': (
                'Hi-res images are useful for non e-ink devices. For a lower file size\nthan '
                'the default, use mediumThreeByTwo440, mediumThreeByTwo225 or articleInline.'
            ),
        },
        'comp': {
            'short': 'Compress News Images?',
            'long': 'Enter yes to compress images, reducing the output file size',
            'default': 'no'
        }
    }
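
    # Apply any recipe-specific options supplied by the user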
    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        c = self.recipe_specific_options.get('comp')
        d = self.recipe_specific_options.get('days')
        w = self.recipe_specific_options.get('web')
        if w and isinstance(w, str):
            if w == 'yes':
                self.is_web_edition = True
        if d and isinstance(d, str):
            self.oldest_web_edition_article = float(d)
        if c and isinstance(c, str):
            if c.lower() == 'yes':
                self.compress_news_images = True

    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
        return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
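
    # Read the publication date from the page metadata, and use it to set the
    # cover (a scan of the day's front page) and the title timestamp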
    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
        pdate = soup.find('meta', attrs={'name': 'pdate', 'content': True})['content']
        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
        self.timefmt = strftime(' [%d %b, %Y]', date)
        return soup
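
    # The Today's Paper page embeds all section and article metadata as JSON in
    # a window.__preloadedData script tag; parse it to build the feed list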
    def parse_todays_page(self):
        soup = self.read_nyt_metadata()
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
        script = type(u'')(script)
        json_data = script[script.find('{'):script.rfind(';')].strip()
        data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
        containers, sections = {}, {}
        article_map = {}
        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
        for key in data:
            if 'Article' in key:
                adata = data[key]
                if adata.get('__typename') == 'Article':
                    url = adata.get('url')
                    summary = adata.get('summary')
                    headline = adata.get('headline')
                    if url and headline:
                        title = data[headline['id']]['default']
                        article_map[adata['id']] = {
                            'title': title, 'url': url, 'description': summary or ''}
            elif 'Legacy' in key:
                sdata = data[key]
                tname = sdata.get('__typename')
                if tname == 'LegacyCollectionContainer':
                    m = gc_pat.search(key)
                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
                elif tname == 'LegacyCollectionRelation':
                    m = pat.search(key)
                    grouping, container, relation = map(int, m.groups())
                    asset = sdata['asset']
                    if asset and asset['__typename'] == 'Article' and grouping == 0:
                        if container not in sections:
                            sections[container] = []
                        sections[container].append(asset['id'].split(':', 1)[1])

        feeds = []
        for container_num in sorted(containers):
            section_title = containers[container_num]
            if container_num in sections:
                articles = sections[container_num]
                if articles:
                    feeds.append((section_title, []))
                    for artid in articles:
                        if artid in article_map:
                            art = article_map[artid]
                            feeds[-1][1].append(art)

        def skey(x):
            name = x[0].strip()
            if name == 'The Front Page':
                return 0, ''
            return 1, name.lower()
        feeds.sort(key=skey)
        for section, articles in feeds:
            self.log('\n' + section)
            for article in articles:
                self.log(article['title'] + ' - ' + article['url'])
        # raise SystemExit(1)
        return feeds
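
    # Web edition: a section page lists stories as <li> entries, either as full
    # <article> cards or as bare links; extract title, URL, description and date
    # from both forms, skipping stories older than oldest_web_edition_article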
    def parse_article_group(self, container):
        for li in container.findAll('li'):
            article = li.find('article')
            if article is None:
                a = li.find('a', href=True)
                if a is not None:
                    title = self.tag_to_string(li.find(['h3', 'h2'])).strip()
                    paras = li.findAll('p')
                    if not title:
                        title = self.tag_to_string(paras[0]).strip()
                    if not title:
                        raise ValueError('No title found in article')
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.nytimes.com' + url
                    desc = ''
                    if len(paras) > 0:
                        desc = self.tag_to_string(paras[-1])
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > self.oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
                continue
            h2 = article.find(['h2', 'h3'])
            if h2 is not None:
                title = self.tag_to_string(h2)
                a = h2.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.nytimes.com' + url
                    desc = ''
                    p = h2.findNextSibling('p')
                    if p is not None:
                        desc = self.tag_to_string(p)
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > self.oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
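
    # Locate the article list on a section page; the container id is derived
    # from the last component of the section slug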
    def parse_web_section(self, soup, slug):

        def log(article):
            self.log('\t', article['title'] + article['date'], ':', article['url'])
            if article.get('description'):
                self.log('\t\t', article['description'])

        cid = slug.split('/')[-1]
        if cid == 'dining':
            cid = 'food'
        try:
            container = soup.find(id='collection-{}'.format(cid)).find('section')
        except AttributeError:
            container = None
        if container is None:
            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
        for ol in container.findAll('ol'):
            for article in self.parse_article_group(ol):
                log(article)
                yield article
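
    # Build the feed list for the web edition by downloading every section
    # configured in web_sections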
    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
                soup = self.index_to_soup(self.get_nyt_page(url))
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
            self.log('Found section:', section_title)
            articles = list(self.parse_web_section(soup, slug))
            if articles:
                feeds.append((section_title, articles))
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
        if self.is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()
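
    # Identify as Googlebot, with a matching Referer and a Google crawler IP in
    # X-Forwarded-For, so pages are served as they would be to a search engine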
    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [
            ('Referer', 'https://www.google.com/'),
            ('X-Forwarded-For', '66.249.66.1'),
        ]
        return br
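
    # Rewrite image URLs to use the resolution chosen via the 'res' option, and
    # flatten the markup inside captions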
    def preprocess_html(self, soup):
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '-' + w
            for img in soup.findAll('img', attrs={'src': True}):
                if '-article' in img['src']:
                    ext = img['src'].split('?')[0].split('.')[-1]
                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
        for c in soup.findAll('div', attrs={'class': 'cap'}):
            for p in c.findAll(['p', 'div']):
                p.name = 'span'
        return soup
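
    # Skip video pages and The Athletic links; returning None makes calibre
    # drop the article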
    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        if not re.search(r'/video/|/athletic/', url):
            return url