calibre/recipes/nytimes_sub.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals
import datetime
import json
import re
from pprint import pprint  # noqa

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe

is_web_edition = False
oldest_web_edition_article = 7  # days
use_wayback_machine = True


# The sections to download when downloading the web edition, comment out
# the section you are not interested in
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    ('Arts', 'arts'),
    ('Books', 'books'),
    ('Movies', 'movies'),
    ('Music', 'arts/music'),
    ('Television', 'arts/television'),
    ('Style', 'style'),
    ('Dining & Wine', 'dining'),
    ('Fashion & Style', 'fashion'),
    ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    ('Education', 'education'),
    ('Multimedia', 'multimedia'),
    ('Obituaries', 'obituaries'),
    ('Sunday Magazine', 'magazine')
]
url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NewYorkTimes(BasicNewsRecipe):

    if is_web_edition:
        title = 'The New York Times (Web)'
        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
    else:
        title = 'The New York Times'
        description = 'Today\'s New York Times'
    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5
    conversion_options = {'flow_size': 0}
    delay = 0 if use_wayback_machine else 1

    @property
    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
        return ans

    def get_nyt_page(self, url, skip_wayback=False):
        if use_wayback_machine and not skip_wayback:
            from calibre import browser
            return self.nyt_parser.download_url(url, browser())
        return self.browser.open_novisit(url).read()

    def preprocess_raw_html(self, raw_html, url):
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

    articles_are_obfuscated = use_wayback_machine

    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as tf:
                tf.write(self.get_nyt_page(url))
            return tf.name

    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
        return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))

    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
        pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content']
        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
        self.timefmt = strftime(' [%d %b, %Y]', date)
        return soup

    def parse_todays_page(self):
        soup = self.read_nyt_metadata()
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
        script = type(u'')(script)
        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
        data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
        containers, sections = {}, {}
        article_map = {}
        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
        for key in data:
            if 'Article' in key:
                adata = data[key]
                if adata.get('__typename') == 'Article':
                    url = adata.get('url')
                    summary = adata.get('summary')
                    headline = adata.get('headline')
                    if url and headline:
                        title = data[headline['id']]['default']
                        article_map[adata['id']] = {
                            'title': title, 'url': url, 'description': summary or ''}
            elif 'Legacy' in key:
                sdata = data[key]
                tname = sdata.get('__typename')
                if tname == 'LegacyCollectionContainer':
                    m = gc_pat.search(key)
                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
                elif tname == 'LegacyCollectionRelation':
                    m = pat.search(key)
                    grouping, container, relation = map(int, m.groups())
                    asset = sdata['asset']
                    if asset and asset['typename'] == 'Article' and grouping == 0:
                        if container not in sections:
                            sections[container] = []
                        sections[container].append(asset['id'].split(':', 1)[1])

        feeds = []
        for container_num in sorted(containers):
            section_title = containers[container_num]
            if container_num in sections:
                articles = sections[container_num]
                if articles:
                    feeds.append((section_title, []))
                    for artid in articles:
                        if artid in article_map:
                            art = article_map[artid]
                            feeds[-1][1].append(art)

        def skey(x):
            name = x[0].strip()
            if name == 'The Front Page':
                return 0, ''
            return 1, name.lower()
        feeds.sort(key=skey)
        for section, articles in feeds:
            self.log('\n' + section)
            for article in articles:
                self.log(article['title'] + ' - ' + article['url'])
        # raise SystemExit(1)
        return feeds

    def parse_article_group(self, container):
        for li in container.findAll('li'):
            article = li.find('article')
            if article is None:
                a = li.find('a', href=True)
                if a is not None:
                    title = self.tag_to_string(li.find('h3'))
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.nytimes.com' + url
                    desc = ''
                    p = li.find('p')
                    if p is not None:
                        desc = self.tag_to_string(p)
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
                continue
            h2 = article.find('h2')
            if h2 is not None:
                title = self.tag_to_string(h2)
                a = h2.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.nytimes.com' + url
                    desc = ''
                    p = h2.findNextSibling('p')
                    if p is not None:
                        desc = self.tag_to_string(p)
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}

    def parse_web_section(self, soup, slug):

        def log(article):
            self.log('\t', article['title'] + article['date'], ':', article['url'])
            if article.get('description'):
                self.log('\t\t', article['description'])

        cid = slug.split('/')[-1]
        if cid == 'dining':
            cid = 'food'
        try:
            container = soup.find(id='collection-{}'.format(cid)).find('section')
        except AttributeError:
            container = None
        if container is None:
            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
        for ol in container.findAll('ol'):
            for article in self.parse_article_group(ol):
                log(article)
                yield article

    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
                soup = self.index_to_soup(self.get_nyt_page(url))
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
            self.log('Found section:', section_title)
            articles = list(self.parse_web_section(soup, slug))
            if articles:
                feeds.append((section_title, articles))
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()