#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.date import strptime

is_web_edition = True

# The sections to download when downloading the web edition; comment out
# the sections you are not interested in
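# Each entry maps a feed title to the URL slug that is appended to
# https://www.nytimes.com/section/ when the section is fetched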
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    ('Arts', 'arts'),
    ('Books', 'books'),
    ('Movies', 'movies'),
    ('Music', 'arts/music'),
    ('Television', 'arts/television'),
    ('Style', 'style'),
    ('Dining & Wine', 'dining'),
    ('Fashion & Style', 'fashion'),
    ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    ('Education', 'education'),
    ('Multimedia', 'multimedia'),
    ('Obituaries', 'obituaries'),
    ('Sunday Magazine', 'magazine')
]


def classes(classes):
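    # Build a BeautifulSoup attribute matcher that accepts any tag sharing
    # at least one CSS class with the given space separated list, e.g.
    # soup.find('section', **classes('highlights'))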
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class NewYorkTimes(BasicNewsRecipe):

    title = 'The New York Times'
    if is_web_edition:
        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
    else:
        description = 'Today\'s New York Times'
    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5

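    # Keep only the article header and body; strip ads, video embeds,
    # share tools and newsletter promos from the downloaded pages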
    keep_only_tags = [
        dict(id='story-header'),
        classes('story-body-supplemental story-interrupter'),
    ]
    remove_tags = [
        dict(attrs={'aria-label': 'tools'.split()}),
        dict(attrs={'data-videoid': True}),
        dict(name='button meta link'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo'.split()),
        classes('story-print-citation'),
    ]

    def read_nyt_metadata(self):
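        # Read the publication date from the Today's Paper page and use it
        # to set the cover (a scan of the day's front page) and the date
        # shown after the book title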
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
        soup = self.index_to_soup(INDEX)
        pdate = soup.find('meta', attrs={'name': 'pdate', 'content': True})['content']
        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
        self.timefmt = date.strftime(' [%d %b, %Y]')
        return soup

    def parse_todays_sections(self, container):
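        # Yield one article per headline in a Today's Paper section,
        # stripping tracking query strings from URLs and picking up the
        # summary text when the story block provides one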
        for h2 in container.findAll('h2', **classes('headline')):
            title = self.tag_to_string(h2)
            a = h2.find('a', href=True)
            url = a['href']
            if '?' in url:
                url = url.split('?')[0]
            p = h2.findParent(**classes('story-body'))
            desc = ''
            if p is not None:
                s = p.find(**classes('summary'))
                if s is not None:
                    desc = self.tag_to_string(s)
            self.log('\t', title, ': ', url)
            self.log('\t\t', desc)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_todays_page(self):
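        # Build the feed list for the print edition. The first <h1> is the
        # page title, so skip it; the first real section's articles are
        # split across the heading's parent and its next sibling <div>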
        soup = self.read_nyt_metadata()
        section = soup.find(id=lambda x: x and x.startswith('collection-todays-new-york-times'))
        feeds = []
        for i, h1 in enumerate(section.findAll('h1')):
            if i == 0:
                continue
            section_title = self.tag_to_string(h1)
            self.log('Found section:', section_title)
            if i == 1:
                container = h1.parent
                articles = list(self.parse_todays_sections(container))
                articles += list(self.parse_todays_sections(container.findNextSibling('div')))
            else:
                articles = list(self.parse_todays_sections(h1.findNextSibling('ol')))
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def parse_highlights(self, container):
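        # Yield articles from a block of story cards on a web section page,
        # skipping cards that lack a linked headline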
        for article in container.findAll('article', **classes('story')):
            h2 = article.find('h2')
            if h2 is not None:
                title = self.tag_to_string(h2)
                a = h2.find('a', href=True)
                if a is not None:
                    url = a['href']
                    desc = ''
                    p = article.find(**classes('summary'))
                    if p is not None:
                        desc = self.tag_to_string(p)
                    yield {'title': title, 'url': url, 'description': desc}

    def parse_web_section(self, soup, slug):
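        # Parse a single web section page: the main highlights block first,
        # then an optional extra sub-collection whose own title is prefixed
        # to its articles' titles (slug is accepted but unused here)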

        def log(article):
            self.log('\t', article['title'], ':', article['url'])
            if article.get('description'):
                self.log('\t\t', article['description'])

        container = soup.find(itemtype='http://schema.org/CollectionPage')
        highlights = container.find('section', **classes('highlights'))
        for article in self.parse_highlights(highlights):
            log(article)
            yield article
        extra = container.find('section', attrs={'data-collection-type': True})
        if extra is not None:
            title = self.tag_to_string(extra.find('h2'))
            for article in self.parse_highlights(extra):
                article['title'] = '{}: {}'.format(title, article['title'])
                log(article)
                yield article

    def parse_web_sections(self):
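        # Download every configured section, skipping any that fail to
        # download; self.test (set when calibre runs in --test mode) caps
        # the number of feeds fetched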
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
                soup = self.index_to_soup(url)
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
            self.log('Found section:', section_title)
            articles = list(self.parse_web_section(soup, slug))
            if articles:
                feeds.append((section_title, articles))
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
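        # Entry point called by calibre: dispatch on the is_web_edition
        # toggle defined at the top of the recipe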
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()