calibre/recipes/nytimes_tech.recipe

#!/usr/bin/env python2
# encoding: utf-8

from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = 'zotzo'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )


class NYTimesTechnology(BasicNewsRecipe):
    title = 'New York Times Technology Beat'
    language = 'en'
    description = 'The latest in technology - Gadgetwise'
    publisher = 'The New York Times'
    category = 'Technology'
    oldest_article = 14
    max_articles_per_feed = 25
    remove_empty_feeds = True
    no_stylesheets = True
    language = 'en'
    feeds = [
             (u'Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
    ]
    keep_only_tags = [
        dict(id='story'),
    ]
    remove_tags = [
        dict(attrs={'aria-label':'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden':'true'}),
        dict(attrs={'data-videoid':True}),
        dict(name='button meta link'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
    ]

    def preprocess_html(self, soup):
        # Add a space to the dateline
        t = soup.find(**classes('dateline'))
        if t is not None:
            t.insert(0, ' ')

        # Remove empty li tags
        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
            if not li.contents and not li.string:
                li.extract()

        # Ensure the headline is first
        h1 = soup.find('h1', itemprop='headline')
        if h1 is not None:
            h1.extract()
            soup.find('body').contents.insert(0, h1)
        return soup