calibre/recipes/financial_times.recipe

#!/usr/bin/env python2
# vim:fileencoding=utf-8
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ft.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from urllib import unquote


def classes(classes):
    '''
    Build a tag-matching spec keyed on the HTML ``class`` attribute.

    :param classes: Space-separated class names to look for.
    :return: ``{'attrs': {'class': predicate}}`` where ``predicate`` takes a
        tag's raw ``class`` attribute value and returns a truthy value when
        the tag carries at least one of the requested class names.
    '''
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Missing/empty class attribute: hand the falsy value straight back.
        if not value:
            return value
        # Non-empty intersection (truthy) means at least one name matched.
        return wanted.intersection(value.split())

    return {'attrs': {'class': has_wanted_class}}


class FinancialTimes_rss(BasicNewsRecipe):
    '''
    News recipe for the Financial Times, built from the regional home page
    RSS feeds on www.ft.com. Logs in with the supplied ft.com credentials
    when both a username and a password are available.
    '''

    # --- Publication metadata ---
    title = 'Financial Times'
    __author__ = 'Darko Miletic'
    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."  # noqa
    publisher = 'The Financial Times Ltd.'
    category = 'news, finances, politics, World'
    language = 'en'
    publication_type = 'newspaper'
    masthead_url = 'http://im.media.ft.com/m/img/masthead_main.jpg'

    # --- Download behaviour ---
    oldest_article = 2
    max_articles_per_feed = 250
    ignore_duplicate_articles = {'title'}
    remove_empty_feeds = True
    use_embedded_content = False
    needs_subscription = True
    no_stylesheets = True
    encoding = 'utf8'
    handle_gzip = True

    # --- Site endpoints used by get_browser() and cleanup() ---
    INDEX = 'https://www.ft.com'
    LOGIN = 'https://www.ft.com/login?location=/'
    LOGOUT = 'https://myaccount.ft.com/logout'

    # --- Sources ---
    feeds = [
        (u'UK', u'https://www.ft.com/rss/home/uk'),
        (u'US', u'https://www.ft.com/rss/home/us'),
        (u'Asia', u'https://www.ft.com/rss/home/asia'),
        (u'Middle East', u'https://www.ft.com/rss/home/middleeast')
    ]

    # --- Article clean-up rules (matched on CSS class names) ---
    keep_only_tags = [
        classes('article__header--wrapper article__time-byline article__body n-content-image barrier-grid__heading')
    ]

    remove_tags = [
        classes('n-content-related-box tour-tip')
    ]

    remove_attributes = ['width', 'height', 'lang', 'style']

    def get_browser(self):
        '''
        Return a browser that has visited the FT index page and, when both
        credentials are set, has gone through the two-step login forms
        (e-mail first, then password).
        '''
        browser = BasicNewsRecipe.get_browser(self)
        browser.open(self.INDEX)

        # Without a complete set of credentials there is nothing to submit.
        if self.username is None or self.password is None:
            return browser

        browser.open(self.LOGIN)
        login_steps = (
            ('enter-email-form', 'email', self.username),
            ('enter-password-form', 'password', self.password),
        )
        # Each step is its own form: select it, fill its one field, submit.
        for form_name, field_name, value in login_steps:
            browser.select_form(name=form_name)
            browser[field_name] = value
            browser.submit()
        return browser

    def cleanup(self):
        '''
        Open the logout URL with the recipe's browser.
        '''
        # NOTE(review): presumably invoked by calibre after the download
        # finishes, to end the ft.com session -- confirm against calibre docs.
        self.browser.open(self.LOGOUT)

    def preprocess_html(self, soup):
        '''
        Give every ``<img>`` that has a ``srcset`` a ``src`` derived from the
        first ``srcset`` candidate, then return the (mutated) soup.

        The new ``src`` is the text after the last ``/`` of that candidate,
        cut at the first ``?`` and URL-unquoted.
        '''
        # NOTE(review): this looks like it relies on FT image URLs carrying
        # the percent-encoded original image URL as their final path
        # segment -- verify against current ft.com markup.
        for image in soup.findAll('img', srcset=True):
            first_candidate = image['srcset'].split(',')[0].strip()
            last_segment = first_candidate.rpartition('/')[2]
            encoded_url = last_segment.partition('?')[0]
            image['src'] = unquote(encoded_url)
        return soup