calibre/recipes/telegraph_uk.recipe

__license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
telegraph.co.uk
'''
from calibre.web.feeds.news import BasicNewsRecipe
import json


def classes(classes):
    # Build a BeautifulSoup attrs matcher that selects tags whose class
    # attribute contains any of the given space-separated class names.
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )


def absolutize(url):
    # Turn site-relative links into absolute telegraph.co.uk URLs.
    if url.startswith('/'):
        url = 'http://www.telegraph.co.uk' + url
    return url


class TelegraphUK(BasicNewsRecipe):
    title = 'The Telegraph (UK)'
    __author__ = 'A10KiloHam, based on work by Darko Miletic and Sujata Raman'
    description = 'News from United Kingdom'
    oldest_article = 2
    category = 'news, politics, UK'
    publisher = 'Telegraph Media Group ltd.'
    max_articles_per_feed = 100
    no_stylesheets = True
    language = 'en_GB'
    encoding = 'utf-8'
    needs_subscription = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    use_embedded_content = False

    INDEX = 'https://www.telegraph.co.uk/'
    LOGIN = 'https://secure.telegraph.co.uk/customer/secure/login/?redirectTo=https%3A%2F%2Fwww.telegraph.co.uk%2F'
    PREFIX = u'https://www.telegraph.co.uk'

    feeds = [
        (u'News', u'http://www.telegraph.co.uk/news/rss.xml'),
        (u'Politics', u'https://www.telegraph.co.uk/politics/rss.xml'),
        (u'Business', u'http://www.telegraph.co.uk/business/rss.xml'),
        (u'Money', u'http://www.telegraph.co.uk/money/rss.xml'),
        (u'Technology', u'http://www.telegraph.co.uk/technology/rss.xml'),
        (u'Science', u'http://www.telegraph.co.uk/science/rss.xml'),
        (u'Opinion', u'http://www.telegraph.co.uk/opinion/rss.xml'),
        (u'Travel', u'http://www.telegraph.co.uk/travel/rss.xml'),
        (u'Culture', u'http://www.telegraph.co.uk/culture/rss.xml'),
        (u'Lifestyle', u'http://www.telegraph.co.uk/lifestyle/rss.xml'),
        (u'Fashion', u'http://www.telegraph.co.uk/fashion/rss.xml'),
    ]

    keep_only_tags = [
        classes(
            'lead-asset-image-container headline__heading footer-author article-author__meta'
        ),
        dict(itemprop='articleBody'),
    ]
    remove_tags = [
        dict(name=['link', 'meta', 'style']),
        classes('videoPlayer'),
    ]
    remove_attributes = 'width height'.split()

    def get_cover_url(self):
        # Try today's front page image on kiosko.net; fall back to scraping
        # the kiosko index page for any 750px cover image.
        from datetime import date
        cover = ('http://img.kiosko.net/' + date.today().strftime('%Y/%m/%d') +
                 '/uk/daily_telegraph.750.jpg')
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            index = 'http://en.kiosko.net/uk/np/daily_telegraph.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover

    def get_browser(self, *a, **kw):
        USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)
        self.log('Forming login request...')
        if self.username is not None and self.password is not None:
            self.log('Starting login process...')
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['email'] = self.username
            br['password'] = self.password
            self.log('Sending login request...')
            br.submit()
        return br

    def get_article_url(self, article):
        # Skip picture-gallery links, which carry no useful article body.
        url = article.get('link', None)
        if url and ('picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url):
            url = None
        return url

    def preprocess_html(self, soup):
        # Convert the site's lazy-loaded image placeholders into plain <img>
        # tags so the downloaded pages contain real images.
        for img in soup.findAll(attrs={'data-frz-src-array': True}):
            img['style'] = ''
            img.name = 'img'
            d = json.loads(img['data-frz-src-array'].replace("'", '"'))
            for item in d:
                # Use the first variant wider than 700px.
                if int(item.get('width', 0)) > 700:
                    img['src'] = absolutize(item['src'])
                    break
        for img in soup.findAll('div', attrs={'data-js': 'LazyImage'}):
            img['style'] = ''
            img.name = 'img'
            img['src'] = img['data-srcset'].split()[0]
        return soup
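

# A minimal sketch of how to try this recipe from the command line, assuming
# calibre is installed and the file is saved as telegraph_uk.recipe (the
# output file name below is arbitrary). The --test option restricts the
# download to a couple of articles per feed, which is handy while adjusting
# keep_only_tags / remove_tags:
#
#   ebook-convert telegraph_uk.recipe telegraph.epub --test \
#       --username YOUR_EMAIL --password YOUR_PASSWORD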