#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec keyed on CSS class names.

    *classes* is a space-separated string of class names. The returned
    dict has the form ``{'attrs': {'class': predicate}}``; the predicate
    receives a tag's ``class`` attribute value and yields a truthy result
    when at least one of its whitespace-separated names is among the
    requested ones. A missing or empty attribute value is returned
    unchanged (falsy).
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Mirror short-circuit semantics: hand back the falsy value itself.
        if not value:
            return value
        return wanted.intersection(value.split())

    return {'attrs': {'class': has_wanted_class}}


class TheDailyMail(BasicNewsRecipe):
    """Recipe that builds The Daily Mail from the site's section RSS feeds."""

    title = u'The Daily Mail'
    __author__ = 'Kovid Goyal'
    language = 'en_GB'

    # Download limits and behaviour.
    oldest_article = 2
    max_articles_per_feed = 50
    simultaneous_downloads = 1
    use_embedded_content = False

    # Output tuning.
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 8

    # Tag specs selecting the headline, byline sections and article body.
    keep_only_tags = [
        dict(name='h1'),
        classes('author-section byline-section'),
        dict(itemprop='articleBody'),
    ]

    # Tag specs for the related-articles carousel.
    remove_tags = [
        classes('related-carousel'),
    ]

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Home', u'http://www.dailymail.co.uk/home/index.rss'),
        (u'News', u'http://www.dailymail.co.uk/news/index.rss'),
        (u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'),
        (u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'),
        (u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'),
        (u'Health', u'http://www.dailymail.co.uk/health/index.rss'),
        (u'Science and Technology', u'http://www.dailymail.co.uk/sciencetech/index.rss'),
        (u'Money', u'http://www.dailymail.co.uk/money/index.rss'),
        (u'Property', u'http://www.dailymail.co.uk/property/index.rss'),
        (u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'),
        (u'Travel', u'http://www.dailymail.co.uk/travel/index.rss'),
    ]

    def preprocess_html(self, soup):
        """Normalise a parsed article before further processing.

        Copies each ``data-src`` attribute into ``src`` on the tag that
        carries it, then removes every ``<h1>`` after the first one.
        Returns the same *soup* object, modified in place.
        """
        # Any tag carrying data-src gets it mirrored into src.
        for lazy_tag in soup.findAll(attrs={'data-src': True}):
            lazy_tag['src'] = lazy_tag['data-src']

        # Keep only the first headline; later <h1> elements are detached.
        for position, heading in enumerate(soup.findAll('h1')):
            if position > 0:
                heading.extract()

        return soup