#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Matthew Briggs'
__docformat__ = 'restructuredtext en'

'''
http://www.theaustralian.news.com.au/
'''
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag matcher for a space-separated list of CSS class names.

    The result has the shape ``{'attrs': {'class': predicate}}`` and is used
    below as an entry in ``keep_only_tags`` / ``remove_tags``.

    :param classes: class names separated by single spaces.
    :return: dict whose predicate is truthy when a tag's ``class`` value
        shares at least one name with ``classes``.
    """
    wanted = frozenset(classes.split(' '))

    def shares_wanted_class(value):
        # A falsy ``class`` value (missing/empty) is handed back unchanged;
        # otherwise the (possibly empty) set of matching names is returned.
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': shares_wanted_class}}


class DailyTelegraph(BasicNewsRecipe):
    """Recipe titled 'The Australian' built from the news.com.au content feeds.

    NOTE(review): the class name, the title and the feed host each point at a
    different masthead -- confirm which publication this recipe is meant for.
    """

    title = u'The Australian'
    __author__ = u'Kovid Goyal'
    description = (
        u'National broadsheet newspaper from down under - colloquially known as The Oz'
        '. You will need to have a subscription to '
        'http://www.theaustralian.com.au to get full articles.'
    )
    language = 'en_AU'

    oldest_article = 2
    needs_subscription = 'optional'
    max_articles_per_feed = 30
    remove_javascript = True
    no_stylesheets = True
    encoding = 'utf8'
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}

    # Content selectors: the headline element(s) plus the element with
    # id="story".
    keep_only_tags = [
        classes('story-headline'),
        dict(id='story'),
    ]

    # Page furniture (comments, toolbars, sidebars, video player chrome, ...)
    # selected by id or by class name.
    remove_tags = [
        dict(id='comments'),
        classes(
            'story-info story-header-tools module-controls story-sidebar'
            ' story-footer story-footer-tools story-extras story-related vms-nav'
            ' vms-endcard vms-discover share-tools story-comments-link'
            ' vms-controls ooyala-player vms-countdown vms-header comments'
        ),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        (u'National News',
         u'https://www.news.com.au/content-feeds/latest-news-national/'),
        (u'World News',
         u'https://www.news.com.au/content-feeds/latest-news-world/'),
        (u'Entertainment',
         u'https://www.news.com.au/content-feeds/latest-news-entertainment/'),
        (u'Technology',
         u'https://www.news.com.au/content-feeds/latest-news-technology/'),
        (u'Lifestyle',
         u'https://www.news.com.au/content-feeds/latest-news-lifestyle/'),
        (u'Sport',
         u'https://www.news.com.au/content-feeds/latest-news-sport/'),
        (u'Finance',
         u'https://www.news.com.au/content-feeds/latest-news-finance/'),
        (u'Travel',
         u'https://www.news.com.au/content-feeds/latest-news-travel/'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's link, or ``None`` for video links.

        :param article: feed entry exposing a ``link`` attribute.
        :return: the link unchanged, or ``None`` when it contains
            ``'/video/'`` (presumably so the entry is skipped -- confirm
            against the calibre feed parser).
        """
        url = article.link
        return None if '/video/' in url else url

    def preprocess_html(self, soup):
        """Copy ``data-src`` into ``src`` on every ``<img>`` that carries it.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        # Presumably these are lazy-loaded images whose real URL lives in
        # data-src -- verify against the live page markup.
        deferred_images = soup.findAll('img', attrs={'data-src': True})
        for tag in deferred_images:
            tag['src'] = tag['data-src']
        return soup