From b7d82235b88e4a1cace9bf0e9579ed169e1b9512 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 15 May 2020 09:41:15 +0530 Subject: [PATCH] Update ABC News --- recipes/abc_au.recipe | 113 +++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 40 deletions(-) diff --git a/recipes/abc_au.recipe b/recipes/abc_au.recipe index eae6cee270..974cc3c0d7 100644 --- a/recipes/abc_au.recipe +++ b/recipes/abc_au.recipe @@ -1,61 +1,94 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import unicode_literals, division, absolute_import, print_function + __license__ = 'GPL v3' -__copyright__ = '2011, Pat Stapleton ' +__copyright__ = '2020, Pat Stapleton ' ''' -abc.net.au/news +Recipe for ABC News Australia (online) ''' -import re -from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe class ABCNews(BasicNewsRecipe): - title = 'ABC News' - __author__ = 'Pat Stapleton, Dean Cording, James Cridland' + title = 'ABC News' + language = 'en_AU' + __author__ = 'Pat Stapleton' description = 'From the Australian Broadcasting Corporation. The ABC is owned and funded by the Australian Government, but is editorially independent.' masthead_url = 'https://www.abc.net.au/cm/lb/8212706/data/news-logo-2017---desktop-print-data.png' cover_url = 'https://www.abc.net.au/news/linkableblob/8413676/data/abc-news-og-data.jpg' cover_margins = (0,20,'#000000') - oldest_article = 2 - handle_gzip = True - no_stylesheets = True - use_embedded_content = False scale_news_images_to_device = True - encoding = 'utf8' - publisher = 'ABC News' - category = 'Australia,News' - language = 'en_AU' + oldest_article = 7 # days + max_articles_per_feed = 100 publication_type = 'newspaper' + +# auto_cleanup = True # enable this as a backup option if recipe stops working + +# use_embedded_content = False # if set to true will assume that all the article content is within the feed (i.e. 
won't try to fetch more data) + + no_stylesheets = True + remove_javascript = True + + keep_only_tags = [dict(id='content')] # the article content is contained in the id="content"
tag + + # ************************************ + # Regular expressions for remove_tags: + # ************************************ + # remove aside tag - used for overlapping boxes within article + # aside_reg_exp = '^.*aside.*$' + + # ************************************ + # Clear out all the unwanted html tags: + # ************************************ + remove_tags = [ +# dict(name='aside', attrs={'name': re.compile(aside_reg_exp, re.IGNORECASE)}) + { + 'name': ['meta', 'link', 'noscript', 'aside'] + }, + { + 'attrs': { + 'data-component': ['Ticker', 'PublishedDate', 'Timestamp', 'Link', 'ShareLink', 'ShareUtility', + 'RelatedStories', 'ArticleTopStories', 'ArticleTopStoriesCard', 'ArticleJustInStories', + 'RelatedTopics', 'Player', 'ArticleSidebar', 'TopStoriesSidebar', 'UtilityBar'] + } + } + ] + + # ************************************ + # Tidy up the output to look neat for reading + # ************************************ + remove_attributes = ['width', 'height', 'style'] extra_css = '.byline{font-size:smaller;margin-bottom:10px;}.inline-caption{display:block;font-size:smaller;text-decoration: none;}' - preprocess_regexps = [(re.compile( - r'