From da6c7c6c3c69a57889de218b413b740eb57c0d9b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 2 Apr 2021 13:09:46 +0530 Subject: [PATCH] Update The Straits Times --- recipes/straitstimes.recipe | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/recipes/straitstimes.recipe b/recipes/straitstimes.recipe index 9ac9af1e86..a2badde417 100644 --- a/recipes/straitstimes.recipe +++ b/recipes/straitstimes.recipe @@ -5,10 +5,15 @@ __copyright__ = '2009, Darko Miletic ' www.straitstimes.com ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + + class StraitsTimes(BasicNewsRecipe): title = 'The Straits Times' __author__ = 'Darko Miletic' @@ -25,23 +30,14 @@ class StraitsTimes(BasicNewsRecipe): conversion_options = { 'comments': description, 'tags': category, 'language': language, 'publisher': publisher } - - preprocess_regexps = [ - (re.compile( - r'', - re.IGNORECASE | re.DOTALL), - lambda m:''), - (re.compile(r'', re.IGNORECASE | re.DOTALL), - lambda m: ''), + keep_only_tags = [ + classes('node-header node-subheadline group-byline-info group-updated-timestamp group-image-frame field-name-body') ] remove_tags = [ - dict(name=['object', 'link', 'map', 'style']), - dict(attrs={'class': 'st2014-realted-links'}), + classes('st_telegram_boilerplate'), + dict(name='source'), ] - keep_only_tags = [dict(name='div', attrs={'class': 'story'})] - remove_tags_after = dict(name='div', attrs={'class': 'hr_thin'}) - feeds = [ (u'Top of the News' , u'http://www.straitstimes.com/print/top-of-the-news/rss.xml') ,(u'World' , u'http://www.straitstimes.com/print/world/rss.xml')