Update The Straits Times

This commit is contained in:
Kovid Goyal 2021-04-02 13:09:46 +05:30
parent aabe59dd54
commit da6c7c6c3c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -5,10 +5,15 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
www.straitstimes.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a whitespace-separated string of class names.

    Returns a dict with a single ``attrs`` key mapping ``'class'`` to a
    predicate. The predicate receives an element's ``class`` attribute value
    (a string, or a falsy value when the attribute is absent) and returns a
    truthy result when the element carries at least one of the wanted
    classes, and a falsy result otherwise.
    """
    # split() with no argument tolerates tabs, newlines and repeated spaces;
    # split(' ') would produce empty or glued-together names that can never
    # match, and would disagree with how the predicate tokenises below.
    q = frozenset(classes.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
class StraitsTimes(BasicNewsRecipe):
title = 'The Straits Times'
__author__ = 'Darko Miletic'
@ -25,23 +30,14 @@ class StraitsTimes(BasicNewsRecipe):
conversion_options = {
'comments': description, 'tags': category, 'language': language, 'publisher': publisher
}
preprocess_regexps = [
(re.compile(
r'<meta name="description" content="[^"]+"\s*/?>',
re.IGNORECASE | re.DOTALL),
lambda m:''),
(re.compile(r'<!--.+?-->', re.IGNORECASE | re.DOTALL),
lambda m: ''),
keep_only_tags = [
classes('node-header node-subheadline group-byline-info group-updated-timestamp group-image-frame field-name-body')
]
remove_tags = [
dict(name=['object', 'link', 'map', 'style']),
dict(attrs={'class': 'st2014-realted-links'}),
classes('st_telegram_boilerplate'),
dict(name='source'),
]
keep_only_tags = [dict(name='div', attrs={'class': 'story'})]
remove_tags_after = dict(name='div', attrs={'class': 'hr_thin'})
feeds = [
(u'Top of the News' , u'http://www.straitstimes.com/print/top-of-the-news/rss.xml')
,(u'World' , u'http://www.straitstimes.com/print/world/rss.xml')