Update MIT Technology Review

This commit is contained in:
Kovid Goyal 2022-01-03 17:26:35 +05:30
parent 6a3309175a
commit 9aa548bd3b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -7,8 +7,7 @@ __copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
''' '''
technologyreview.com technologyreview.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
import re
def absurl(x): def absurl(x):
@@ -38,26 +37,14 @@ class MitTechnologyReview(BasicNewsRecipe):
tags = 'news, technology, science' tags = 'news, technology, science'
no_stylesheets = True no_stylesheets = True
"""
regex for class names
"""
articleHeaderRegex= '^.*contentHeader__wrapper.*$'
editorLetterHeaderRegex = "^.*contentHeader--vertical__wrapper.*$"
articleContentRegex = "^.*contentbody__wrapper.*$"
imagePlaceHolderRegex = "^.*image__placeholder.*$"
advertisementRegex = "^.*sliderAd__wrapper.*$"
keep_only_tags = [ keep_only_tags = [
dict(name='header', attrs={'class': re.compile(editorLetterHeaderRegex, re.IGNORECASE)}), prefixed_classes('contentHeader contentArticleHeader contentBody')
dict(name='header', attrs={'class': re.compile(articleHeaderRegex, re.IGNORECASE)}),
dict(name='div', attrs={'class': re.compile(articleContentRegex, re.IGNORECASE)})
] ]
remove_tags = [ remove_tags = [
dict(name="aside"), dict(name="aside"),
dict(name="svg"), dict(name="svg"),
dict(name="blockquote"), dict(name="blockquote"),
dict(name="img", attrs={'class': re.compile(imagePlaceHolderRegex, re.IGNORECASE)}), prefixed_classes('image__placeholder sliderAd__wrapper'),
dict(name="div", attrs={'class': re.compile(advertisementRegex, re.IGNORECASE)}),
] ]
def parse_index(self): def parse_index(self):