From 9aa548bd3be1b0cd1fa4d7ea160171353cada942 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 3 Jan 2022 17:26:35 +0530
Subject: [PATCH] Update MIT Technology Review

---
 recipes/mit_technology_review.recipe | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/recipes/mit_technology_review.recipe b/recipes/mit_technology_review.recipe
index e67bef1e0b..50db7b8e08 100644
--- a/recipes/mit_technology_review.recipe
+++ b/recipes/mit_technology_review.recipe
@@ -7,8 +7,7 @@ __copyright__ = '2015 Michael Marotta '
 '''
 technologyreview.com
 '''
-from calibre.web.feeds.news import BasicNewsRecipe
-import re
+from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
 
 
 def absurl(x):
@@ -38,26 +37,14 @@ class MitTechnologyReview(BasicNewsRecipe):
     tags = 'news, technology, science'
     no_stylesheets = True
 
-    """
-    regex for class names
-    """
-    articleHeaderRegex= '^.*contentHeader__wrapper.*$'
-    editorLetterHeaderRegex = "^.*contentHeader--vertical__wrapper.*$"
-    articleContentRegex = "^.*contentbody__wrapper.*$"
-    imagePlaceHolderRegex = "^.*image__placeholder.*$"
-    advertisementRegex = "^.*sliderAd__wrapper.*$"
-
     keep_only_tags = [
-        dict(name='header', attrs={'class': re.compile(editorLetterHeaderRegex, re.IGNORECASE)}),
-        dict(name='header', attrs={'class': re.compile(articleHeaderRegex, re.IGNORECASE)}),
-        dict(name='div', attrs={'class': re.compile(articleContentRegex, re.IGNORECASE)})
+        prefixed_classes('contentHeader contentArticleHeader contentBody')
     ]
     remove_tags = [
         dict(name="aside"),
         dict(name="svg"),
         dict(name="blockquote"),
-        dict(name="img", attrs={'class': re.compile(imagePlaceHolderRegex, re.IGNORECASE)}),
-        dict(name="div", attrs={'class': re.compile(advertisementRegex, re.IGNORECASE)}),
+        prefixed_classes('image__placeholder sliderAd__wrapper'),
     ]
 
     def parse_index(self):