From adef43ddab029af03b0900664cdfc4753a4023e7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 1 Jan 2021 19:55:56 +0530
Subject: [PATCH] Update USA Today

---
 recipes/usatoday.recipe | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/recipes/usatoday.recipe b/recipes/usatoday.recipe
index a3ca70d14c..570f343aa6 100644
--- a/recipes/usatoday.recipe
+++ b/recipes/usatoday.recipe
@@ -9,14 +9,18 @@ usatoday.com
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 class USAToday(BasicNewsRecipe):
 
     title = 'USA Today'
     __author__ = 'Kovid Goyal'
     description = 'newspaper'
     encoding = 'utf-8'
-    publisher = 'usatoday.com'
-    category = 'news, usa'
     language = 'en'
 
     use_embedded_content = False
@@ -24,7 +28,16 @@ class USAToday(BasicNewsRecipe):
     max_articles_per_feed = 15
     no_stylesheets = True
     remove_empty_feeds = True
-    filterDuplicates = True
+
+    keep_only_tags = [
+        classes('gnt_ar_hl gnt_ar_by gnt_ar_b topper__inner in-depth-content'),
+    ]
+
+    remove_tags = [
+        classes('component--pullquote__icon gnt_ss'),
+        dict(attrs={'aria-label': ['advertisement']}),
+        dict(name=['link', 'media-gallery']),
+    ]
 
     extra_css = '''
     h1, h2 {
@@ -64,15 +77,3 @@ class USAToday(BasicNewsRecipe):
         ('Most Popular',
          'http://rssfeeds.usatoday.com/usatoday-mostviewedarticles&x=1'),
     ]
-
-    auto_cleanup = True
-
-    def get_masthead_url(self):
-        masthead = 'http://i.usatoday.net/mobile/_common/_images/565x73_usat_mobile.gif'
-        br = BasicNewsRecipe.get_browser(self)
-        try:
-            br.open(masthead)
-        except:
-            self.log("\nCover unavailable")
-            masthead = None
-        return masthead