#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal '
'''
usatoday.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

# Site origin, without a trailing slash, used to absolutize relative URLs.
BASE_URL = 'https://www.usatoday.com'


def classes(classes):
    """Build an ``attrs`` matcher selecting tags by CSS class.

    ``classes`` is a space-separated string of class names. The returned
    dict matches any tag whose ``class`` attribute shares at least one
    name with that string.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def absurl(url):
    """Return ``url`` as an absolute URL on the USA Today site.

    Root-relative URLs (``/path``) get the site origin prepended and
    protocol-relative URLs (``//host/path``) get an ``https:`` scheme.
    Anything else is returned unchanged instead of falling through and
    returning ``None``.
    """
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return BASE_URL + url
    return url


class USAToday(BasicNewsRecipe):
    """Recipe that builds its feeds from links on the USA Today front page."""

    title = 'USA Today'
    __author__ = 'Kovid Goyal, unkn0wn'
    description = 'newspaper'
    encoding = 'utf-8'
    language = 'en_US'
    use_embedded_content = False
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 25
    no_stylesheets = True

    keep_only_tags = [
        classes('gnt_ar_hl gnt_ar_by gnt_ar_b topper__inner in-depth-content'),
    ]

    remove_tags = [
        classes('component--pullquote__icon gnt_ss gnt_em_vp__tp'),
        dict(attrs={'aria-label': ['advertisement']}),
        dict(name=['link', 'media-gallery', 'button']),
    ]

    extra_css = '''
        h1, h2 { font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
        #post-attributes, .info, .clear { font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif; }
        #post-body, #content { font-size:medium; font-family:Arial,Helvetica,sans-serif; }
        .gnt_em_img_ccw__cap { font-size:small; text-align:center; }
    '''

    ignore_duplicate_articles = {'title', 'url'}
    resolve_internal_links = True
    remove_empty_feeds = True

    def parse_index(self):
        """Collect article links from the front page, grouped by section.

        Returns a list of ``(section_title, articles)`` tuples where each
        article is a dict with ``title`` and ``url`` keys. Sections with
        no matching links are omitted.
        """
        index = 'https://www.usatoday.com/'
        sections = [
            'news', 'opinion', 'tech', 'entertainment', 'money',
            'sports', 'travel', 'life', 'investigations',
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            prefix = '/story/' + sec + '/'
            # The bare section landing page is not an article. Build its URL
            # from the slash-less origin so it can actually equal what
            # absurl() produces (index ends in '/', which previously yielded
            # a '//story/...' URL that never matched).
            landing_url = BASE_URL + prefix
            articles = []
            for a in soup.findAll(
                'a',
                # Bind prefix as a default so the matcher does not depend on
                # the loop variable's later values.
                attrs={'href': lambda x, prefix=prefix: x and x.startswith(prefix)},
            ):
                # Drop the query string so tracking parameters do not create
                # distinct URLs for the same story.
                url = absurl(a['href'].split('?')[0])
                if url == landing_url:
                    continue
                title = self.tag_to_string(a)
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_html(self, soup):
        """Absolutize image sources and surface captions stored in attributes.

        Returns the same ``soup`` object after modifying it in place.
        """
        for img in soup.findAll('img', src=True):
            # Only site-relative sources need the origin; already-absolute
            # or data: sources must be left untouched.
            img['src'] = absurl(img['src'])
        for div in soup.findAll(attrs={'data-c-caption': True}):
            # Copy the caption text from the attribute into the element body.
            div.string = div['data-c-caption']
        return soup