#!/usr/bin/env python
'''
Hamilton Spectator Calibre Recipe
'''
from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url):
    '''
    Return an absolute thespec.com URL for a site-relative path.

    A path starting with '/' is prefixed with the site origin. Any other
    value is returned unchanged rather than being silently turned into None.
    '''
    if url.startswith('/'):
        return 'https://www.thespec.com' + url
    return url


class HamiltonSpectator(BasicNewsRecipe):
    '''Recipe that builds its feeds from links on the thespec.com front page.'''

    title = u'Hamilton Spectator'
    max_articles_per_feed = 50
    __author__ = u'unkn0wn'
    publisher = u'thespec.com'
    description = u'Ontario Canada Newspaper'
    category = u'News, Ontario, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'en_CA'
    encoding = 'utf-8'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://bloximages.chicago2.vip.townnews.com/thespec.com/content/tncms/custom/image/c0094646-1108-11ee-8af0-b3954ce40e5e.png'
    ignore_duplicate_articles = {'title', 'url'}

    extra_css = '''
        .caption { font-size:small; text-align:center; }
        .authorList, .endnote_contrib { font-size:small; }
    '''

    # Only elements carrying one of these classes are kept from the page.
    keep_only_tags = [
        classes(
            'headline asset-summary authorList articleMainArt asset-body'
        )
    ]

    # Elements matched here are removed from what was kept above.
    remove_tags = [
        dict(name=['svg', 'button']),
        dict(attrs={'id': [
            'tncms-region-article_instory_top',
            'tncms-region-article_bottom',
            'asset-video-primary'
        ]}),
        classes(
            'tnt-blurred-image share-container subscriber-offers access-offers-in-page '
            'access-offers-wrapper tnt-ads-container adLabelWrapperManual shareIcons '
            'articleFeedbackCTA comments-container card-image'
        )
    ]

    def preprocess_html(self, soup):
        '''
        Point each <img> that has a data-srcset at its 640w candidate.

        Images whose data-srcset has no '640w' entry are left untouched.
        Returns the same soup object, modified in place.
        '''
        for img in soup.findAll('img', attrs={'data-srcset': True}):
            # data-srcset is a comma separated list of "<url> <width>w" entries.
            for candidate in img['data-srcset'].split(','):
                parts = candidate.split()
                if '640w' in parts:
                    img['src'] = parts[0]
        return soup

    def parse_index(self):
        '''
        Build the feed list from the links found on the front page.

        For every configured section, collect the front-page anchors whose
        href starts with '/<section>/' and ends in '.html'. Returns a list
        of (section title, [{'title': ..., 'url': ...}, ...]) tuples;
        sections without any article are omitted.
        '''
        index = 'https://www.thespec.com/'
        sections = [
            'news', 'politics', 'opinion', 'business', 'sports', 'life', 'entertainment'
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            articles = []
            prefix = '/' + sec + '/'
            # Bind prefix as a default so the matcher does not rely on the
            # loop variable still having this value when it is called.
            for a in soup.findAll(
                'a', attrs={'href': lambda x, prefix=prefix: x and x.startswith(prefix)}
            ):
                # Drop any fragment so the same article is not listed twice.
                url = absurl(a['href'].split('#')[0])
                # Skip links to the section landing page itself.
                if url in {index + sec + '/', index + sec}:
                    continue
                # Article pages end in .html; anything else is navigation.
                if not url.endswith('.html'):
                    continue
                title = self.tag_to_string(a)
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds