From 285a538e37c599f9f21731c0316394267192e2fc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 21 Jul 2023 13:58:17 +0530 Subject: [PATCH] Write a parse_index for Toronto Star It still needs subscription which I can't be bothered with --- recipes/thestar.recipe | 56 +++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/recipes/thestar.recipe b/recipes/thestar.recipe index f25e9903ad..d50e0dac6a 100644 --- a/recipes/thestar.recipe +++ b/recipes/thestar.recipe @@ -4,13 +4,15 @@ __copyright__ = '2009-2013, Darko Miletic ' www.thestar.com ''' +from collections import defaultdict -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}) +from calibre.web.feeds.news import BasicNewsRecipe, classes -from calibre.web.feeds.news import BasicNewsRecipe +def absolutize(url): + if url.startswith('/'): + url = 'https://www.thestar.com' + url + return url class TheTorontoStar(BasicNewsRecipe): @@ -25,28 +27,38 @@ class TheTorontoStar(BasicNewsRecipe): delay = 2 publisher = 'The Toronto Star' encoding = 'utf-8' - masthead_url = 'http://www.thestar.com/etc/designs/thestar/images/general/logoLrg.png' keep_only_tags = [ - classes('o-main-content') + classes('headline asset-summary metaPrimary tsArticleContainer') ] remove_tags = [ - classes( - 'article-continue-basic-container label-modal-bottom print-header' - ' cta-container share-toolbar-container c-related-articles c-partner-articles c-more-articles c-top-articles'), + classes('share-container shareIcons articleFeedbackCTA asset-comments'), dict(name='button') ] - remove_tags_after = [ - classes('article-content-container') - ] - feeds = [ - (u'News', u'http://www.thestar.com/feeds.articles.news.rss'), - (u'Opinion', u'http://www.thestar.com/feeds.articles.opinion.rss'), - (u'Business', u'http://www.thestar.com/feeds.articles.business.rss'), - (u'Sports', 
u'http://www.thestar.com/feeds.articles.sports.rss'), - (u'Entertainment', u'http://www.thestar.com/feeds.articles.entertainment.rss'), - (u'Living', u'http://www.thestar.com/feeds.articles.life.rss'), - (u'Travel', u'http://www.thestar.com/feeds.articles.life.travel.rss'), - (u'Technology', u'http://www.thestar.com/feeds.articles.life.technology.rss') - ] + def parse_section(self, section): + for article in section.findAll('article'): + a = article.find('a', attrs={'class': 'tnt-asset-link', 'href':True, 'aria-label': True}) + if a is not None: + title = a['aria-label'] + url = absolutize(a['href']) + desc = '' + summ = article.find(attrs={'class':'tnt-summary'}) + if summ is not None: + desc = self.tag_to_string(summ) + section = '' + sec = article.find(attrs={'class':'label-flag-section'}) + if sec is not None: + section = self.tag_to_string(sec).strip().lower().capitalize() + if section == 'Gta': + section = 'GTA' + self.log(section + ':', title) + yield section, {'title': title, 'url': url, 'description': desc} + + def parse_index(self): + soup = self.index_to_soup('https://www.thestar.com/') + ans = defaultdict(list) + for section in soup.findAll('section', attrs={'class': 'block'}): + for sec, article in self.parse_section(section): + ans[sec].append(article) + return list(ans.items())