From c6f40028927bf80a1ef210823e3cb4757b4844b2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 23 Nov 2022 15:00:27 +0530 Subject: [PATCH] Minor updates to various recipes --- recipes/frontline.recipe | 91 +++++++++---------- recipes/nautilus.recipe | 2 +- recipes/open_magazine.recipe | 7 +- recipes/outlook_india.recipe | 5 +- ...heeconomictimes_india_print_edition.recipe | 9 +- 5 files changed, 59 insertions(+), 55 deletions(-) diff --git a/recipes/frontline.recipe b/recipes/frontline.recipe index eb4d1459e2..b29f86f61c 100644 --- a/recipes/frontline.recipe +++ b/recipes/frontline.recipe @@ -1,8 +1,6 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 +from collections import defaultdict from calibre.web.feeds.news import BasicNewsRecipe, classes - class Frontline(BasicNewsRecipe): title = u'Frontline' __author__ = 'unkn0wn' @@ -12,56 +10,57 @@ class Frontline(BasicNewsRecipe): remove_javascript = True use_embedded_content = False encoding = 'utf-8' - oldest_article = 14 - max_articles_per_feed = 50 ignore_duplicate_articles = {'url'} - # masthead_url = 'https://fl.thgim.com/static/theme/default/base/img/fllogo.png' + masthead_url = 'https://frontline.thehindu.com/theme/images/fl-online/frontline-logo.png' remove_attributes = ['height', 'width'] - - def get_cover_url(self): - soup = self.index_to_soup( - 'https://frontline.thehindu.com/current-issue/') - tag = soup.find(attrs={'class': 'sptar-image'}) - if tag: - self.cover_url = tag.find('img')['data-original'] - return super().get_cover_url() - - # https://fl.thgim.com/incoming/b5zy2g/article38454943.ece/alternates/FREE_100/coverpng + resolve_internal_links = True + extra_css = ''' + .overline{ font-size:small; color:#404040; } + .person-name { font-size:small; font-weight:bold; } + .lead-img-caption, .caption-cont { font-size:small; text-align:center; } + ''' keep_only_tags = [ - classes( - 'overline mainart-title marginBottom10px articleBottomLine swiper-slide slide-caption artlead-text body-main article-container' - ) + classes('article') ] - remove_tags = [classes('dispatche-middle bigtitle')] - - remove_tags_after = [ - classes('body-main'), + + remove_tags = [ + classes('shareicon-article articleBottomLine secheader mobilesocialicons'), + dict(name='h2', attrs={'class':'title'}) ] - - feeds = [ - ('Cover Story', - 'https://frontline.thehindu.com/cover-story/feeder/default.rss'), - ('The Nation', - 'https://frontline.thehindu.com/the-nation/feeder/default.rss'), - ('World Affairs', - 'https://frontline.thehindu.com/world-affairs/feeder/default.rss'), - ('Politics', - 'https://frontline.thehindu.com/politics/feeder/default.rss'), - ('Arts & Culture', - 'https://frontline.thehindu.com/arts-and-culture/feeder/default.rss'), - ('Social Issues', - 'https://frontline.thehindu.com/social-issues/feeder/default.rss'), - ('Books', 'https://frontline.thehindu.com/books/feeder/default.rss'), - ('Columns', - 'https://frontline.thehindu.com/columns/feeder/default.rss'), - ('Others', 'https://frontline.thehindu.com/other/feeder/default.rss'), - ] - + def preprocess_html(self, soup): - for source in soup.findAll('source', srcset=True, attrs={'media':'(min-width: 1600px)'}): - source.name = 'img' - source['src'] = source['srcset'] for img in soup.findAll('img', attrs={'data-original':True}): img['src'] = img['data-original'] + for cap in soup.findAll(**classes('caption-cont')): + cap.name = 'figcaption' return soup + + def parse_index(self): + soup = self.index_to_soup('https://frontline.thehindu.com/magazine/') + issue = soup.find(**classes('sptar-archive-item')).find('a')['href'] + self.log(issue) + soup = self.index_to_soup(issue) + time = soup.find(**classes('date')).findNext('h3') + if time: + self.timefmt = ' ' + self.tag_to_string(time) + self.log('Downloading Issue:', self.timefmt) + self.cover_url = soup.find(**classes('sptar-cover-item')).find('img')['data-original'].replace('FREE_320', 'FREE_810') + feeds_dict = defaultdict(list) + for div in soup.findAll('div', attrs={'class':'brief-list-item'}): + a = div.find(**classes('brief-title')).find('a') + url = a['href'] + title = self.tag_to_string(a) + section = 'Articles' + cat = div.find(**classes('brief-cat')) + if cat: + section = self.tag_to_string(cat) + desc = '' + art = div.find(**classes('artbody')) + if art: + desc = self.tag_to_string(art) + if not url or not title: + continue + self.log(section, '\n\t', title, '\n\t', desc, '\n\t\t', url) + feeds_dict[section].append({"title": title, "url": url}) + return [(section, articles) for section, articles in feeds_dict.items()] \ No newline at end of file diff --git a/recipes/nautilus.recipe b/recipes/nautilus.recipe index 2572de4a93..4470af6aa2 100644 --- a/recipes/nautilus.recipe +++ b/recipes/nautilus.recipe @@ -64,7 +64,7 @@ class Nautilus(BasicNewsRecipe): def get_cover_url(self): soup = self.index_to_soup('https://www.presspassnow.com/nautilus/issues/') - div = soup.find('div', **classes('image-fade_in_back')) + div = soup.find('li', **classes('product')) if div: self.cover_url = div.find('img', src=True)['src'] return getattr(self, 'cover_url', self.cover_url) diff --git a/recipes/open_magazine.recipe b/recipes/open_magazine.recipe index 94aab6020b..be8c04930b 100644 --- a/recipes/open_magazine.recipe +++ b/recipes/open_magazine.recipe @@ -24,7 +24,7 @@ class OpenMagazine(BasicNewsRecipe): soup = self.index_to_soup('https://openthemagazine.com/') tag = soup.find(attrs={'class': 'magazine-item mr-1'}) if tag: - self.cover_url = tag.find('img')['data-src'] + self.cover_url = tag.find('img')['src'] return getattr(self, 'cover_url', None) keep_only_tags = [ @@ -48,8 +48,3 @@ class OpenMagazine(BasicNewsRecipe): ('Art & Culture', 'https://openthemagazine.com/art-culture/feed'), ('Cinema', 'https://openthemagazine.com/cinema/feed'), ] - - def preprocess_html(self, soup): - for img in soup.findAll('img', attrs={'data-src':True}): - img['src'] = img['data-src'] - return soup diff --git a/recipes/outlook_india.recipe b/recipes/outlook_india.recipe index 0693ba0dca..8c4f5bb7e1 100644 --- a/recipes/outlook_india.recipe +++ b/recipes/outlook_india.recipe @@ -18,11 +18,13 @@ class outlook(BasicNewsRecipe): remove_attributes = ['height', 'width', 'style'] ignore_duplicate_articles = {'url'} resolve_internal_links = True + masthead_url = 'https://www.outlookindia.com/images/home_new_v4/logo_outlook.svg' keep_only_tags = [classes('__story_detail')] remove_tags = [ classes( - 'social_sharing_article left_trending left-sticky __tag_links next_prev_stories downarrow uparrow more_from_author_links next prev' + 'social_sharing_article left_trending left-sticky __tag_links' + ' next_prev_stories downarrow uparrow more_from_author_links next prev __related_stories_thumbs' ) ] @@ -32,6 +34,7 @@ class outlook(BasicNewsRecipe): a = div.find('a', href=lambda x: x and x.startswith('/magazine/issue/')) url = a['href'] self.log('Downloading issue:', url) + self.timefmt = ' [' + self.tag_to_string(a) + ']' soup = self.index_to_soup('https://www.outlookindia.com' + url) cover = soup.find(**classes('listingPage_lead_story')) self.cover_url = cover.find('img', attrs={'src': True})['src'] diff --git a/recipes/theeconomictimes_india_print_edition.recipe b/recipes/theeconomictimes_india_print_edition.recipe index d1cb6c6575..35e358b7e9 100644 --- a/recipes/theeconomictimes_india_print_edition.recipe +++ b/recipes/theeconomictimes_india_print_edition.recipe @@ -22,7 +22,7 @@ class TheEconomicTimes(BasicNewsRecipe): masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/9/98/The_Economic_Times_logo.svg' ignore_duplicate_articles = {'title', 'url'} extra_css = ''' - .summary {font-weight:normal; font-size:normal; font-style:italic;} + .summary {color:#404040; font-style:italic;} time{font-size:small;} ''' @@ -59,6 +59,10 @@ class TheEconomicTimes(BasicNewsRecipe): soup = self.index_to_soup( 'https://economictimes.indiatimes.com/print_edition.cms' ) + date = soup.find(**classes('labelDate')) + if date: + self.timefmt = ' [' + self.tag_to_string(date).strip() + ']' + self.log(self.timefmt) ans = self.et_parse_index(soup) return ans @@ -91,6 +95,9 @@ class TheEconomicTimes(BasicNewsRecipe): return feeds def preprocess_html(self, soup): + h2 = soup.find(**classes('summary')) + if h2: + h2.name = 'p' for image in soup.findAll('img', attrs={'src': True}): image['src'] = image['src'].replace("width-300", "width-640") for img in soup.findAll('img', attrs={'data-original': True}):