diff --git a/recipes/indian_express.recipe b/recipes/indian_express.recipe index 0b91fb7f7c..4c7a2175a7 100644 --- a/recipes/indian_express.recipe +++ b/recipes/indian_express.recipe @@ -3,66 +3,115 @@ from calibre.web.feeds.news import BasicNewsRecipe def classes(classes): q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + return dict( + attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)} + ) class IndianExpress(BasicNewsRecipe): title = u'Indian Express' language = 'en_IN' __author__ = 'Krittika Goyal' - oldest_article = 1 # days + oldest_article = 1.15 # days max_articles_per_feed = 25 encoding = 'utf-8' masthead_url = 'https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg' no_stylesheets = True use_embedded_content = False - remove_attributes = ['style','height','width'] + remove_attributes = ['style', 'height', 'width'] ignore_duplicate_articles = {'url'} - extra_css = '#storycenterbyline {font-size:small};' + extra_css = ''' + #storycenterbyline {font-size:small;} + #img-cap {font-size:small;} + ''' + resolve_internal_links = True + remove_empty_feeds = True - keep_only_tags = [ - classes('heading-part full-details') - ] + keep_only_tags = [classes('heading-part full-details')] remove_tags = [ - dict(name='div', attrs={'id':'ie_story_comments'}), - dict(name='img', attrs={'src':'https://images.indianexpress.com/2021/06/explained-button-300-ie.jpeg'}), - dict(name='a', attrs={'href':'https://indianexpress.com/section/explained/?utm_source=newbanner'}), - dict(name='img', attrs={'src':'https://images.indianexpress.com/2021/06/opinion-button-300-ie.jpeg'}), - dict(name='a', attrs={'href':'https://indianexpress.com/section/opinion/?utm_source=newbanner'}), + dict(name='div', attrs={'id': 'ie_story_comments'}), + dict( + name='img', + attrs={ + 'src': + 'https://images.indianexpress.com/2021/06/explained-button-300-ie.jpeg' + } + ), + dict( + name='a', + attrs={ + 'href': + 'https://indianexpress.com/section/explained/?utm_source=newbanner' + } + ), + dict( + name='img', + attrs={ + 'src': + 'https://images.indianexpress.com/2021/06/opinion-button-300-ie.jpeg' + } + ), + dict( + name='a', + attrs={ + 'href': + 'https://indianexpress.com/section/opinion/?utm_source=newbanner' + } + ), classes( 'share-social appstext ie-int-campign-ad ie-breadcrumb custom_read_button unitimg copyright' ' storytags pdsc-related-modify news-guard premium-story append_social_share' + ' digital-subscriber-only h-text-widget' ) ] + feeds = [ ('Front Page', 'https://indianexpress.com/print/front-page/feed/'), - ('Op-Ed', 'http://indianexpress.com/section/opinion/feed/'), - ('Science & Technology', 'http://indianexpress.com/section/technology/feed/'), - ('Movie Reviews', 'https://indianexpress.com/section/entertainment/movie-review/feed/'), - ('Sunday Eye', 'https://indianexpress.com/print/eye/feed/'), - ('Explained', 'https://indianexpress.com/section/explained/feed/'), - ('Delhi Confidential', 'https://indianexpress.com/section/delhi-confidential/feed'), - ('Economy', 'https://indianexpress.com/print/economy/feed'), - ('Express Network', 'https://indianexpress.com/print/express-network/'), - ('Research', 'https://indianexpress.com/section/research/feed/'), - ('UPSC-CSE Key','https://indianexpress.com/section/upsc-current-affairs/feed/'), - ('World','https://indianexpress.com/section/world/feed/'), - ('Business', 'https://indianexpress.com/section/business/feed/'), - ('Political Pulse', 'https://indianexpress.com/section/political-pulse/feed'), ('India', 'https://indianexpress.com/section/india/feed/'), + ('Express Network', 'https://indianexpress.com/print/express-network/feed/'), + ( + 'Delhi Confidential', + 'https://indianexpress.com/section/delhi-confidential/feed/' + ), + ('Op-Ed', 'http://indianexpress.com/section/opinion/feed/'), + ('Explained', 'https://indianexpress.com/section/explained/feed/'), + ( + 'UPSC-CSE Key', + 'https://indianexpress.com/section/upsc-current-affairs/feed/' + ), + ('Research', 'https://indianexpress.com/section/research/feed/'), + ('Economy', 'https://indianexpress.com/print/economy/feed/'), ('Business', 'https://indianexpress.com/section/business/feed/'), - ('Education', 'https://indianexpress.com/section/education/feed/'), + ('World', 'https://indianexpress.com/section/world/feed/'), + ( + 'Movie Reviews', + 'https://indianexpress.com/section/entertainment/movie-review/feed/' + ), + ('Sunday Eye', 'https://indianexpress.com/print/eye/feed/'), + # ('Science & Technology', 'http://indianexpress.com/section/technology/feed/'), + # ('Political Pulse', 'https://indianexpress.com/section/political-pulse/feed'), + # ('Education', 'https://indianexpress.com/section/education/feed/'), # Want to add more? go-to:https://indianexpress.com/syndication/ ] def get_cover_url(self): - soup = self.index_to_soup('https://www.magzter.com/IN/The-Indian-Express-Ltd./The-Indian-Express-Mumbai/Newspaper/') - for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): + soup = self.index_to_soup( + 'https://www.magzter.com/IN/The-Indian-Express-Ltd./The-Indian-Express-Mumbai/Newspaper/' + ) + for citem in soup.findAll( + 'meta', content=lambda s: s and s.endswith('view/3.jpg') + ): return citem['content'] def preprocess_html(self, soup): + h2 = soup.findAll('h2') + for sub in h2: + sub.name = 'h4' + for span in soup.findAll( + 'span', attrs={'class': ['ie-custom-caption', 'custom-caption']} + ): + span['id'] = 'img-cap' for img in soup.findAll('img'): noscript = img.findParent('noscript') if noscript is not None: