From 06325e93ba5c5e80817d1a4b8ad62a9ef6603a86 Mon Sep 17 00:00:00 2001 From: Shiva Prasad Date: Fri, 11 Jun 2021 02:51:22 +0530 Subject: [PATCH] Fix & improve The Hindu recipe Summary: * Fix: loading lead image of articles * Fix: avoid duplicating the subheading * Add: article summary to show up in TOC --- recipes/hindu.recipe | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe index b29b8f17ce..2ba4c55a4e 100644 --- a/recipes/hindu.recipe +++ b/recipes/hindu.recipe @@ -3,7 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' from calibre.web.feeds.news import BasicNewsRecipe -import string +import string, re def classes(classes): @@ -30,10 +30,26 @@ class TheHindu(BasicNewsRecipe): ] def preprocess_html(self, soup): - for img in soup.findAll('img', attrs={'data-src-template': True}): - img['src'] = img['data-src-template'].replace('BINARY/thumbnail', 'alternates/FREE_660') + img = soup.find('img', attrs={'class': 'lead-img'}) + try: + src = img.parent.find('source').get('srcset') + img['src'] = re.sub(r'(ALTERNATES)/.+?/', r'\1/FREE_660/', src) + except (TypeError, AttributeError): + pass + # Remove duplicate intro + for h in soup.findAll('h2', attrs={'class': 'intro'})[1:]: + h.extract() return soup + def populate_article_metadata(self, article, soup, first): + try: + desc = soup.find('meta', attrs={'name': 'description'}).get('content') + if not desc.startswith('Todays paper'): + desc += '...' if len(desc) >= 199 else '' # indicate truncation + article.text_summary = article.summary = desc + except AttributeError: + return + def articles_from_soup(self, soup): ans = [] div = soup.find('section', attrs={'id': 'section_'})