diff --git a/recipes/fe_india.recipe b/recipes/fe_india.recipe index 56d84b9f65..9bcdfb0660 100644 --- a/recipes/fe_india.recipe +++ b/recipes/fe_india.recipe @@ -63,6 +63,11 @@ class FE_India(BasicNewsRecipe): ('Money','https://www.financialexpress.com/money/feed'), ] + def get_cover_url(self): + soup = self.index_to_soup('https://www.magzter.com/IN/The-Indian-Express-Ltd./Financial-Express-Mumbai/Business/') + for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): + return citem['content'] + def preprocess_html(self, soup, *a): for img in soup.findAll(attrs={'data-src': True}): img['src'] = img['data-src'] diff --git a/recipes/hindustan_times.recipe b/recipes/hindustan_times.recipe index 04cb111ce8..dd6168943e 100644 --- a/recipes/hindustan_times.recipe +++ b/recipes/hindustan_times.recipe @@ -49,6 +49,11 @@ class HindustanTimes(BasicNewsRecipe): # ('Budget',''https://www.hindustantimes.com/feeds/rss/budget/rssfeed.xml') ] + def get_cover_url(self): + soup = self.index_to_soup('https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Hindustan-Times-Delhi/Newspaper/') + for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): + return citem['content'] + def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'data-src': True}): img['src'] = img['data-src'] diff --git a/recipes/india_today.recipe b/recipes/india_today.recipe index 326ece1882..8e7d9f72b7 100644 --- a/recipes/india_today.recipe +++ b/recipes/india_today.recipe @@ -35,6 +35,13 @@ class IndiaToday(BasicNewsRecipe): ('Sports','https://www.indiatoday.in/rss/1206518'), ] + extra_css = '[itemprop^="description"] {font-size: small; font-style: italic;}' + + def get_cover_url(self): + soup = self.index_to_soup('https://www.magzter.com/IN/India-Today-Group/India-Today/News/') + for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): + return citem['content'] + def preprocess_raw_html(self, raw_html, url): from calibre.ebooks.BeautifulSoup import BeautifulSoup soup = BeautifulSoup(raw_html) diff --git a/recipes/the_week.recipe b/recipes/the_week.recipe index 92637043d2..11b927bb57 100644 --- a/recipes/the_week.recipe +++ b/recipes/the_week.recipe @@ -2,7 +2,6 @@ # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2021, Kovid Goyal - from calibre.web.feeds.news import BasicNewsRecipe @@ -20,6 +19,7 @@ class TheWeek(BasicNewsRecipe): no_stylesheets = True use_embedded_content = True ignore_duplicate_articles = {'url'} + remove_attributes = ['style', 'align', 'border', 'hspace'] feeds = [ ('Cover Story', 'https://www.theweek.in/theweek/cover.rss'), @@ -34,23 +34,24 @@ class TheWeek(BasicNewsRecipe): ] def get_cover_url(self): - soup = self.index_to_soup('https://www.theweek.in/theweek.html') - for img in soup.findAll('img', attrs={'data-src-web': lambda x: x and '/cover-magazine' in x}): - src = img['data-src-web'] - try: - idx = src.rfind('.image.') - except Exception: - pass - else: - if idx > -1: - src = src[:idx] - return 'https://img.theweek.in' + src + soup = self.index_to_soup( + 'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/' + ) + for citem in soup.findAll( + 'meta', content=lambda s: s and s.endswith('view/3.jpg') + ): + return citem['content'] def preprocess_html(self, soup): a = soup.find('a') - a.name = 'div' + if a: + a.name = 'div' h2 = soup.find('h2') - h2.string = fix_title(h2.string) + if h2: + h2.string = fix_title(h2.string) + for p in soup.findAll('p'): + if p.string == '\xa0': + p.decompose() return soup def populate_article_metadata(self, article, soup, first):