diff --git a/recipes/india_today.recipe b/recipes/india_today.recipe index fb6d4a77af..00c805daa4 100644 --- a/recipes/india_today.recipe +++ b/recipes/india_today.recipe @@ -7,17 +7,27 @@ class IndiaToday(BasicNewsRecipe): __author__ = 'unkn0wn' no_stylesheets = True use_embedded_content = False - remove_attributes = ['style','height','width'] + remove_attributes = ['style', 'height', 'width'] ignore_duplicate_articles = {'url'} extra_css = '[itemprop^="description"] {font-size: small; font-style: italic;}' description = ( 'India’s Most Reputed, Credible and Popular news magazine.' - ' Read the most preferred magazine of 9.5 million Indians to access highly researched and unbiased content.') + ' Read the most preferred magazine of 9.5 million Indians to access highly researched and unbiased content.' + ) masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png' + extra_css = ''' + .body_caption{font-size:small;} + .image-alt{font-size:small;} + ''' + def get_cover_url(self): - soup = self.index_to_soup('https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154') - for citem in soup.findAll('meta', content=lambda s: s and s.endswith('/magazine/300/new')): + soup = self.index_to_soup( + 'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154' + ) + for citem in soup.findAll( + 'meta', content=lambda s: s and s.endswith('/magazine/300/new') + ): return citem['content'].replace('300', '600') keep_only_tags = [ @@ -32,32 +42,40 @@ class IndiaToday(BasicNewsRecipe): section = None sections = {} - for tag in soup.findAll('div', attrs={'class':['magazin-top-left', 'section-ordering']}): + for tag in soup.findAll( + 'div', attrs={'class': ['magazin-top-left', 'section-ordering']} + ): sec = tag.find('span') section = self.tag_to_string(sec) self.log(section) sections[section] = [] - for a in tag.findAll('a', href=lambda x: x and x.startswith(("/magazine/cover-story/story/", "https://www.indiatoday.in/magazine/"))): + for a in tag.findAll( + 'a', + href=lambda x: x and x.startswith(( + "/magazine/cover-story/story/", + "https://www.indiatoday.in/magazine/" + )) + ): url = a['href'] if url.startswith('https'): url = url else: url = 'https://www.indiatoday.in' + url - title = self.tag_to_string(a) - empty = " " - if title is empty: - url = '' + title = self.tag_to_string(a).strip() + if not url or not title: + continue self.log('\t', title) self.log('\t\t', url) - sections[section].append({ - 'title': title, - 'url': url}) + sections[section].append({'title': title, 'url': url}) def sort_key(x): section = x[0] try: - return ('EDITOR\'S NOTE', 'Cover Story', 'The Big Story', 'Upfront', 'NATION', 'INTERVIEW').index(section) + return ( + 'EDITOR\'S NOTE', 'Cover Story', 'The Big Story', 'Upfront', + 'NATION', 'INTERVIEW' + ).index(section) except Exception: return 99999999 @@ -66,6 +84,15 @@ class IndiaToday(BasicNewsRecipe): def preprocess_raw_html(self, raw_html, url): from calibre.ebooks.BeautifulSoup import BeautifulSoup soup = BeautifulSoup(raw_html) + for div in soup.findAll('div', attrs={'id': 'premium_content_data'}): + div.extract() + for tv in soup.findAll( + 'div', + attrs={ + 'class': ['live-tv-ico', 'sendros', 'live-tv-ico-st', 'sendros-st'] + } + ): + tv.extract() for script in soup.findAll('script'): script.extract() for style in soup.findAll('style'):