from calibre.web.feeds.news import BasicNewsRecipe, classes


class IndiaToday(BasicNewsRecipe):
    """Recipe that builds the India Today magazine issue from indiatoday.in.

    The article index is scraped from the magazine landing page, the cover
    image is looked up on the magazine's Magzter page, and each article's raw
    HTML is cleaned before further processing.
    """

    title = u'India Today Magazine'
    language = 'en_IN'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    extra_css = '[itemprop^="description"] {font-size: small; font-style: italic;}'
    description = (
        'India’s Most Reputed, Credible and Popular news magazine.'
        ' Read the most preferred magazine of 9.5 million Indians to access highly researched and unbiased content.')
    masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'

    # Only the headline, the kicker/side blocks and the article body are kept.
    keep_only_tags = [
        dict(name='h1'),
        classes('story-kicker story-right'),
        dict(itemProp='articleBody'),
    ]

    def get_cover_url(self):
        """Return the cover image URL found on the Magzter page, or None.

        The first ``<meta>`` tag whose ``content`` attribute ends with
        ``view/3.jpg`` is used as the cover.
        """
        soup = self.index_to_soup('https://www.magzter.com/IN/India-Today-Group/India-Today/News/')
        cover_meta = soup.find(
            'meta', content=lambda s: s and s.endswith('view/3.jpg'))
        if cover_meta is None:
            return None
        return cover_meta['content']

    def parse_index(self):
        """Scrape the magazine landing page into ``(section, articles)`` pairs.

        Every ``div`` with class ``magazin-top-left`` or ``section-ordering``
        is treated as one section, named after its first ``<span>``. Each
        article is a dict with ``title`` and ``url`` keys. The result is a
        list sorted so that the well-known sections come first, in a fixed
        order, followed by all remaining sections.
        """
        soup = self.index_to_soup('https://www.indiatoday.in/magazine')
        link_prefixes = (
            "/magazine/cover-story/story/",
            "https://www.indiatoday.in/magazine/",
        )
        sections = {}

        for tag in soup.findAll('div', attrs={'class': ['magazin-top-left', 'section-ordering']}):
            section = self.tag_to_string(tag.find('span'))
            self.log(section)
            # setdefault: two containers sharing a heading must not wipe the
            # articles already collected for that section.
            articles = sections.setdefault(section, [])

            for a in tag.findAll('a', href=lambda x: x and x.startswith(link_prefixes)):
                title = self.tag_to_string(a)
                # Links without visible text (presumably image/thumbnail
                # anchors) are skipped. The previous ``title is " "`` identity
                # test never matched, so blank entries were still emitted.
                if not title.strip():
                    continue

                url = a['href']
                if not url.startswith('https'):
                    # Site-relative link: make it absolute.
                    url = 'https://www.indiatoday.in' + url

                self.log('\t', title)
                self.log('\t\t', url)
                articles.append({'title': title, 'url': url})

        preferred_order = (
            'EDITOR\'S NOTE', 'Cover Story', 'The Big Story', 'Upfront',
            'NATION', 'INTERVIEW',
        )

        def sort_key(x):
            # x is a (section_name, articles) pair; unknown sections go last.
            try:
                return preferred_order.index(x[0])
            except ValueError:
                return 99999999

        return sorted(sections.items(), key=sort_key)

    def preprocess_raw_html(self, raw_html, url):
        """Strip scripts and styles and resolve lazily loaded images.

        All ``<script>`` and ``<style>`` elements are removed, and every
        ``<img>`` carrying a ``data-src`` attribute gets that value copied
        into ``src``. ``url`` is accepted but not used here.

        Returns the cleaned document serialised back to a string.
        """
        from calibre.ebooks.BeautifulSoup import BeautifulSoup

        soup = BeautifulSoup(raw_html)
        for unwanted in soup.findAll('script'):
            unwanted.extract()
        for unwanted in soup.findAll('style'):
            unwanted.extract()
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return str(soup)