from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url):
    """Return an absolute deccanherald.com URL for *url*.

    Site-relative paths (those starting with '/') get the site origin
    prepended. Anything else is returned unchanged, so the function never
    yields None for a URL that is already absolute.
    """
    if url.startswith('/'):
        return 'https://www.deccanherald.com' + url
    return url


class herald(BasicNewsRecipe):
    """Recipe that builds a Deccan Herald issue from its section pages."""

    title = 'Deccan Herald'
    __author__ = 'unkn0wn'
    description = 'Deccan Herald is an Indian English language daily newspaper published from the Indian state of Karnataka.'
    language = 'en_IN'
    no_stylesheets = True
    remove_attributes = ['height', 'width', 'style']
    ignore_duplicate_articles = {'url', 'title'}
    encoding = 'utf-8'

    # Only the headline, the author name and the main content container are
    # kept from each article page.
    keep_only_tags = [
        classes('article-title article-author__name'),
        dict(name='div', attrs={'id':'main-content'})
    ]

    # Elements carrying any of these CSS classes are stripped from the
    # retained content.
    remove_tags = [
        classes(
            'storyShare social-media-icons in_article_video static_text'
            ' nl-optin-mobile dk_only article-banner-adver-wrapper wb_holder'
            ' field-name-field-tags section-full strip--business'
        )
    ]

    def parse_index(self):
        """Collect article links from each section landing page.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
        Sections for which no article link is found are left out.
        """
        index = 'https://www.deccanherald.com/'
        sections = [
            'india', 'world', 'elections', 'opinion', 'specials', 'business', 'sports'
        ]
        feeds = []
        for sec in sections:
            soup = self.index_to_soup(index + sec)
            section = sec.capitalize()
            self.log(section)
            articles = []
            # Only anchors whose href lives under this section's path are
            # considered; the lambda is consumed immediately by findAll, so
            # capturing the loop variable is safe here.
            for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith('/' + sec + '/')}):
                # Drop any query string so the same story is not listed
                # under several tracking variants of its URL.
                url = absurl(a['href'].split('?')[0])
                # Skip links that point back at the section page itself.
                if url in {index + sec + '/', index + sec}:
                    continue
                title = self.tag_to_string(a)
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds