from collections import defaultdict

from calibre.web.feeds.news import BasicNewsRecipe, classes


class Frontline(BasicNewsRecipe):
    """Calibre recipe that downloads the current issue of Frontline magazine.

    The issue index is read from the "current-issue" page and articles are
    grouped into feeds by the label shown on each article teaser.
    """

    title = u'Frontline'
    __author__ = 'unkn0wn'
    description = (
        'Frontline, the fortnightly English magazine from the stable of The Hindu, '
        'has been a distinguished presence in the media world since 1984.'
    )
    language = 'en_IN'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://frontline.thehindu.com/theme/images/fl-online/frontline-logo.png'
    remove_attributes = ['height', 'width']
    resolve_internal_links = True

    # Kept byte-for-byte identical to the original stylesheet string; it is
    # only split across literals for readability.
    extra_css = (
        ' .environment, .publish-time, .author { font-size:small; color:#404040; }'
        ' .caption { font-size:small; text-align:center; }'
        ' img { display:block; margin:0 auto; }'
        ' .question {font-weight:bold;} '
    )

    keep_only_tags = [
        dict(name='div', attrs={'class': 'container article-section'}),
    ]

    remove_tags = [
        classes(
            'breadcrumb comments-shares share-page article-video '
            'referpara slide-mobile title-patch hide-mobile related-stories'
        ),
    ]

    def preprocess_html(self, soup):
        """Give lazily-loaded images a real ``src`` and mark up captions.

        For every ``<img>`` carrying a ``data-original`` attribute:

        * if that attribute points at the ``1x1_spacer.png`` placeholder, the
          ``<img>`` is removed and the nearest preceding ``<source srcset>``
          (if any) is turned into an ``<img>`` instead;
        * otherwise ``data-original`` is copied into ``src``.

        Elements with the ``caption`` class are renamed to ``<figcaption>``.

        Returns the modified soup.
        """
        for img in soup.findAll('img', attrs={'data-original': True}):
            if img['data-original'].endswith('1x1_spacer.png'):
                # Look the <source> up before detaching the placeholder so the
                # backwards search still starts from inside the document.
                source = img.findPrevious('source', srcset=True)
                img.extract()
                if source:
                    # NOTE(review): '_320' -> '_1200' presumably selects a
                    # wider rendition of the same picture - confirm on site.
                    source['src'] = source['srcset'].replace('_320', '_1200')
                    source.name = 'img'
            else:
                img['src'] = img['data-original']

        for cap in soup.findAll(**classes('caption')):
            cap.name = 'figcaption'
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Remove every remaining ``<source>`` element and return the soup."""
        for src in soup.findAll('source'):
            src.extract()
        return soup

    def parse_index(self):
        """Build the list of feeds for the current issue.

        Side effects: sets ``self.cover_url`` and ``self.description`` when
        the corresponding elements are present on the index page.

        Returns a list of ``(section, articles)`` tuples, where each article
        is a dict with ``title``, ``url`` and ``description`` keys. Teasers
        lacking a link or a title are skipped; an empty list is returned if
        the magazine section cannot be located.
        """
        soup = self.index_to_soup('https://frontline.thehindu.com/current-issue/')

        if cover := soup.find('div', attrs={'class': 'magazine'}):
            # The cover is optional: a missing piece must not abort the
            # whole download, so each step is checked before use.
            cover_box = cover.find(**classes('sptar-image'))
            cover_img = cover_box.find('img') if cover_box is not None else None
            cover_src = cover_img.get('data-original') if cover_img is not None else None
            if cover_src:
                self.cover_url = cover_src.replace('_320', '_1200')
                self.log('Cover ', self.cover_url)
            if issue_desc := cover.find(**classes('sub-text')):
                self.description = self.tag_to_string(issue_desc)

        feeds_dict = defaultdict(list)

        mag = soup.find(**classes('section-magazine'))
        if mag is None:
            self.log('Could not find the magazine section on the index page')
            return []

        for div in mag.findAll('div', attrs={'class': 'content'}):
            title_tag = div.find(**classes('title'))
            a = title_tag.find('a') if title_tag is not None else None
            if a is None:
                continue
            # .get() so that an anchor without href is skipped, not fatal.
            url = a.get('href')
            title = self.tag_to_string(a)
            if not url or not title:
                continue

            section = 'Articles'
            if cat := div.find(**classes('label')):
                section = self.tag_to_string(cat)

            desc = ''
            if art := div.find(**classes('sub-text')):
                desc = self.tag_to_string(art)
            if auth := div.find(**classes('author')):
                desc = self.tag_to_string(auth) + ' | ' + desc

            self.log(section, '\n\t', title, '\n\t', desc, '\n\t\t', url)
            feeds_dict[section].append({"title": title, "url": url, "description": desc})

        return list(feeds_dict.items())