#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import re
from datetime import date

from calibre.web.feeds.news import BasicNewsRecipe, classes


class LiveMint(BasicNewsRecipe):
    """Recipe that builds an e-book from the Live Mint RSS feeds.

    Articles are pulled from the section feeds listed in ``feeds``; the raw
    page HTML is cleaned in ``preprocess_raw_html`` and the parsed tree is
    normalised in ``preprocess_html`` before conversion.
    """

    title = 'Live Mint'
    description = 'Financial News from India.'
    language = 'en_IN'
    __author__ = 'Krittika Goyal, revised by unkn0wn'
    oldest_article = 1.15  # days
    max_articles_per_feed = 50
    encoding = 'utf-8'
    use_embedded_content = False
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://images.livemint.com/static/livemint-logo-v1.svg'

    # User-tunable option; its default mirrors ``oldest_article`` above.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        }
    }
    remove_empty_feeds = True
    resolve_internal_links = True

    def __init__(self, *args, **kwargs):
        """Initialise the recipe and apply the user supplied ``days`` option.

        ``oldest_article`` is overridden only when the ``days`` entry of
        ``recipe_specific_options`` is a string after the base class has been
        initialised (the class-level value is a dict describing the option,
        so a string is presumably a value supplied by the user).

        Raises:
            ValueError: if the supplied string cannot be parsed as a float.
        """
        super().__init__(*args, **kwargs)
        days = self.recipe_specific_options.get('days')
        if days and isinstance(days, str):
            self.oldest_article = float(days)

    def get_cover_url(self):
        """Return the URL of today's e-paper front page, or None if not found.

        Queries the e-paper page listing for today's date (dd/mm/YYYY with the
        slashes percent-encoded) and returns the ``HighResolution`` entry of
        the first page whose title starts with "front" or "cover".
        """
        today = date.today().strftime('%d/%m/%Y').replace('/', '%2F')
        raw = self.index_to_soup(
            'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate=' + today,
            raw=True,
        )
        for page in json.loads(raw):
            if page['NewsProPageTitle'].lower().startswith(('front', 'cover')):
                return page['HighResolution']
        # No front/cover page in the listing: report "no cover" explicitly.
        return None

    extra_css = """
        img {margin:0 auto;}
        .psTopLogoItem img, .ecologoStory { width:100; }
        #img-cap {font-size:small; text-align:center;}
        .summary, .highlights, .synopsis {
            font-weight:normal !important; font-style:italic; color:#202020;
        }
        em, blockquote {color:#202020;}
        .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;}
    """

    keep_only_tags = [
        dict(
            name='article',
            attrs={'id': lambda x: x and x.startswith(('article_', 'box_'))},
        ),
        dict(attrs={'class': lambda x: x and x.startswith('storyPage_storyBox__')}),
        classes('contentSec'),
    ]

    remove_tags = [
        dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
        dict(
            attrs={
                'class': lambda x: x
                and x.startswith(
                    (
                        'storyPage_alsoRead__',
                        'storyPage_firstPublishDate__',
                        'storyPage_bcrumb__',
                    )
                )
            }
        ),
        dict(
            attrs={
                'id': [
                    'faqSection',
                    'seoText',
                    'ellipsisId',
                    # The historical entry carries a trailing space; keep it and
                    # also match the id without the stray whitespace.
                    'gift_redeemed_box ',
                    'gift_redeemed_box',
                ]
            }
        ),
        classes(
            'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec'
            ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
            ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn'
            ' double_gift_box trade'
        ),
    ]

    feeds = [
        ('Companies', 'https://www.livemint.com/rss/companies'),
        ('Opinion', 'https://www.livemint.com/rss/opinion'),
        ('Money', 'https://www.livemint.com/rss/money'),
        ('Economy', 'https://www.livemint.com/rss/economy'),
        ('Politics', 'https://www.livemint.com/rss/politics'),
        ('Science', 'https://www.livemint.com/rss/science'),
        ('Industry', 'https://www.livemint.com/rss/industry'),
        ('Education', 'https://www.livemint.com/rss/education'),
        ('Sports', 'https://www.livemint.com/rss/sports'),
        ('Technology', 'https://www.livemint.com/rss/technology'),
        ('News', 'https://www.livemint.com/rss/news'),
        # The space in the path is percent-encoded so the URL is well formed.
        ('Mutual Funds', 'https://www.livemint.com/rss/Mutual%20Funds'),
        ('Markets', 'https://www.livemint.com/rss/markets'),
        ('AI', 'https://www.livemint.com/rss/AI'),
        ('Insurance', 'https://www.livemint.com/rss/insurance'),
        ('Budget', 'https://www.livemint.com/rss/budget'),
        ('Elections', 'https://www.livemint.com/rss/elections'),
    ]

    def preprocess_raw_html(self, raw, *a):
        """Clean the raw page HTML before it is parsed.

        Empty paragraphs are removed first. If the page carries the
        ``wsjFlag`` script marker, the article text is then taken from the
        embedded ``NewsArticle`` JSON-LD block and substituted for the
        ``FirstEle`` container.

        Args:
            raw: the page HTML as text.
            *a: extra positional arguments passed by the caller; unused.

        Returns:
            The cleaned (and possibly rebuilt) HTML text.

        Raises:
            ValueError, KeyError: if the JSON-LD block is found but is
                malformed or lacks the ``articleBody`` / ``hasPart`` entries.
        """
        # NOTE(review): the HTML fragments inside the literals of this method
        # were missing from the copy of the file under review (the markup had
        # been stripped out). They are restored here from the surrounding
        # logic -- confirm against the upstream recipe.

        # remove empty p tags
        raw = re.sub(
            r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)',
            '',
            raw,
        )
        # Drop a <p> opener that is directly followed by another tag.
        # NOTE(review): the bracket expression is a character class, so it
        # excludes tags by their *first letter* only (any of "(/|aibemstrong)");
        # kept as is to preserve existing behaviour.
        raw = re.sub(r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', r'\g<2>', raw)

        if '<script>var wsjFlag=true;</script>' not in raw:
            return raw

        m = re.search(
            r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw
        )
        if m is None:
            # Marker present but no NewsArticle JSON-LD: keep the page as is
            # instead of failing on ``m.start()``.
            return raw

        # Skip past the end of the <script ...> opening tag, then decode the
        # first JSON value; raw_decode ignores whatever follows it.
        payload = raw[m.start():].split('>', 1)[1].strip()
        data = json.JSONDecoder().raw_decode(payload)[0]
        value = data['hasPart']['value']

        # The extra text comes without paragraph breaks; start a new paragraph
        # wherever a sentence end is immediately followed by a capital letter.
        body = (
            data['articleBody']
            + '<p>'
            + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
        )
        body = '<div class="FirstEle"> <p>' + body + '</p> </div>'

        # Use a callable so backslashes in the article text are inserted
        # literally rather than being parsed as a replacement template.
        return re.sub(
            r'<div class="FirstEle">([^}]*)</div>', lambda _match: body, raw
        )

    def preprocess_html(self, soup):
        """Normalise the parsed article tree and return it.

        Renames site-specific classes to the ones styled in ``extra_css``,
        fixes invalid nesting, resolves lazily loaded images and removes
        promotional back-link paragraphs.
        """
        author = soup.find(
            attrs={
                'class': lambda x: x
                and x.startswith(('storyPage_authorInfo__', 'storyPage_authorSocial__'))
            }
        )
        if author:
            author['class'] = 'auth'

        summary = soup.find(
            attrs={'class': lambda x: x and x.startswith('storyPage_summary__')}
        )
        if summary:
            summary['class'] = 'summary'

        # A <strong> wrapping paragraphs is turned into a block element.
        for strong in soup.findAll('strong'):
            if strong.find('p'):
                strong.name = 'div'

        # Rename <noscript> inside embeds so its content is kept as a span.
        for embed in soup.findAll('div', attrs={'class': 'embed'}):
            nos = embed.find('noscript')
            if nos:
                nos.name = 'span'

        for caption in soup.findAll('figcaption'):
            caption['id'] = 'img-cap'

        for info in soup.findAll(
            'span', attrs={'class': lambda x: x and 'articleInfo' in x.split()}
        ):
            info.name = 'div'

        # Lazily loaded images keep their real address in data-src.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']

        for badge in soup.findAll('span', attrs={'class': 'exclusive'}):
            badge.extract()

        # Remove the whole paragraph/heading that hosts a manual back-link.
        for link in soup.findAll('a', attrs={'class': 'manualbacklink'}):
            host = link.findParent(['p', 'h2', 'h3', 'h4'])
            if host:
                host.extract()

        # Only the first automatic topic back-link paragraph is removed.
        topic = soup.find(**classes('autobacklink-topic'))
        if topic:
            para = topic.findParent('p')
            if para:
                para.extract()

        return soup

    def populate_article_metadata(self, article, soup, first):
        """Replace the rupee markup in the article title with a plain '₹'.

        NOTE(review): the search literal was empty in the copy of the file
        under review (markup stripped), which would insert '₹' between every
        character of the title. The ``webrupee`` span is restored from memory
        and matched with or without inner text -- confirm against upstream.
        """
        article.title = re.sub(
            r'<span class="webrupee">[^<]*</span>', '₹', article.title
        )