import json
import re
from datetime import date

from calibre.web.feeds.news import BasicNewsRecipe, classes

# Evaluated once at import time: weekday() == 5 is Saturday, which selects the
# "Mint Lounge" (lifestyle.livemint.com) configuration instead of the daily one.
is_saturday = date.today().weekday() == 5


class LiveMint(BasicNewsRecipe):
    '''
    Recipe for Live Mint.

    The class body is assembled conditionally: when the module is loaded on a
    Saturday the Mint Lounge feeds, selectors and cover are defined, otherwise
    the regular livemint.com ones are.
    '''

    title = 'Live Mint'
    description = 'Financial News from India.'
    language = 'en_IN'
    __author__ = 'Krittika Goyal, revised by unkn0wn'
    oldest_article = 1.15  # days
    max_articles_per_feed = 50
    encoding = 'utf-8'
    use_embedded_content = False
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://images.livemint.com/static/livemint-logo-v1.svg'
    remove_empty_feeds = True
    resolve_internal_links = True

    def __init__(self, *args, **kwargs):
        '''Initialise the recipe and, for Kindle output profiles, put the date in the title.'''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            self.title = 'Mint | ' + date.today().strftime('%b %d, %Y')
            if is_saturday:
                self.title = 'Mint Lounge | ' + date.today().strftime('%b %d, %Y')

    if is_saturday:

        def get_cover_url(self):
            '''Return the cover image URL found on the lifestyle front page, or None.'''
            soup = self.index_to_soup('https://lifestyle.livemint.com/')
            if citem := soup.find('div', attrs={'class': 'headLatestIss_cover'}):
                # Swap the '_tn.jpg' suffix for the '_mr.jpg' variant of the same image.
                return citem.img['src'].replace('_tn.jpg', '_mr.jpg')

        masthead_url = 'https://lifestyle.livemint.com/mintlounge/static-images/lounge-logo.svg'

        oldest_article = 6.5  # days

        extra_css = '''
            #story-summary-0 {font-style:italic; color:#202020;}
            .innerBanner, .storyImgSec {text-align:center; font-size:small;}
            .author {font-size:small;}
        '''

        keep_only_tags = [
            classes('storyPageHeading storyContent innerBanner author')
        ]

        remove_tags = [
            dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
            classes('hidden-article-url sidebarAdv similarStoriesClass moreFromSecClass linkStories publishDetail'),
            dict(attrs={'id': ['hidden-article-id-0', 'hidden-article-type-0']})
        ]

        feeds = [
            ('Lounge News', 'https://lifestyle.livemint.com/rss/news'),
            ('Food', 'https://lifestyle.livemint.com/rss/food'),
            ('Fashion', 'https://lifestyle.livemint.com/rss/fashion'),
            ('How to Lounge', 'https://lifestyle.livemint.com/rss/how-to-lounge'),
            ('Smart Living', 'https://lifestyle.livemint.com/rss/smart-living'),
            ('Health', 'https://lifestyle.livemint.com/rss/health'),
            # NOTE(review): this URL has a double slash before 'rss', unlike its
            # siblings; left untouched, confirm whether it is intentional.
            ('Relationships', 'https://lifestyle.livemint.com//rss/relationships')
        ]

        def preprocess_html(self, soup):
            '''Tidy a parsed Lounge article and return the same soup.'''
            # The first <h2> is demoted to a paragraph.
            if h2 := soup.find('h2'):
                h2.name = 'p'
            # Remaining <h2> elements whose text starts with 'Also' are dropped.
            for also in soup.findAll('h2'):
                if self.tag_to_string(also).strip().startswith('Also'):
                    also.extract()
            # Copy the URL held in data-img into src.
            for img in soup.findAll('img', attrs={'data-img': True}):
                img['src'] = img['data-img']
            return soup

    else:

        def get_cover_url(self):
            '''Return the first <meta> content ending in 'view/3.jpg' on the Magzter page, or None.'''
            soup = self.index_to_soup(
                'https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Mint-Mumbai/Newspaper/'
            )
            for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
                return citem['content']

        extra_css = '''
            img {display:block; margin:0 auto;}
            #img-cap {font-size:small; text-align:center;}
            .summary, .highlights, .synopsis {
                font-weight:normal !important; font-style:italic; color:#202020;
            }
            h2 {font-size:normal !important;}
            .author-widget {font-size:small; font-style:italic; color:#404040;}
            em, blockquote {color:#202020;}
            .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag {font-size:small;}
        '''

        keep_only_tags = [
            dict(name='article', attrs={'id': lambda x: x and x.startswith(('article_', 'box_'))}),
            classes('contentSec')
        ]

        remove_tags = [
            dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
            classes(
                'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight'
                ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo'
                ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText'
            )
        ]

        feeds = [
            ('Companies', 'https://www.livemint.com/rss/companies'),
            ('Opinion', 'https://www.livemint.com/rss/opinion'),
            ('Money', 'https://www.livemint.com/rss/money'),
            ('Economy', 'https://www.livemint.com/rss/economy'),
            ('Politics', 'https://www.livemint.com/rss/politics'),
            ('Science', 'https://www.livemint.com/rss/science'),
            ('Industry', 'https://www.livemint.com/rss/industry'),
            ('Education', 'https://www.livemint.com/rss/education'),
            ('Sports', 'https://www.livemint.com/rss/sports'),
            ('Technology', 'https://www.livemint.com/rss/technology'),
            ('News', 'https://www.livemint.com/rss/news'),
            # NOTE(review): this URL contains an unencoded space; left untouched,
            # confirm that the feed actually resolves.
            ('Mutual Funds', 'https://www.livemint.com/rss/Mutual Funds'),
            ('Markets', 'https://www.livemint.com/rss/markets'),
            ('AI', 'https://www.livemint.com/rss/AI'),
            ('Insurance', 'https://www.livemint.com/rss/insurance'),
            ('Budget', 'https://www.livemint.com/rss/budget'),
            ('Elections', 'https://www.livemint.com/rss/elections'),
        ]

        def preprocess_raw_html(self, raw, *a):
            '''
            Rebuild the article text from the page's NewsArticle JSON-LD block.

            The JSON object's 'articleBody' and 'hasPart' -> 'value' strings are
            joined and substituted into the page. If the JSON-LD block is absent
            or does not have the expected shape, ``raw`` is returned unchanged
            instead of raising.

            :param raw: page source as text
            :param a: extra positional arguments, ignored
            :return: the (possibly rewritten) page source
            '''
            # NOTE(review): the marker tested here is an empty string in the copy
            # under review, so the test is always true. It looks like markup was
            # stripped from the literal; restore the real marker.
            if '' not in raw:
                return raw

            m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
            if m is None:
                # No NewsArticle JSON-LD on this page; nothing to rebuild.
                return raw

            # The match includes the '>' that closes the opening tag, so the
            # split always yields the text that follows it.
            payload = raw[m.start():].split('>', 1)[1].strip()
            try:
                # raw_decode parses the leading JSON value and ignores what follows.
                data = json.JSONDecoder().raw_decode(payload)[0]
                value = data['hasPart']['value']
                # Insert a break wherever a sentence end is immediately followed
                # by a capital letter (optionally preceded by an opening quote).
                # NOTE(review): the separators and wrappers below are bare line
                # breaks in the copy under review; they may originally have been
                # markup; confirm.
                body = data['articleBody'] + '\n\n' + re.sub(
                    r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1\n\n\3', value
                )
            except (ValueError, KeyError, TypeError):
                # Malformed JSON or unexpected structure: keep the page as it is.
                return raw

            body = '\n\n' + body + '\n\n'
            # A function replacement inserts the text verbatim; passing it as a
            # template would make re.sub interpret backslashes in the article.
            return re.sub(r'\n([^}]*)\n', lambda _match: body, raw)

        def preprocess_html(self, soup):
            '''Tidy a parsed livemint.com article and return the same soup.'''
            # <strong> elements that contain a <p> are turned into <div>.
            for strong in soup.findAll('strong'):
                if strong.find('p'):
                    strong.name = 'div'
            # Rename <noscript> inside embed containers to <span>.
            for embed in soup.findAll('div', attrs={'class': 'embed'}):
                if nos := embed.find('noscript'):
                    nos.name = 'span'
            # Give captions the id targeted by the '#img-cap' rule in extra_css.
            for span in soup.findAll('figcaption'):
                span['id'] = 'img-cap'
            for auth in soup.findAll('span', attrs={'class': lambda x: x and 'articleInfo' in x.split()}):
                auth.name = 'div'
            for span in soup.findAll('span', attrs={'class': 'exclusive'}):
                span.extract()
            # Copy the URL held in data-src into src.
            for img in soup.findAll('img', attrs={'data-src': True}):
                img['src'] = img['data-src']
            # Drop the paragraph that contains the 'autobacklink-topic' element.
            if wa := soup.find(**classes('autobacklink-topic')):
                if p := wa.findParent('p'):
                    p.extract()
            return soup

    def populate_article_metadata(self, article, soup, first):
        '''Normalise the article title so that it carries a plain ₹ sign.'''
        # NOTE(review): the search string is empty in the copy under review,
        # most likely because markup was stripped from the literal. Calling
        # str.replace('', '₹') would insert '₹' between every character of the
        # title, so the replacement is skipped until the real string is restored.
        needle = ''
        if needle:
            article.title = article.title.replace(needle, '₹')