mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	Update livemint.recipe
saturday edition no longer works.
This commit is contained in:
		
							parent
							
								
									d453685418
								
							
						
					
					
						commit
						276f96e2e2
					
				@ -6,7 +6,6 @@ from datetime import date
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
 | 
					from calibre.web.feeds.news import BasicNewsRecipe, classes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
is_saturday = date.today().weekday() == 5
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
class LiveMint(BasicNewsRecipe):
 | 
					class LiveMint(BasicNewsRecipe):
 | 
				
			||||||
    title = 'Live Mint'
 | 
					    title = 'Live Mint'
 | 
				
			||||||
@ -25,13 +24,12 @@ class LiveMint(BasicNewsRecipe):
 | 
				
			|||||||
        'days': {
 | 
					        'days': {
 | 
				
			||||||
            'short': 'Oldest article to download from this news source. In days ',
 | 
					            'short': 'Oldest article to download from this news source. In days ',
 | 
				
			||||||
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
 | 
					            'long': 'For example, 0.5, gives you articles from the past 12 hours',
 | 
				
			||||||
            'default': str(oldest_article)
 | 
					            'default': str(oldest_article),
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    remove_empty_feeds =  True
 | 
					    remove_empty_feeds = True
 | 
				
			||||||
    resolve_internal_links = True
 | 
					    resolve_internal_links = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					    def __init__(self, *args, **kwargs):
 | 
				
			||||||
        BasicNewsRecipe.__init__(self, *args, **kwargs)
 | 
					        BasicNewsRecipe.__init__(self, *args, **kwargs)
 | 
				
			||||||
        d = self.recipe_specific_options.get('days')
 | 
					        d = self.recipe_specific_options.get('days')
 | 
				
			||||||
@ -42,157 +40,142 @@ class LiveMint(BasicNewsRecipe):
 | 
				
			|||||||
        today = date.today().strftime('%d/%m/%Y')
 | 
					        today = date.today().strftime('%d/%m/%Y')
 | 
				
			||||||
        today = today.replace('/', '%2F')
 | 
					        today = today.replace('/', '%2F')
 | 
				
			||||||
        raw = self.index_to_soup(
 | 
					        raw = self.index_to_soup(
 | 
				
			||||||
            'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate=' + today, raw=True
 | 
					            'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate=' + today,
 | 
				
			||||||
 | 
					            raw=True
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        for cov in json.loads(raw):
 | 
					        for cov in json.loads(raw):
 | 
				
			||||||
            if cov['NewsProPageTitle'].lower().startswith(('front', 'cover')):
 | 
					            if cov['NewsProPageTitle'].lower().startswith(('front', 'cover')):
 | 
				
			||||||
                return cov['HighResolution']
 | 
					                return cov['HighResolution']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if is_saturday:
 | 
					    extra_css = """
 | 
				
			||||||
        title = 'Mint Lounge'
 | 
					        img {margin:0 auto;}
 | 
				
			||||||
        masthead_url = 'https://lifestyle.livemint.com/mintlounge/static-images/lounge-logo.svg'
 | 
					        .psTopLogoItem img, .ecologoStory { width:100; }
 | 
				
			||||||
 | 
					        #img-cap {font-size:small; text-align:center;}
 | 
				
			||||||
 | 
					        .summary, .highlights, .synopsis {
 | 
				
			||||||
 | 
					            font-weight:normal !important; font-style:italic; color:#202020;
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        em, blockquote {color:#202020;}
 | 
				
			||||||
 | 
					        .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;}
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        oldest_article = 6.5 # days
 | 
					    keep_only_tags = [
 | 
				
			||||||
 | 
					        dict(
 | 
				
			||||||
 | 
					            name='article',
 | 
				
			||||||
 | 
					            attrs={'id': lambda x: x and x.startswith(('article_', 'box_'))},
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        dict(attrs={'class': lambda x: x and x.startswith('storyPage_storyBox__')}),
 | 
				
			||||||
 | 
					        classes('contentSec'),
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        extra_css = '''
 | 
					    remove_tags = [
 | 
				
			||||||
            #story-summary-0 {font-style:italic; color:#202020;}
 | 
					        dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
 | 
				
			||||||
            .innerBanner, .storyImgSec {text-align:center; font-size:small;}
 | 
					        dict(
 | 
				
			||||||
            .author {font-size:small;}
 | 
					            attrs={
 | 
				
			||||||
        '''
 | 
					                'class': lambda x: x
 | 
				
			||||||
 | 
					                and x.startswith(
 | 
				
			||||||
        keep_only_tags = [
 | 
					                    (
 | 
				
			||||||
            classes('storyPageHeading storyContent innerBanner author')
 | 
					                        'storyPage_alsoRead__',
 | 
				
			||||||
        ]
 | 
					                        'storyPage_firstPublishDate__',
 | 
				
			||||||
        remove_tags = [
 | 
					                        'storyPage_bcrumb__',
 | 
				
			||||||
            dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
 | 
					                    )
 | 
				
			||||||
            classes('hidden-article-url sidebarAdv similarStoriesClass moreFromSecClass linkStories publishDetail'),
 | 
					 | 
				
			||||||
            dict(attrs={'id':['hidden-article-id-0', 'hidden-article-type-0']})
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        feeds = [
 | 
					 | 
				
			||||||
            ('Lounge News', 'https://lifestyle.livemint.com/rss/news'),
 | 
					 | 
				
			||||||
            ('Food', 'https://lifestyle.livemint.com/rss/food'),
 | 
					 | 
				
			||||||
            ('Fashion', 'https://lifestyle.livemint.com/rss/fashion'),
 | 
					 | 
				
			||||||
            ('How to Lounge', 'https://lifestyle.livemint.com/rss/how-to-lounge'),
 | 
					 | 
				
			||||||
            ('Smart Living', 'https://lifestyle.livemint.com/rss/smart-living'),
 | 
					 | 
				
			||||||
            ('Health', 'https://lifestyle.livemint.com/rss/health'),
 | 
					 | 
				
			||||||
            ('Relationships', 'https://lifestyle.livemint.com//rss/relationships')
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        def preprocess_html(self, soup):
 | 
					 | 
				
			||||||
            if h2 := soup.find('h2'):
 | 
					 | 
				
			||||||
                h2.name = 'p'
 | 
					 | 
				
			||||||
            for also in soup.findAll('h2'):
 | 
					 | 
				
			||||||
                if self.tag_to_string(also).strip().startswith('Also'):
 | 
					 | 
				
			||||||
                    also.extract()
 | 
					 | 
				
			||||||
            for img in soup.findAll('img', attrs={'data-img': True}):
 | 
					 | 
				
			||||||
                img['src'] = img['data-img']
 | 
					 | 
				
			||||||
            return soup
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        extra_css = '''
 | 
					 | 
				
			||||||
            img {margin:0 auto;}
 | 
					 | 
				
			||||||
            .psTopLogoItem img, .ecologoStory { width:100; }
 | 
					 | 
				
			||||||
            #img-cap {font-size:small; text-align:center;}
 | 
					 | 
				
			||||||
            .summary, .highlights, .synopsis {
 | 
					 | 
				
			||||||
                font-weight:normal !important; font-style:italic; color:#202020;
 | 
					 | 
				
			||||||
            }
 | 
					 | 
				
			||||||
            em, blockquote {color:#202020;}
 | 
					 | 
				
			||||||
            .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;}
 | 
					 | 
				
			||||||
        '''
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        keep_only_tags = [
 | 
					 | 
				
			||||||
            dict(name='article', attrs={'id':lambda x: x and x.startswith(('article_', 'box_'))}),
 | 
					 | 
				
			||||||
            dict(attrs={'class':lambda x: x and x.startswith('storyPage_storyBox__')}),
 | 
					 | 
				
			||||||
            classes('contentSec')
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
        remove_tags = [
 | 
					 | 
				
			||||||
            dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
 | 
					 | 
				
			||||||
            dict(attrs={'class':lambda x: x and x.startswith(
 | 
					 | 
				
			||||||
                ('storyPage_alsoRead__', 'storyPage_firstPublishDate__', 'storyPage_bcrumb__')
 | 
					 | 
				
			||||||
            )}),
 | 
					 | 
				
			||||||
            dict(attrs={'id':['faqSection', 'seoText', 'ellipsisId']}),
 | 
					 | 
				
			||||||
            classes(
 | 
					 | 
				
			||||||
                'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec'
 | 
					 | 
				
			||||||
                ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
 | 
					 | 
				
			||||||
                ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn trade'
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        feeds = [
 | 
					 | 
				
			||||||
            ('Companies', 'https://www.livemint.com/rss/companies'),
 | 
					 | 
				
			||||||
            ('Opinion', 'https://www.livemint.com/rss/opinion'),
 | 
					 | 
				
			||||||
            ('Money', 'https://www.livemint.com/rss/money'),
 | 
					 | 
				
			||||||
            ('Economy', 'https://www.livemint.com/rss/economy'),
 | 
					 | 
				
			||||||
            ('Politics', 'https://www.livemint.com/rss/politics'),
 | 
					 | 
				
			||||||
            ('Science', 'https://www.livemint.com/rss/science'),
 | 
					 | 
				
			||||||
            ('Industry', 'https://www.livemint.com/rss/industry'),
 | 
					 | 
				
			||||||
            ('Education', 'https://www.livemint.com/rss/education'),
 | 
					 | 
				
			||||||
            ('Sports', 'https://www.livemint.com/rss/sports'),
 | 
					 | 
				
			||||||
            ('Technology', 'https://www.livemint.com/rss/technology'),
 | 
					 | 
				
			||||||
            ('News', 'https://www.livemint.com/rss/news'),
 | 
					 | 
				
			||||||
            ('Mutual Funds', 'https://www.livemint.com/rss/Mutual Funds'),
 | 
					 | 
				
			||||||
            ('Markets', 'https://www.livemint.com/rss/markets'),
 | 
					 | 
				
			||||||
            ('AI', 'https://www.livemint.com/rss/AI'),
 | 
					 | 
				
			||||||
            ('Insurance', 'https://www.livemint.com/rss/insurance'),
 | 
					 | 
				
			||||||
            ('Budget', 'https://www.livemint.com/rss/budget'),
 | 
					 | 
				
			||||||
            ('Elections', 'https://www.livemint.com/rss/elections'),
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        def preprocess_raw_html(self, raw, *a):
 | 
					 | 
				
			||||||
            # remove empty p tags
 | 
					 | 
				
			||||||
            raw = re.sub(
 | 
					 | 
				
			||||||
                r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
 | 
					 | 
				
			||||||
                    r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', raw
 | 
					 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        dict(attrs={'id': ['faqSection', 'seoText', 'ellipsisId', 'gift_redeemed_box ']}),
 | 
				
			||||||
 | 
					        classes(
 | 
				
			||||||
 | 
					            'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec'
 | 
				
			||||||
 | 
					            ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
 | 
				
			||||||
 | 
					            ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn'
 | 
				
			||||||
 | 
					            ' double_gift_box trade'
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    feeds = [
 | 
				
			||||||
 | 
					        ('Companies', 'https://www.livemint.com/rss/companies'),
 | 
				
			||||||
 | 
					        ('Opinion', 'https://www.livemint.com/rss/opinion'),
 | 
				
			||||||
 | 
					        ('Money', 'https://www.livemint.com/rss/money'),
 | 
				
			||||||
 | 
					        ('Economy', 'https://www.livemint.com/rss/economy'),
 | 
				
			||||||
 | 
					        ('Politics', 'https://www.livemint.com/rss/politics'),
 | 
				
			||||||
 | 
					        ('Science', 'https://www.livemint.com/rss/science'),
 | 
				
			||||||
 | 
					        ('Industry', 'https://www.livemint.com/rss/industry'),
 | 
				
			||||||
 | 
					        ('Education', 'https://www.livemint.com/rss/education'),
 | 
				
			||||||
 | 
					        ('Sports', 'https://www.livemint.com/rss/sports'),
 | 
				
			||||||
 | 
					        ('Technology', 'https://www.livemint.com/rss/technology'),
 | 
				
			||||||
 | 
					        ('News', 'https://www.livemint.com/rss/news'),
 | 
				
			||||||
 | 
					        ('Mutual Funds', 'https://www.livemint.com/rss/Mutual Funds'),
 | 
				
			||||||
 | 
					        ('Markets', 'https://www.livemint.com/rss/markets'),
 | 
				
			||||||
 | 
					        ('AI', 'https://www.livemint.com/rss/AI'),
 | 
				
			||||||
 | 
					        ('Insurance', 'https://www.livemint.com/rss/insurance'),
 | 
				
			||||||
 | 
					        ('Budget', 'https://www.livemint.com/rss/budget'),
 | 
				
			||||||
 | 
					        ('Elections', 'https://www.livemint.com/rss/elections'),
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def preprocess_raw_html(self, raw, *a):
 | 
				
			||||||
 | 
					        # remove empty p tags
 | 
				
			||||||
 | 
					        raw = re.sub(
 | 
				
			||||||
 | 
					            r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', r'\g<2>', re.sub(
 | 
				
			||||||
 | 
					                r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', raw
 | 
				
			||||||
 | 
					            ),
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        if '<script>var wsjFlag=true;</script>' in raw:
 | 
				
			||||||
 | 
					            m = re.search(
 | 
				
			||||||
 | 
					                r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            if '<script>var wsjFlag=true;</script>' in raw:
 | 
					            raw1 = raw[m.start() :]
 | 
				
			||||||
                m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
 | 
					            raw1 = raw1.split('>', 1)[1].strip()
 | 
				
			||||||
                raw1 = raw[m.start():]
 | 
					            data = json.JSONDecoder().raw_decode(raw1)[0]
 | 
				
			||||||
                raw1 = raw1.split('>', 1)[1].strip()
 | 
					            value = data['hasPart']['value']
 | 
				
			||||||
                data = json.JSONDecoder().raw_decode(raw1)[0]
 | 
					            body = (
 | 
				
			||||||
                value = data['hasPart']['value']
 | 
					                data['articleBody']
 | 
				
			||||||
                body = data['articleBody'] + '</p> <p>'\
 | 
					                + '</p> <p>'
 | 
				
			||||||
                        + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
 | 
					                + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
 | 
				
			||||||
                body = '<div class="FirstEle"> <p>' +  body  + '</p> </div>'
 | 
					            )
 | 
				
			||||||
                raw2 = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
 | 
					            body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
 | 
				
			||||||
                return raw2
 | 
					            raw2 = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
 | 
				
			||||||
            return raw
 | 
					            return raw2
 | 
				
			||||||
 | 
					        return raw
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def preprocess_html(self, soup):
 | 
					    def preprocess_html(self, soup):
 | 
				
			||||||
            for h2 in soup.findAll('h2'):
 | 
					        auth = soup.find(
 | 
				
			||||||
                h2.name = 'h4'
 | 
					            attrs={
 | 
				
			||||||
            auth = soup.find(attrs={'class':lambda x: x and x.startswith(('storyPage_authorInfo__', 'storyPage_authorSocial__'))})
 | 
					                'class': lambda x: x
 | 
				
			||||||
            if auth:
 | 
					                and x.startswith(('storyPage_authorInfo__', 'storyPage_authorSocial__'))
 | 
				
			||||||
                auth['class'] = 'auth'
 | 
					            }
 | 
				
			||||||
            summ = soup.find(attrs={'class':lambda x: x and x.startswith('storyPage_summary__')})
 | 
					        )
 | 
				
			||||||
            if summ:
 | 
					        if auth:
 | 
				
			||||||
                summ['class'] = 'summary'
 | 
					            auth['class'] = 'auth'
 | 
				
			||||||
            for strong in soup.findAll('strong'):
 | 
					        summ = soup.find(
 | 
				
			||||||
                if strong.find('p'):
 | 
					            attrs={'class': lambda x: x and x.startswith('storyPage_summary__')}
 | 
				
			||||||
                    strong.name = 'div'
 | 
					        )
 | 
				
			||||||
            for embed in soup.findAll('div', attrs={'class':'embed'}):
 | 
					        if summ:
 | 
				
			||||||
                nos = embed.find('noscript')
 | 
					            summ['class'] = 'summary'
 | 
				
			||||||
                if nos:
 | 
					        for strong in soup.findAll('strong'):
 | 
				
			||||||
                    nos.name = 'span'
 | 
					            if strong.find('p'):
 | 
				
			||||||
            for span in soup.findAll('figcaption'):
 | 
					                strong.name = 'div'
 | 
				
			||||||
                span['id'] = 'img-cap'
 | 
					        for embed in soup.findAll('div', attrs={'class': 'embed'}):
 | 
				
			||||||
            for auth in soup.findAll('span', attrs={'class':lambda x: x and 'articleInfo' in x.split()}):
 | 
					            nos = embed.find('noscript')
 | 
				
			||||||
                auth.name = 'div'
 | 
					            if nos:
 | 
				
			||||||
            for img in soup.findAll('img', attrs={'data-src': True}):
 | 
					                nos.name = 'span'
 | 
				
			||||||
                img['src'] = img['data-src']
 | 
					        for span in soup.findAll('figcaption'):
 | 
				
			||||||
            for span in soup.findAll('span', attrs={'class':'exclusive'}):
 | 
					            span['id'] = 'img-cap'
 | 
				
			||||||
                span.extract()
 | 
					        for auth in soup.findAll(
 | 
				
			||||||
            for al in soup.findAll('a', attrs={'class':'manualbacklink'}):
 | 
					            'span', attrs={'class': lambda x: x and 'articleInfo' in x.split()}
 | 
				
			||||||
                pa = al.findParent(['p', 'h2', 'h3', 'h4'])
 | 
					        ):
 | 
				
			||||||
                if pa:
 | 
					            auth.name = 'div'
 | 
				
			||||||
                    pa.extract()
 | 
					        for img in soup.findAll('img', attrs={'data-src': True}):
 | 
				
			||||||
            wa = soup.find(**classes('autobacklink-topic'))
 | 
					            img['src'] = img['data-src']
 | 
				
			||||||
            if wa:
 | 
					        for span in soup.findAll('span', attrs={'class': 'exclusive'}):
 | 
				
			||||||
                p = wa.findParent('p')
 | 
					            span.extract()
 | 
				
			||||||
                if p:
 | 
					        for al in soup.findAll('a', attrs={'class': 'manualbacklink'}):
 | 
				
			||||||
                    p.extract()
 | 
					            pa = al.findParent(['p', 'h2', 'h3', 'h4'])
 | 
				
			||||||
            return soup
 | 
					            if pa:
 | 
				
			||||||
 | 
					                pa.extract()
 | 
				
			||||||
 | 
					        wa = soup.find(**classes('autobacklink-topic'))
 | 
				
			||||||
 | 
					        if wa:
 | 
				
			||||||
 | 
					            p = wa.findParent('p')
 | 
				
			||||||
 | 
					            if p:
 | 
				
			||||||
 | 
					                p.extract()
 | 
				
			||||||
 | 
					        return soup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def populate_article_metadata(self, article, soup, first):
 | 
					    def populate_article_metadata(self, article, soup, first):
 | 
				
			||||||
            article.title = article.title.replace('<span class="webrupee">₹</span>','₹')
 | 
					        article.title = article.title.replace('<span class="webrupee">₹</span>', '₹')
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user