From 276f96e2e23de5f1aaaeb705198ff742eabf02b0 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 3 Nov 2024 11:00:55 +0530 Subject: [PATCH] Update livemint.recipe saturday edition no longer works. --- recipes/livemint.recipe | 277 +++++++++++++++++++--------------------- 1 file changed, 130 insertions(+), 147 deletions(-) diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe index 10e285c302..cc8511fba2 100644 --- a/recipes/livemint.recipe +++ b/recipes/livemint.recipe @@ -6,7 +6,6 @@ from datetime import date from calibre.web.feeds.news import BasicNewsRecipe, classes -is_saturday = date.today().weekday() == 5 class LiveMint(BasicNewsRecipe): title = 'Live Mint' @@ -25,13 +24,12 @@ class LiveMint(BasicNewsRecipe): 'days': { 'short': 'Oldest article to download from this news source. In days ', 'long': 'For example, 0.5, gives you articles from the past 12 hours', - 'default': str(oldest_article) + 'default': str(oldest_article), } } - remove_empty_feeds = True + remove_empty_feeds = True resolve_internal_links = True - def __init__(self, *args, **kwargs): BasicNewsRecipe.__init__(self, *args, **kwargs) d = self.recipe_specific_options.get('days') @@ -42,157 +40,142 @@ class LiveMint(BasicNewsRecipe): today = date.today().strftime('%d/%m/%Y') today = today.replace('/', '%2F') raw = self.index_to_soup( - 'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate=' + today, raw=True + 'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate=' + today, + raw=True ) for cov in json.loads(raw): if cov['NewsProPageTitle'].lower().startswith(('front', 'cover')): return cov['HighResolution'] - if is_saturday: - title = 'Mint Lounge' - masthead_url = 'https://lifestyle.livemint.com/mintlounge/static-images/lounge-logo.svg' + extra_css = """ + img {margin:0 auto;} + .psTopLogoItem img, .ecologoStory { width:100; } + #img-cap {font-size:small; text-align:center;} + .summary, .highlights, .synopsis { + font-weight:normal !important; font-style:italic; color:#202020; + } + em, blockquote {color:#202020;} + .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;} + """ - oldest_article = 6.5 # days + keep_only_tags = [ + dict( + name='article', + attrs={'id': lambda x: x and x.startswith(('article_', 'box_'))}, + ), + dict(attrs={'class': lambda x: x and x.startswith('storyPage_storyBox__')}), + classes('contentSec'), + ] - extra_css = ''' - #story-summary-0 {font-style:italic; color:#202020;} - .innerBanner, .storyImgSec {text-align:center; font-size:small;} - .author {font-size:small;} - ''' - - keep_only_tags = [ - classes('storyPageHeading storyContent innerBanner author') - ] - remove_tags = [ - dict(name=['meta', 'link', 'svg', 'button', 'iframe']), - classes('hidden-article-url sidebarAdv similarStoriesClass moreFromSecClass linkStories publishDetail'), - dict(attrs={'id':['hidden-article-id-0', 'hidden-article-type-0']}) - ] - - feeds = [ - ('Lounge News', 'https://lifestyle.livemint.com/rss/news'), - ('Food', 'https://lifestyle.livemint.com/rss/food'), - ('Fashion', 'https://lifestyle.livemint.com/rss/fashion'), - ('How to Lounge', 'https://lifestyle.livemint.com/rss/how-to-lounge'), - ('Smart Living', 'https://lifestyle.livemint.com/rss/smart-living'), - ('Health', 'https://lifestyle.livemint.com/rss/health'), - ('Relationships', 'https://lifestyle.livemint.com//rss/relationships') - ] - - def preprocess_html(self, soup): - if h2 := soup.find('h2'): - h2.name = 'p' - for also in soup.findAll('h2'): - if self.tag_to_string(also).strip().startswith('Also'): - also.extract() - for img in soup.findAll('img', attrs={'data-img': True}): - img['src'] = img['data-img'] - return soup - else: - - extra_css = ''' - img {margin:0 auto;} - .psTopLogoItem img, .ecologoStory { width:100; } - #img-cap {font-size:small; text-align:center;} - .summary, .highlights, .synopsis { - font-weight:normal !important; font-style:italic; color:#202020; - } - em, blockquote {color:#202020;} - .moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;} - ''' - - keep_only_tags = [ - dict(name='article', attrs={'id':lambda x: x and x.startswith(('article_', 'box_'))}), - dict(attrs={'class':lambda x: x and x.startswith('storyPage_storyBox__')}), - classes('contentSec') - ] - remove_tags = [ - dict(name=['meta', 'link', 'svg', 'button', 'iframe']), - dict(attrs={'class':lambda x: x and x.startswith( - ('storyPage_alsoRead__', 'storyPage_firstPublishDate__', 'storyPage_bcrumb__') - )}), - dict(attrs={'id':['faqSection', 'seoText', 'ellipsisId']}), - classes( - 'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec' - ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget' - ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn trade' - ) - ] - - feeds = [ - ('Companies', 'https://www.livemint.com/rss/companies'), - ('Opinion', 'https://www.livemint.com/rss/opinion'), - ('Money', 'https://www.livemint.com/rss/money'), - ('Economy', 'https://www.livemint.com/rss/economy'), - ('Politics', 'https://www.livemint.com/rss/politics'), - ('Science', 'https://www.livemint.com/rss/science'), - ('Industry', 'https://www.livemint.com/rss/industry'), - ('Education', 'https://www.livemint.com/rss/education'), - ('Sports', 'https://www.livemint.com/rss/sports'), - ('Technology', 'https://www.livemint.com/rss/technology'), - ('News', 'https://www.livemint.com/rss/news'), - ('Mutual Funds', 'https://www.livemint.com/rss/Mutual Funds'), - ('Markets', 'https://www.livemint.com/rss/markets'), - ('AI', 'https://www.livemint.com/rss/AI'), - ('Insurance', 'https://www.livemint.com/rss/insurance'), - ('Budget', 'https://www.livemint.com/rss/budget'), - ('Elections', 'https://www.livemint.com/rss/elections'), - ] - - def preprocess_raw_html(self, raw, *a): - # remove empty p tags - raw = re.sub( - r'(
\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub( - r'(
\s* \s*<\/p>)|(
\s*<\/p>)|(
\s*<\/p>)', '', raw + remove_tags = [ + dict(name=['meta', 'link', 'svg', 'button', 'iframe']), + dict( + attrs={ + 'class': lambda x: x + and x.startswith( + ( + 'storyPage_alsoRead__', + 'storyPage_firstPublishDate__', + 'storyPage_bcrumb__', + ) ) + } + ), + dict(attrs={'id': ['faqSection', 'seoText', 'ellipsisId', 'gift_redeemed_box ']}), + classes( + 'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec' + ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget' + ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn' + ' double_gift_box trade' + ), + ] + + feeds = [ + ('Companies', 'https://www.livemint.com/rss/companies'), + ('Opinion', 'https://www.livemint.com/rss/opinion'), + ('Money', 'https://www.livemint.com/rss/money'), + ('Economy', 'https://www.livemint.com/rss/economy'), + ('Politics', 'https://www.livemint.com/rss/politics'), + ('Science', 'https://www.livemint.com/rss/science'), + ('Industry', 'https://www.livemint.com/rss/industry'), + ('Education', 'https://www.livemint.com/rss/education'), + ('Sports', 'https://www.livemint.com/rss/sports'), + ('Technology', 'https://www.livemint.com/rss/technology'), + ('News', 'https://www.livemint.com/rss/news'), + ('Mutual Funds', 'https://www.livemint.com/rss/Mutual Funds'), + ('Markets', 'https://www.livemint.com/rss/markets'), + ('AI', 'https://www.livemint.com/rss/AI'), + ('Insurance', 'https://www.livemint.com/rss/insurance'), + ('Budget', 'https://www.livemint.com/rss/budget'), + ('Elections', 'https://www.livemint.com/rss/elections'), + ] + + def preprocess_raw_html(self, raw, *a): + # remove empty p tags + raw = re.sub( + r'(
\s*)(<[^(\/|a|i|b|em|strong)])', r'\g<2>', re.sub( + r'(
\s* \s*<\/p>)|(
\s*<\/p>)|(
\s*<\/p>)', '', raw + ), + ) + if '' in raw: + m = re.search( + r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw ) - if '' in raw: - m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw) - raw1 = raw[m.start():] - raw1 = raw1.split('>', 1)[1].strip() - data = json.JSONDecoder().raw_decode(raw1)[0] - value = data['hasPart']['value'] - body = data['articleBody'] + '
'\ - + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1
\3', value) - body = '
' + body + '
' + + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1
\3', value) + ) + body = '
' + body + '