From 3f013c3856b10bd6fbffcd0f0084c7d59773d47e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 31 Aug 2022 20:43:19 +0530
Subject: [PATCH] update Live Law and Live Mint

---
 recipes/live_law.recipe |  3 ++-
 recipes/livemint.recipe | 59 ++++++++++++++++++++++-------------------
 2 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/recipes/live_law.recipe b/recipes/live_law.recipe
index 1e2dc56c8e..58bf6b4943 100644
--- a/recipes/live_law.recipe
+++ b/recipes/live_law.recipe
@@ -35,6 +35,7 @@ class livelaw(BasicNewsRecipe):
     ]
 
     remove_tags = [
+        classes('in-image-ad-wrap'),
         dict(
             name='div',
             attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}
@@ -91,7 +92,7 @@ class livelaw(BasicNewsRecipe):
     def is_accepted_entry(self, entry):
         # Those sections in the top nav bar that we will omit
         omit_list = [
-            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in'
+            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
         ]
         is_accepted = True
         for omit_entry in omit_list:
diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe
index 26cd09a29e..a112968816 100644
--- a/recipes/livemint.recipe
+++ b/recipes/livemint.recipe
@@ -13,7 +13,7 @@ class LiveMint(BasicNewsRecipe):
    title = u'Live Mint'
    description = 'Financial News from India.'
    language = 'en_IN'
-    __author__ = 'Krittika Goyal'
+    __author__ = 'Krittika Goyal, revised by unkn0wn'
    oldest_article = 1.15  # days
    max_articles_per_feed = 50
    encoding = 'utf-8'
@@ -48,6 +48,11 @@ class LiveMint(BasicNewsRecipe):
            ('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'),
            ('Smart Living','https://lifestyle.livemint.com/rss/smart-living'),
        ]
+
+        def preprocess_html(self, soup):
+            for img in soup.findAll('img', attrs={'data-img': True}):
+                img['src'] = img['data-img']
+            return soup
    else:
        # some wsj articles wont load
        extra_css = '''
@@ -90,31 +95,29 @@ class LiveMint(BasicNewsRecipe):
            ('Elections', 'https://www.livemint.com/rss/elections'),
        ]
 
-        def preprocess_raw_html(self, raw, *a):
-            if '<script>var wsjFlag=true;</script>' in raw:
-                m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
-                raw1 = raw[m.start():]
-                raw1 = raw1.split('>', 1)[1].strip()
-                data = json.JSONDecoder().raw_decode(raw1)[0]
-                value = data['hasPart']['value']
-                body = data['articleBody'] + '</p><p>' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1</p><p>\2', value)
-                body = '<p>' + body + '</p>'
-                raw = re.sub(r'<p>([^}]*)</p>', body, raw)
-                return raw
-            else:
-                return raw
+        def preprocess_raw_html(self, raw, *a):
+            if '<script>var wsjFlag=true;</script>' in raw:
+                m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
+                raw1 = raw[m.start():]
+                raw1 = raw1.split('>', 1)[1].strip()
+                data = json.JSONDecoder().raw_decode(raw1)[0]
+                value = data['hasPart']['value']
+                body = data['articleBody'] + '</p><p>'\
+                    + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1</p><p>\3', value)
+                body = '<p>' + body + '</p>'
+                raw = re.sub(r'<p>([^}]*)</p>', body, raw)
+                return raw
+            else:
+                return raw
 
-        def preprocess_html(self, soup):
-            for span in soup.findAll('figcaption'):
-                span['id'] = 'img-cap'
-            for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
-                auth['id'] = 'auth-info'
-                auth.name = 'div'
-            for span in soup.findAll('span', attrs={'class':'exclusive'}):
-                span.extract()
-            for img in soup.findAll('img', attrs={'data-src': True}):
-                img['src'] = img['data-src']
-            if is_saturday:
-                for img in soup.findAll('img', attrs={'data-img': True}):
-                    img['src'] = img['data-img']
-            return soup
+        def preprocess_html(self, soup):
+            for span in soup.findAll('figcaption'):
+                span['id'] = 'img-cap'
+            for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
+                auth['id'] = 'auth-info'
+                auth.name = 'div'
+            for span in soup.findAll('span', attrs={'class':'exclusive'}):
+                span.extract()
+            for img in soup.findAll('img', attrs={'data-src': True}):
+                img['src'] = img['data-src']
+            return soup