From 3f013c3856b10bd6fbffcd0f0084c7d59773d47e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 31 Aug 2022 20:43:19 +0530
Subject: [PATCH] update Live Law and Live Mint
---
recipes/live_law.recipe | 3 ++-
recipes/livemint.recipe | 59 ++++++++++++++++++++++-------------------
2 files changed, 33 insertions(+), 29 deletions(-)
diff --git a/recipes/live_law.recipe b/recipes/live_law.recipe
index 1e2dc56c8e..58bf6b4943 100644
--- a/recipes/live_law.recipe
+++ b/recipes/live_law.recipe
@@ -35,6 +35,7 @@ class livelaw(BasicNewsRecipe):
     ]
     remove_tags = [
+        classes('in-image-ad-wrap'),
         dict(
             name='div',
             attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}
@@ -91,7 +92,7 @@ class livelaw(BasicNewsRecipe):
     def is_accepted_entry(self, entry):
         # Those sections in the top nav bar that we will omit
         omit_list = [
-            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in'
+            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
         ]
         is_accepted = True
         for omit_entry in omit_list:
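
The hunk above is cut off after the loop header; the remainder of is_accepted_entry is unchanged by this patch. For readers unfamiliar with the recipe, here is a minimal standalone sketch of the same substring filtering idea (the function and variable names below are illustrative, not the recipe's actual code):

    def is_accepted(url, omit_list):
        # A feed entry is rejected when its URL contains any omitted marker,
        # e.g. the newly added 'javascript:void(0);' placeholder links that
        # the top nav bar uses for menu items without a real target.
        return not any(marker in url for marker in omit_list)

    omit_list = ['videos', 'sponsored', 'javascript:void(0);']
    print(is_accepted('https://www.livelaw.in/top-stories/some-article', omit_list))  # True
    print(is_accepted('javascript:void(0);', omit_list))                              # False
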
diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe
index 26cd09a29e..a112968816 100644
--- a/recipes/livemint.recipe
+++ b/recipes/livemint.recipe
@@ -13,7 +13,7 @@ class LiveMint(BasicNewsRecipe):
     title = u'Live Mint'
     description = 'Financial News from India.'
     language = 'en_IN'
-    __author__ = 'Krittika Goyal'
+    __author__ = 'Krittika Goyal, revised by unkn0wn'
     oldest_article = 1.15  # days
     max_articles_per_feed = 50
     encoding = 'utf-8'
@@ -48,6 +48,11 @@ class LiveMint(BasicNewsRecipe):
             ('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'),
             ('Smart Living','https://lifestyle.livemint.com/rss/smart-living'),
         ]
+
+        def preprocess_html(self, soup):
+            for img in soup.findAll('img', attrs={'data-img': True}):
+                img['src'] = img['data-img']
+            return soup
     else:
         # some wsj articles wont load
         extra_css = '''
@@ -90,31 +95,29 @@ class LiveMint(BasicNewsRecipe):
             ('Elections', 'https://www.livemint.com/rss/elections'),
         ]
-    def preprocess_raw_html(self, raw, *a):
-        if '<script>var wsjFlag=true;</script>' in raw:
-            m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
-            raw1 = raw[m.start():]
-            raw1 = raw1.split('>', 1)[1].strip()
-            data = json.JSONDecoder().raw_decode(raw1)[0]
-            value = data['hasPart']['value']
-            body = data['articleBody'] + '</p> <p>' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 </p> <p> \2', value)
-            body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
-            raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
-            return raw
-        else:
-            return raw
+        def preprocess_raw_html(self, raw, *a):
+            if '<script>var wsjFlag=true;</script>' in raw:
+                m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
+                raw1 = raw[m.start():]
+                raw1 = raw1.split('>', 1)[1].strip()
+                data = json.JSONDecoder().raw_decode(raw1)[0]
+                value = data['hasPart']['value']
+                body = data['articleBody'] + '</p> <p>'\
+                    + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 </p> <p> \3', value)
+                body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
+                raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
+                return raw
+            else:
+                return raw
-    def preprocess_html(self, soup):
-        for span in soup.findAll('figcaption'):
-            span['id'] = 'img-cap'
-        for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
-            auth['id'] = 'auth-info'
-            auth.name = 'div'
-        for span in soup.findAll('span', attrs={'class':'exclusive'}):
-            span.extract()
-        for img in soup.findAll('img', attrs={'data-src': True}):
-            img['src'] = img['data-src']
-        if is_saturday:
-            for img in soup.findAll('img', attrs={'data-img': True}):
-                img['src'] = img['data-img']
-        return soup
+        def preprocess_html(self, soup):
+            for span in soup.findAll('figcaption'):
+                span['id'] = 'img-cap'
+            for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
+                auth['id'] = 'auth-info'
+                auth.name = 'div'
+            for span in soup.findAll('span', attrs={'class':'exclusive'}):
+                span.extract()
+            for img in soup.findAll('img', attrs={'data-src': True}):
+                img['src'] = img['data-src']
+            return soup
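
Both the old and the new preprocess_raw_html lean on json.JSONDecoder().raw_decode, which parses a single JSON value from the start of a string and ignores any trailing text, so the ld+json payload can be decoded straight out of the raw page. A minimal sketch of that step, assuming the page embeds a NewsArticle JSON-LD block; the sample HTML string and its field values are invented for illustration, and only the regex and the decoding lines mirror the recipe:

    import json
    import re

    # Invented stand-in for a downloaded article page.
    raw = ('<script type="application/ld+json">{"@type": "NewsArticle",'
           ' "articleBody": "Free intro.", "hasPart": {"value": "Premium text."}}'
           '</script><p>rest of the page</p>')

    # Locate the NewsArticle JSON-LD block, skip past the '>' that closes the
    # opening script tag, then decode just the JSON object that follows.
    m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
    raw1 = raw[m.start():].split('>', 1)[1].strip()
    data = json.JSONDecoder().raw_decode(raw1)[0]  # trailing '</script>...' is ignored
    print(data['articleBody'], data['hasPart']['value'])
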