update Live Law and Live Mint

This commit is contained in:
Kovid Goyal 2022-08-31 20:43:19 +05:30
parent a2e531db83
commit 3f013c3856
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 33 additions and 29 deletions

View File

@ -35,6 +35,7 @@ class livelaw(BasicNewsRecipe):
] ]
remove_tags = [ remove_tags = [
classes('in-image-ad-wrap'),
dict( dict(
name='div', name='div',
attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')} attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}
@ -91,7 +92,7 @@ class livelaw(BasicNewsRecipe):
def is_accepted_entry(self, entry): def is_accepted_entry(self, entry):
# Those sections in the top nav bar that we will omit # Those sections in the top nav bar that we will omit
omit_list = [ omit_list = [
'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in' 'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
] ]
is_accepted = True is_accepted = True
for omit_entry in omit_list: for omit_entry in omit_list:

View File

@ -13,7 +13,7 @@ class LiveMint(BasicNewsRecipe):
title = u'Live Mint' title = u'Live Mint'
description = 'Financial News from India.' description = 'Financial News from India.'
language = 'en_IN' language = 'en_IN'
__author__ = 'Krittika Goyal' __author__ = 'Krittika Goyal, revised by unkn0wn'
oldest_article = 1.15 # days oldest_article = 1.15 # days
max_articles_per_feed = 50 max_articles_per_feed = 50
encoding = 'utf-8' encoding = 'utf-8'
@ -48,6 +48,11 @@ class LiveMint(BasicNewsRecipe):
('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'), ('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'),
('Smart Living','https://lifestyle.livemint.com/rss/smart-living'), ('Smart Living','https://lifestyle.livemint.com/rss/smart-living'),
] ]
def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'data-img': True}):
img['src'] = img['data-img']
return soup
else: else:
# some wsj articles won't load # some wsj articles won't load
extra_css = ''' extra_css = '''
@ -90,31 +95,29 @@ class LiveMint(BasicNewsRecipe):
('Elections', 'https://www.livemint.com/rss/elections'), ('Elections', 'https://www.livemint.com/rss/elections'),
] ]
def preprocess_raw_html(self, raw, *a): def preprocess_raw_html(self, raw, *a):
if '<script>var wsjFlag=true;</script>' in raw: if '<script>var wsjFlag=true;</script>' in raw:
m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw) m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
raw1 = raw[m.start():] raw1 = raw[m.start():]
raw1 = raw1.split('>', 1)[1].strip() raw1 = raw1.split('>', 1)[1].strip()
data = json.JSONDecoder().raw_decode(raw1)[0] data = json.JSONDecoder().raw_decode(raw1)[0]
value = data['hasPart']['value'] value = data['hasPart']['value']
body = data['articleBody'] + '</p> <p>' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 <p> \2', value) body = data['articleBody'] + '</p> <p>'\
body = '<div class="FirstEle"> <p>' + body + '</p> </div>' + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw) body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
return raw raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
else: return raw
return raw else:
return raw
def preprocess_html(self, soup): def preprocess_html(self, soup):
for span in soup.findAll('figcaption'): for span in soup.findAll('figcaption'):
span['id'] = 'img-cap' span['id'] = 'img-cap'
for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}): for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
auth['id'] = 'auth-info' auth['id'] = 'auth-info'
auth.name = 'div' auth.name = 'div'
for span in soup.findAll('span', attrs={'class':'exclusive'}): for span in soup.findAll('span', attrs={'class':'exclusive'}):
span.extract() span.extract()
for img in soup.findAll('img', attrs={'data-src': True}): for img in soup.findAll('img', attrs={'data-src': True}):
img['src'] = img['data-src'] img['src'] = img['data-src']
if is_saturday: return soup
for img in soup.findAll('img', attrs={'data-img': True}):
img['src'] = img['data-img']
return soup