mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
update Live Law and Live Mint
This commit is contained in:
parent
a2e531db83
commit
3f013c3856
@ -35,6 +35,7 @@ class livelaw(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
|
classes('in-image-ad-wrap'),
|
||||||
dict(
|
dict(
|
||||||
name='div',
|
name='div',
|
||||||
attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}
|
attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}
|
||||||
@ -91,7 +92,7 @@ class livelaw(BasicNewsRecipe):
|
|||||||
def is_accepted_entry(self, entry):
|
def is_accepted_entry(self, entry):
|
||||||
# Those sections in the top nav bar that we will omit
|
# Those sections in the top nav bar that we will omit
|
||||||
omit_list = [
|
omit_list = [
|
||||||
'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in'
|
'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
|
||||||
]
|
]
|
||||||
is_accepted = True
|
is_accepted = True
|
||||||
for omit_entry in omit_list:
|
for omit_entry in omit_list:
|
||||||
|
@ -13,7 +13,7 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
title = u'Live Mint'
|
title = u'Live Mint'
|
||||||
description = 'Financial News from India.'
|
description = 'Financial News from India.'
|
||||||
language = 'en_IN'
|
language = 'en_IN'
|
||||||
__author__ = 'Krittika Goyal'
|
__author__ = 'Krittika Goyal, revised by unkn0wn'
|
||||||
oldest_article = 1.15 # days
|
oldest_article = 1.15 # days
|
||||||
max_articles_per_feed = 50
|
max_articles_per_feed = 50
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
@ -48,6 +48,11 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'),
|
('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'),
|
||||||
('Smart Living','https://lifestyle.livemint.com/rss/smart-living'),
|
('Smart Living','https://lifestyle.livemint.com/rss/smart-living'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for img in soup.findAll('img', attrs={'data-img': True}):
|
||||||
|
img['src'] = img['data-img']
|
||||||
|
return soup
|
||||||
else:
|
else:
|
||||||
# some wsj articles wont load
|
# some wsj articles wont load
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
@ -90,31 +95,29 @@ class LiveMint(BasicNewsRecipe):
|
|||||||
('Elections', 'https://www.livemint.com/rss/elections'),
|
('Elections', 'https://www.livemint.com/rss/elections'),
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, *a):
|
def preprocess_raw_html(self, raw, *a):
|
||||||
if '<script>var wsjFlag=true;</script>' in raw:
|
if '<script>var wsjFlag=true;</script>' in raw:
|
||||||
m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
|
m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
|
||||||
raw1 = raw[m.start():]
|
raw1 = raw[m.start():]
|
||||||
raw1 = raw1.split('>', 1)[1].strip()
|
raw1 = raw1.split('>', 1)[1].strip()
|
||||||
data = json.JSONDecoder().raw_decode(raw1)[0]
|
data = json.JSONDecoder().raw_decode(raw1)[0]
|
||||||
value = data['hasPart']['value']
|
value = data['hasPart']['value']
|
||||||
body = data['articleBody'] + '</p> <p>' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 <p> \2', value)
|
body = data['articleBody'] + '</p> <p>'\
|
||||||
body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
|
+ re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
|
||||||
raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
|
body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
|
||||||
return raw
|
raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
|
||||||
else:
|
return raw
|
||||||
return raw
|
else:
|
||||||
|
return raw
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for span in soup.findAll('figcaption'):
|
for span in soup.findAll('figcaption'):
|
||||||
span['id'] = 'img-cap'
|
span['id'] = 'img-cap'
|
||||||
for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
|
for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
|
||||||
auth['id'] = 'auth-info'
|
auth['id'] = 'auth-info'
|
||||||
auth.name = 'div'
|
auth.name = 'div'
|
||||||
for span in soup.findAll('span', attrs={'class':'exclusive'}):
|
for span in soup.findAll('span', attrs={'class':'exclusive'}):
|
||||||
span.extract()
|
span.extract()
|
||||||
for img in soup.findAll('img', attrs={'data-src': True}):
|
for img in soup.findAll('img', attrs={'data-src': True}):
|
||||||
img['src'] = img['data-src']
|
img['src'] = img['data-src']
|
||||||
if is_saturday:
|
return soup
|
||||||
for img in soup.findAll('img', attrs={'data-img': True}):
|
|
||||||
img['src'] = img['data-img']
|
|
||||||
return soup
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user