mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
update Live Law and Live Mint
This commit is contained in:
parent a2e531db83
commit 3f013c3856
@@ -35,6 +35,7 @@ class livelaw(BasicNewsRecipe):
     ]
 
     remove_tags = [
+        classes('in-image-ad-wrap'),
         dict(
             name='div',
             attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}
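For readers less familiar with how calibre's remove_tags matchers behave, here is a small standalone sketch (plain bs4 with invented markup, not the recipe or calibre's own BeautifulSoup wrapper) of the id-lambda matcher shown in this hunk: any div whose id starts with 'inside_post_content_ad' is pulled out of the article. The newly added classes('in-image-ad-wrap') entry removes in-image ad wrappers the same way, matching on class instead of id.

# Standalone sketch only: plain bs4 and made-up markup, mimicking what calibre
# does with the remove_tags entry above during conversion.
from bs4 import BeautifulSoup

html = ('<div id="inside_post_content_ad_1">sponsored</div>'
        '<div id="story-body">keep this paragraph</div>')
soup = BeautifulSoup(html, 'html.parser')

# Same predicate as in the hunk: the id must exist and start with the ad prefix.
for div in soup.find_all('div', attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}):
    div.extract()

print(soup)  # <div id="story-body">keep this paragraph</div>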
@@ -91,7 +92,7 @@ class livelaw(BasicNewsRecipe):
     def is_accepted_entry(self, entry):
         # Those sections in the top nav bar that we will omit
         omit_list = [
-            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in'
+            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
         ]
         is_accepted = True
         for omit_entry in omit_list:
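The body of the loop sits outside this hunk, so the exact test is not shown here; the intent of the change, though, is that the nav bar's dummy 'javascript:void(0);' links should be skipped along with the other omitted sections. A hedged sketch of that kind of filter, assuming an entry is rejected when its URL contains one of the omit_list markers:

# Hedged sketch only: the real is_accepted_entry loop body is not part of this
# hunk, and the recipe may test the entry URL differently.
omit_list = [
    'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
]

def is_accepted(url):
    # Reject any entry whose URL contains one of the omitted markers.
    return not any(marker in url for marker in omit_list)

print(is_accepted('https://www.livelaw.in/top-stories/some-judgment'))  # True
print(is_accepted('javascript:void(0);'))  # False, caught by the newly added marker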
@@ -13,7 +13,7 @@ class LiveMint(BasicNewsRecipe):
     title = u'Live Mint'
     description = 'Financial News from India.'
     language = 'en_IN'
-    __author__ = 'Krittika Goyal'
+    __author__ = 'Krittika Goyal, revised by unkn0wn'
     oldest_article = 1.15 # days
     max_articles_per_feed = 50
     encoding = 'utf-8'
@@ -48,6 +48,11 @@ class LiveMint(BasicNewsRecipe):
             ('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'),
             ('Smart Living','https://lifestyle.livemint.com/rss/smart-living'),
         ]
+
+        def preprocess_html(self, soup):
+            for img in soup.findAll('img', attrs={'data-img': True}):
+                img['src'] = img['data-img']
+            return soup
     else:
         # some wsj articles wont load
         extra_css = '''
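The Saturday branch gains its own preprocess_html that copies the real image URL out of the lazy-loading data-img attribute into src, so the converted book actually embeds the pictures. A minimal standalone illustration (plain bs4, invented markup and URL, not the calibre BeautifulSoup wrapper):

# Standalone illustration of the data-img -> src swap; markup and URL are
# invented for the example.
from bs4 import BeautifulSoup

html = '<p><img data-img="https://images.example.com/story.jpg" src="spacer.gif"></p>'
soup = BeautifulSoup(html, 'html.parser')

for img in soup.find_all('img', attrs={'data-img': True}):
    img['src'] = img['data-img']  # point src at the real image

print(soup.img['src'])  # https://images.example.com/story.jpg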
@@ -90,31 +95,29 @@ class LiveMint(BasicNewsRecipe):
             ('Elections', 'https://www.livemint.com/rss/elections'),
         ]
 
         def preprocess_raw_html(self, raw, *a):
             if '<script>var wsjFlag=true;</script>' in raw:
                 m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
                 raw1 = raw[m.start():]
                 raw1 = raw1.split('>', 1)[1].strip()
                 data = json.JSONDecoder().raw_decode(raw1)[0]
                 value = data['hasPart']['value']
-                body = data['articleBody'] + '</p> <p>' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 <p> \2', value)
+                body = data['articleBody'] + '</p> <p>'\
+                    + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
                 body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
                 raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
                 return raw
             else:
                 return raw
 
         def preprocess_html(self, soup):
             for span in soup.findAll('figcaption'):
                 span['id'] = 'img-cap'
             for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
                 auth['id'] = 'auth-info'
                 auth.name = 'div'
             for span in soup.findAll('span', attrs={'class':'exclusive'}):
                 span.extract()
             for img in soup.findAll('img', attrs={'data-src': True}):
                 img['src'] = img['data-src']
-            if is_saturday:
-                for img in soup.findAll('img', attrs={'data-img': True}):
-                    img['src'] = img['data-img']
             return soup
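In the weekday branch, the is_saturday image handling is removed from preprocess_html (it moved into the Saturday branch above), and the substantive change is the broader sentence-splitting regex in preprocess_raw_html: the old pattern only broke where a lowercase letter or digit was followed by '.' and a capital letter; the new one accepts any non-uppercase character before the period, also breaks after a closing curly quote (.”), and lets the next sentence open with a curly quote (“), so quoted sentences in the reconstructed WSJ article body get their own paragraphs. An illustrative comparison on an invented snippet (not recipe code):

import re

# Invented run-together sample in the style the recipe is untangling; the curly
# quotes matter because that is what the new pattern targets.
value = 'The bench allowed the plea.The State said, “We will comply.”A detailed order follows.'

old = re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 <p> \2', value)
new = re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)

print(old)  # splits only at 'plea.The'
print(new)  # also splits at 'comply.”A', giving the quoted sentence its own paragraph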