update livemint

This commit is contained in:
unkn0w7n 2024-03-11 09:27:33 +05:30
parent 88d926143e
commit 6d3865ca10
5 changed files with 23 additions and 19 deletions

View File

@ -84,6 +84,7 @@ class LiveMint(BasicNewsRecipe):
extra_css = '''
img {margin:0 auto;}
.psTopLogoItem img, .ecologoStory { width:100; }
#img-cap {font-size:small; text-align:center;}
.summary, .highlights, .synopsis {
font-weight:normal !important; font-style:italic; color:#202020;
@ -129,7 +130,11 @@ class LiveMint(BasicNewsRecipe):
def preprocess_raw_html(self, raw, *a):
# remove empty p tags
raw = re.sub(r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)', '', raw)
raw = re.sub(
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)', '', raw
)
)
if '<script>var wsjFlag=true;</script>' in raw:
m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
raw1 = raw[m.start():]
@ -141,8 +146,7 @@ class LiveMint(BasicNewsRecipe):
body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
raw2 = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
return raw2
else:
return raw
return raw
def preprocess_html(self, soup):
for strong in soup.findAll('strong'):

View File

@ -53,8 +53,8 @@ def parse_inline(inl):
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
if 'caption' in props:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
props['caption']['text'], ' ' + props['caption']['credit']
)
props['caption']['text'], ' ' + props['caption']['credit']
)
yield '</p>'
if inl.get('content', {}).get('name', '') == 'ImageGroup':
if 'images' in inl['content']['props']:
@ -64,8 +64,8 @@ def parse_inline(inl):
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
if 'caption' in imgs:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
yield '</p>'
@ -83,7 +83,7 @@ def parse_body(x):
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
elif 'attrs' in x and 'href' in x.get('attrs', ''):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld

View File

@ -52,8 +52,8 @@ def parse_inline(inl):
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
if 'caption' in props:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
props['caption']['text'], ' ' + props['caption']['credit']
)
props['caption']['text'], ' ' + props['caption']['credit']
)
yield '</p>'
if inl.get('content', {}).get('name', '') == 'ImageGroup':
if 'images' in inl['content']['props']:
@ -63,8 +63,8 @@ def parse_inline(inl):
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
if 'caption' in imgs:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
yield '</p>'
@ -82,7 +82,7 @@ def parse_body(x):
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
elif 'attrs' in x and 'href' in x.get('attrs', ''):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld

View File

@ -57,8 +57,8 @@ def parse_inline(inl):
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
if 'caption' in props:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
props['caption']['text'], ' ' + props['caption']['credit']
)
props['caption']['text'], ' ' + props['caption']['credit']
)
yield '</p>'
if inl.get('content', {}).get('name', '') == 'ImageGroup':
if 'images' in inl['content']['props']:
@ -68,8 +68,8 @@ def parse_inline(inl):
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
if 'caption' in imgs:
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
imgs['caption']['text'], ' ' + imgs['caption']['credit']
)
yield '</p>'
@ -87,7 +87,7 @@ def parse_body(x):
tag = x['type']
if tag == 'inline':
yield ''.join(parse_inline(x))
elif 'attrs' in x and 'href' in x.get('attrs', {}):
elif 'attrs' in x and 'href' in x.get('attrs', ''):
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
for yld in parse_cont(x):
yield yld

View File

@ -27,7 +27,7 @@ class PhilosophyNow(BasicNewsRecipe):
remove_tags = [dict(name='div', attrs={'id':'welcome_box'})]
extra_css = '''
img {display:block; margin:0 auto;}
.articleImage { font-size:small; text-align:center; }
.articleImageCaption { font-size:small; text-align:center; }
em, blockquote { color:#202020; }
'''