mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
update livemint
This commit is contained in:
parent
88d926143e
commit
6d3865ca10
@ -84,6 +84,7 @@ class LiveMint(BasicNewsRecipe):
|
||||
|
||||
extra_css = '''
|
||||
img {margin:0 auto;}
|
||||
.psTopLogoItem img, .ecologoStory { width:100; }
|
||||
#img-cap {font-size:small; text-align:center;}
|
||||
.summary, .highlights, .synopsis {
|
||||
font-weight:normal !important; font-style:italic; color:#202020;
|
||||
@ -129,7 +130,11 @@ class LiveMint(BasicNewsRecipe):
|
||||
|
||||
def preprocess_raw_html(self, raw, *a):
|
||||
# remove empty <p> tags (those containing only whitespace)
|
||||
raw = re.sub(r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)', '', raw)
|
||||
raw = re.sub(
|
||||
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
|
||||
r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', raw
|
||||
)
|
||||
)
|
||||
if '<script>var wsjFlag=true;</script>' in raw:
|
||||
m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
|
||||
raw1 = raw[m.start():]
|
||||
@ -141,8 +146,7 @@ class LiveMint(BasicNewsRecipe):
|
||||
body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
|
||||
raw2 = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
|
||||
return raw2
|
||||
else:
|
||||
return raw
|
||||
return raw
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
for strong in soup.findAll('strong'):
|
||||
|
@ -53,8 +53,8 @@ def parse_inline(inl):
|
||||
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
|
||||
if 'caption' in props:
|
||||
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
|
||||
props['caption']['text'], ' ' + props['caption']['credit']
|
||||
)
|
||||
props['caption']['text'], ' ' + props['caption']['credit']
|
||||
)
|
||||
yield '</p>'
|
||||
if inl.get('content', {}).get('name', '') == 'ImageGroup':
|
||||
if 'images' in inl['content']['props']:
|
||||
@ -64,8 +64,8 @@ def parse_inline(inl):
|
||||
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
|
||||
if 'caption' in imgs:
|
||||
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
|
||||
imgs['caption']['text'], ' ' + imgs['caption']['credit']
|
||||
)
|
||||
imgs['caption']['text'], ' ' + imgs['caption']['credit']
|
||||
)
|
||||
yield '</p>'
|
||||
|
||||
|
||||
@ -83,7 +83,7 @@ def parse_body(x):
|
||||
tag = x['type']
|
||||
if tag == 'inline':
|
||||
yield ''.join(parse_inline(x))
|
||||
elif 'attrs' in x and 'href' in x.get('attrs', {}):
|
||||
elif 'attrs' in x and 'href' in x.get('attrs', ''):
|
||||
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
|
||||
for yld in parse_cont(x):
|
||||
yield yld
|
||||
|
@ -52,8 +52,8 @@ def parse_inline(inl):
|
||||
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
|
||||
if 'caption' in props:
|
||||
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
|
||||
props['caption']['text'], ' ' + props['caption']['credit']
|
||||
)
|
||||
props['caption']['text'], ' ' + props['caption']['credit']
|
||||
)
|
||||
yield '</p>'
|
||||
if inl.get('content', {}).get('name', '') == 'ImageGroup':
|
||||
if 'images' in inl['content']['props']:
|
||||
@ -63,8 +63,8 @@ def parse_inline(inl):
|
||||
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
|
||||
if 'caption' in imgs:
|
||||
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
|
||||
imgs['caption']['text'], ' ' + imgs['caption']['credit']
|
||||
)
|
||||
imgs['caption']['text'], ' ' + imgs['caption']['credit']
|
||||
)
|
||||
yield '</p>'
|
||||
|
||||
|
||||
@ -82,7 +82,7 @@ def parse_body(x):
|
||||
tag = x['type']
|
||||
if tag == 'inline':
|
||||
yield ''.join(parse_inline(x))
|
||||
elif 'attrs' in x and 'href' in x.get('attrs', {}):
|
||||
elif 'attrs' in x and 'href' in x.get('attrs', ''):
|
||||
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
|
||||
for yld in parse_cont(x):
|
||||
yield yld
|
||||
|
@ -57,8 +57,8 @@ def parse_inline(inl):
|
||||
yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
|
||||
if 'caption' in props:
|
||||
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
|
||||
props['caption']['text'], ' ' + props['caption']['credit']
|
||||
)
|
||||
props['caption']['text'], ' ' + props['caption']['credit']
|
||||
)
|
||||
yield '</p>'
|
||||
if inl.get('content', {}).get('name', '') == 'ImageGroup':
|
||||
if 'images' in inl['content']['props']:
|
||||
@ -68,8 +68,8 @@ def parse_inline(inl):
|
||||
yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
|
||||
if 'caption' in imgs:
|
||||
yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
|
||||
imgs['caption']['text'], ' ' + imgs['caption']['credit']
|
||||
)
|
||||
imgs['caption']['text'], ' ' + imgs['caption']['credit']
|
||||
)
|
||||
yield '</p>'
|
||||
|
||||
|
||||
@ -87,7 +87,7 @@ def parse_body(x):
|
||||
tag = x['type']
|
||||
if tag == 'inline':
|
||||
yield ''.join(parse_inline(x))
|
||||
elif 'attrs' in x and 'href' in x.get('attrs', {}):
|
||||
elif 'attrs' in x and 'href' in x.get('attrs', ''):
|
||||
yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
|
||||
for yld in parse_cont(x):
|
||||
yield yld
|
||||
|
@ -27,7 +27,7 @@ class PhilosophyNow(BasicNewsRecipe):
|
||||
remove_tags = [dict(name='div', attrs={'id':'welcome_box'})]
|
||||
extra_css = '''
|
||||
img {display:block; margin:0 auto;}
|
||||
.articleImage { font-size:small; text-align:center; }
|
||||
.articleImageCaption { font-size:small; text-align:center; }
|
||||
em, blockquote { color:#202020; }
|
||||
'''
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user