update livemint

2025-07-09 03:04:10 -04:00 · 2024-03-11 09:27:33 +05:30 · 2024-03-11 09:27:33 +05:30 · 6d3865ca10
commit 6d3865ca10
parent 88d926143e
5 changed files with 23 additions and 19 deletions
--- a/recipes/livemint.recipe
+++ b/recipes/livemint.recipe
@@ -84,6 +84,7 @@ class LiveMint(BasicNewsRecipe):

        extra_css = '''
            img {margin:0 auto;}
+            .psTopLogoItem img, .ecologoStory { width:100; }
            #img-cap {font-size:small; text-align:center;}
            .summary, .highlights, .synopsis {
                font-weight:normal !important; font-style:italic; color:#202020;
@@ -129,7 +130,11 @@ class LiveMint(BasicNewsRecipe):

        def preprocess_raw_html(self, raw, *a):
            # remove empty p tags
-            raw = re.sub(r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)', '', raw)
+            raw = re.sub(
+                r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
+                    r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)', '', raw
+                )
+            )
            if '<script>var wsjFlag=true;</script>' in raw:
                m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
                raw1 = raw[m.start():]
@@ -141,8 +146,7 @@ class LiveMint(BasicNewsRecipe):
                body = '<div class="FirstEle"> <p>' +  body  + '</p> </div>'
                raw2 = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
                return raw2
-            else:
-                return raw
+            return raw

        def preprocess_html(self, soup):
            for strong in soup.findAll('strong'):
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -53,8 +53,8 @@ def parse_inline(inl):
            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
        if 'caption' in props:
            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                    props['caption']['text'], ' ' + props['caption']['credit']
-                )
+                props['caption']['text'], ' ' + props['caption']['credit']
+            )
        yield '</p>'
    if inl.get('content', {}).get('name', '') == 'ImageGroup':
        if 'images' in inl['content']['props']:
@@ -64,8 +64,8 @@ def parse_inline(inl):
                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
                if 'caption' in imgs:
                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                    imgs['caption']['text'], ' ' + imgs['caption']['credit']
-                )
+                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                    )
                yield '</p>'


@@ -83,7 +83,7 @@ def parse_body(x):
            tag = x['type']
            if tag == 'inline':
                yield ''.join(parse_inline(x))
-            elif 'attrs' in x and 'href' in x.get('attrs', {}):
+            elif 'attrs' in x and 'href' in x.get('attrs', ''):
                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                for yld in parse_cont(x):
                    yield yld
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -52,8 +52,8 @@ def parse_inline(inl):
            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
        if 'caption' in props:
            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                    props['caption']['text'], ' ' + props['caption']['credit']
-                )
+                props['caption']['text'], ' ' + props['caption']['credit']
+            )
        yield '</p>'
    if inl.get('content', {}).get('name', '') == 'ImageGroup':
        if 'images' in inl['content']['props']:
@@ -63,8 +63,8 @@ def parse_inline(inl):
                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
                if 'caption' in imgs:
                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                    imgs['caption']['text'], ' ' + imgs['caption']['credit']
-                )
+                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                    )
                yield '</p>'


@@ -82,7 +82,7 @@ def parse_body(x):
            tag = x['type']
            if tag == 'inline':
                yield ''.join(parse_inline(x))
-            elif 'attrs' in x and 'href' in x.get('attrs', {}):
+            elif 'attrs' in x and 'href' in x.get('attrs', ''):
                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                for yld in parse_cont(x):
                    yield yld
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -57,8 +57,8 @@ def parse_inline(inl):
            yield '<div class="img"><img src="{}"></div>'.format(props['image']['src'])
        if 'caption' in props:
            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                    props['caption']['text'], ' ' + props['caption']['credit']
-                )
+                props['caption']['text'], ' ' + props['caption']['credit']
+            )
        yield '</p>'
    if inl.get('content', {}).get('name', '') == 'ImageGroup':
        if 'images' in inl['content']['props']:
@@ -68,8 +68,8 @@ def parse_inline(inl):
                    yield '<div class="img"><img src="{}"></div>'.format(imgs['src'])
                if 'caption' in imgs:
                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                    imgs['caption']['text'], ' ' + imgs['caption']['credit']
-                )
+                        imgs['caption']['text'], ' ' + imgs['caption']['credit']
+                    )
                yield '</p>'


@@ -87,7 +87,7 @@ def parse_body(x):
            tag = x['type']
            if tag == 'inline':
                yield ''.join(parse_inline(x))
-            elif 'attrs' in x and 'href' in x.get('attrs', {}):
+            elif 'attrs' in x and 'href' in x.get('attrs', ''):
                yield '<' + tag + ' href = "{}">'.format(x['attrs']['href'])
                for yld in parse_cont(x):
                    yield yld
--- a/recipes/phillosophy_now.recipe
+++ b/recipes/phillosophy_now.recipe
@@ -27,7 +27,7 @@ class PhilosophyNow(BasicNewsRecipe):
    remove_tags = [dict(name='div', attrs={'id':'welcome_box'})]
    extra_css = '''
        img {display:block; margin:0 auto;}
-        .articleImage { font-size:small; text-align:center; }
+        .articleImageCaption { font-size:small; text-align:center; }
        em, blockquote { color:#202020; }
    '''