mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
update Live Law and Live Mint
This commit is contained in:
parent a2e531db83
commit 3f013c3856
@@ -35,6 +35,7 @@ class livelaw(BasicNewsRecipe):
     ]
 
     remove_tags = [
+        classes('in-image-ad-wrap'),
         dict(
             name='div',
             attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}
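For readers less familiar with how calibre's remove_tags matchers behave, here is a small standalone sketch (plain bs4 with invented markup, not the recipe or calibre's own BeautifulSoup wrapper) of the id-lambda matcher shown in this hunk: any div whose id starts with 'inside_post_content_ad' is pulled out of the article. The newly added classes('in-image-ad-wrap') entry removes in-image ad wrappers the same way, matching on class instead of id.

# Standalone sketch only: plain bs4 and made-up markup, mimicking what calibre
# does with the remove_tags entry above during conversion.
from bs4 import BeautifulSoup

html = ('<div id="inside_post_content_ad_1">sponsored</div>'
        '<div id="story-body">keep this paragraph</div>')
soup = BeautifulSoup(html, 'html.parser')

# Same predicate as in the hunk: the id must exist and start with the ad prefix.
for div in soup.find_all('div', attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')}):
    div.extract()

print(soup)  # <div id="story-body">keep this paragraph</div>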
@@ -91,7 +92,7 @@ class livelaw(BasicNewsRecipe):
     def is_accepted_entry(self, entry):
         # Those sections in the top nav bar that we will omit
         omit_list = [
-            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in'
+            'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
         ]
         is_accepted = True
         for omit_entry in omit_list:
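The body of the loop sits outside this hunk, so the exact test is not shown here; the intent of the change, though, is that the nav bar's dummy 'javascript:void(0);' links should be skipped along with the other omitted sections. A hedged sketch of that kind of filter, assuming an entry is rejected when its URL contains one of the omit_list markers:

# Hedged sketch only: the real is_accepted_entry loop body is not part of this
# hunk, and the recipe may test the entry URL differently.
omit_list = [
    'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);',
]

def is_accepted(url):
    # Reject any entry whose URL contains one of the omitted markers.
    return not any(marker in url for marker in omit_list)

print(is_accepted('https://www.livelaw.in/top-stories/some-judgment'))  # True
print(is_accepted('javascript:void(0);'))  # False, caught by the newly added marker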
@@ -13,7 +13,7 @@ class LiveMint(BasicNewsRecipe):
     title = u'Live Mint'
     description = 'Financial News from India.'
     language = 'en_IN'
-    __author__ = 'Krittika Goyal'
+    __author__ = 'Krittika Goyal, revised by unkn0wn'
     oldest_article = 1.15 # days
     max_articles_per_feed = 50
     encoding = 'utf-8'
@@ -48,6 +48,11 @@ class LiveMint(BasicNewsRecipe):
             ('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'),
             ('Smart Living','https://lifestyle.livemint.com/rss/smart-living'),
         ]
+
+        def preprocess_html(self, soup):
+            for img in soup.findAll('img', attrs={'data-img': True}):
+                img['src'] = img['data-img']
+            return soup
     else:
         # some wsj articles wont load
         extra_css = '''
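The Saturday branch gains its own preprocess_html that copies the real image URL out of the lazy-loading data-img attribute into src, so the converted book actually embeds the pictures. A minimal standalone illustration (plain bs4, invented markup and URL, not the calibre BeautifulSoup wrapper):

# Standalone illustration of the data-img -> src swap; markup and URL are
# invented for the example.
from bs4 import BeautifulSoup

html = '<p><img data-img="https://images.example.com/story.jpg" src="spacer.gif"></p>'
soup = BeautifulSoup(html, 'html.parser')

for img in soup.find_all('img', attrs={'data-img': True}):
    img['src'] = img['data-img']  # point src at the real image

print(soup.img['src'])  # https://images.example.com/story.jpg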
@@ -90,31 +95,29 @@ class LiveMint(BasicNewsRecipe):
             ('Elections', 'https://www.livemint.com/rss/elections'),
         ]
 
         def preprocess_raw_html(self, raw, *a):
             if '<script>var wsjFlag=true;</script>' in raw:
                 m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw)
                 raw1 = raw[m.start():]
                 raw1 = raw1.split('>', 1)[1].strip()
                 data = json.JSONDecoder().raw_decode(raw1)[0]
                 value = data['hasPart']['value']
-                body = data['articleBody'] + '</p> <p>' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 <p> \2', value)
+                body = data['articleBody'] + '</p> <p>'\
+                    + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)
                 body = '<div class="FirstEle"> <p>' + body + '</p> </div>'
                 raw = re.sub(r'<div class="FirstEle">([^}]*)</div>', body, raw)
                 return raw
             else:
                 return raw
 
         def preprocess_html(self, soup):
             for span in soup.findAll('figcaption'):
                 span['id'] = 'img-cap'
             for auth in soup.findAll('span', attrs={'class':['articleInfo pubtime','articleInfo author']}):
                 auth['id'] = 'auth-info'
                 auth.name = 'div'
             for span in soup.findAll('span', attrs={'class':'exclusive'}):
                 span.extract()
             for img in soup.findAll('img', attrs={'data-src': True}):
                 img['src'] = img['data-src']
-            if is_saturday:
-                for img in soup.findAll('img', attrs={'data-img': True}):
-                    img['src'] = img['data-img']
             return soup
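In the weekday branch, the is_saturday image handling is removed from preprocess_html (it moved into the Saturday branch above), and the substantive change is the broader sentence-splitting regex in preprocess_raw_html: the old pattern only broke where a lowercase letter or digit was followed by '.' and a capital letter; the new one accepts any non-uppercase character before the period, also breaks after a closing curly quote (.”), and lets the next sentence open with a curly quote (“), so quoted sentences in the reconstructed WSJ article body get their own paragraphs. An illustrative comparison on an invented snippet (not recipe code):

import re

# Invented run-together sample in the style the recipe is untangling; the curly
# quotes matter because that is what the new pattern targets.
value = 'The bench allowed the plea.The State said, “We will comply.”A detailed order follows.'

old = re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1 <p> \2', value)
new = re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1 <p> \3', value)

print(old)  # splits only at 'plea.The'
print(new)  # also splits at 'comply.”A', giving the quoted sentence its own paragraph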