This commit is contained in:
unkn0w7n 2023-07-26 10:02:12 +05:30
parent c419c58c97
commit ee62e1b5ba
2 changed files with 29 additions and 12 deletions

View File

@ -121,7 +121,11 @@ class Bloomberg(BasicNewsRecipe):
data = json.loads(m2[0].text)
data = data['props']['pageProps']['story']
title = '<h1>' + data['headline'] + '</h1>'
art_url = data['url']
if not art_url.startswith('http'):
art_url = 'https://www.bloomberg.com' + art_url
title = '<h1 title="{}">'.format(art_url) + data['headline'] + '</h1>'
cat = subhead = lede = auth = caption = ''
@ -129,7 +133,7 @@ class Bloomberg(BasicNewsRecipe):
cat = '<p class="cat">' + data['primaryCategory'] + '</p>'
if len(data['abstract']) != 0 and len(data['abstract']) == 2:
subhead = '<div class="subhead"><p>' + data['abstract'][0] + '</p><p>' + data['abstract'][1] + '</p></div>'
subhead = '<div class="subhead"><p>' + data['abstract'][0] + ' </p><p>' + data['abstract'][1] + '</p></div>'
else:
if 'summary' in data:
subhead = '<div class="subhead"><p>' + data['summary'] + '</p></div>'
@ -175,3 +179,9 @@ class Bloomberg(BasicNewsRecipe):
for img in soup.findAll('img', attrs={'src':lambda x: x and x.endswith(('-1x-1.jpg', '-1x-1.png'))}):
img['src'] = img['src'].replace('-1x-1', '750x-1')
return soup
def populate_article_metadata(self, article, soup, first):
    """Fill in article metadata after the HTML has been downloaded.

    The canonical article URL was stashed in the <h1> 'title' attribute
    during preprocessing; the subhead <div> doubles as the summary, and
    the ' - Bloomberg' suffix is stripped from the fetched title.
    """
    article.url = soup.find('h1')['title']
    # Look the subhead up once and reuse it for both summary fields.
    subhead = soup.find('div', attrs={'class': 'subhead'})
    blurb = self.tag_to_string(subhead)
    article.summary = blurb
    article.text_summary = blurb
    article.title = article.title.replace(' - Bloomberg', '')

View File

@ -26,15 +26,13 @@ class BusinessStandard(BasicNewsRecipe):
ignore_duplicate_articles = {'title', 'url'}
remove_empty_feeds = True
resolve_internal_links = True
simultaneous_downloads = 1
max_articles_per_feed = 20
extra_css = '''
img {display:block; margin:0 auto;}
.auth, .cat { font-size:small; color:#202020; }
.cap { font-size:small; text-align:center; }
'''
art_url = ''
art_desc = ''
articles_are_obfuscated = True
@ -47,7 +45,6 @@ class BusinessStandard(BasicNewsRecipe):
]
if any(x in link['href'] for x in skip_sections):
self.abort_article('skipping video links ', link['href'])
self.art_url = link['href']
self.log('Found ', link['href'])
html = br.open(link['href']).read()
pt = PersistentTemporaryFile('.html')
@ -72,9 +69,16 @@ class BusinessStandard(BasicNewsRecipe):
m = root.xpath('//script[@id="__NEXT_DATA__"]')
data = json.loads(m[0].text)
img_url = None
if 'articleImageUrl' in data['props']['pageProps']['articleSchema']:
img_url = data['props']['pageProps']['articleSchema']['articleImageUrl']
art_url = 'https://www.business-standard.com' + data['props']['pageProps']['url']
data = data['props']['pageProps']['data']
title = '<h1>' + data['pageTitle'] + '</h1>'
title = '<h1 title=art_url>' + data['pageTitle'] + '</h1>'
cat = subhead = lede = auth = caption = ''
@ -92,7 +96,10 @@ class BusinessStandard(BasicNewsRecipe):
auth = '<div><p class="auth">' + data['multiple_authors_name'] + ' | ' + data['placeName'] + ' | ' + date + '</p></div>'
if data['featuredImageObj'] and 'url' in data['featuredImageObj']:
lede = '<p class="cap"><img src="{}">'.format(data['featuredImageObj']['url'])
if img_url is not None:
lede = '<p class="cap"><img src="{}">'.format(img_url)
else:
lede = '<p class="cap"><img src="{}">'.format(data['featuredImageObj']['url'])
if 'alt_text' in data['featuredImageObj']:
caption = '<span>' + data['featuredImageObj']['alt_text'] + '</span></p>'
@ -101,7 +108,7 @@ class BusinessStandard(BasicNewsRecipe):
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div><p></p>' + body + '</div></body></html>'
def populate_article_metadata(self, article, soup, first):
    """Fill in article metadata after the HTML has been downloaded.

    Defects fixed: the stale assignments from ``self.art_url`` /
    ``self.art_desc`` were dead stores — they were immediately
    overwritten by the values recovered from the page — and
    ``tag_to_string(soup.find('h3'))`` was computed twice for the two
    summary fields.

    The article URL travels in the <h1> 'title' attribute (written
    during preprocessing); the first <h3> (the subhead) serves as the
    summary, and the ' - Business Standard' suffix is stripped from the
    fetched title.
    """
    article.url = soup.find('h1')['title']
    blurb = self.tag_to_string(soup.find('h3'))
    article.summary = blurb
    article.text_summary = blurb
    article.title = article.title.replace(' - Business Standard', '')