This commit is contained in:
unkn0w7n 2023-07-26 10:02:12 +05:30
parent c419c58c97
commit ee62e1b5ba
2 changed files with 29 additions and 12 deletions

View File

@ -121,7 +121,11 @@ class Bloomberg(BasicNewsRecipe):
data = json.loads(m2[0].text)
data = data['props']['pageProps']['story']
title = '<h1>' + data['headline'] + '</h1>'
art_url = data['url']
if not art_url.startswith('http'):
art_url = 'https://www.bloomberg.com' + art_url
title = '<h1 title="{}">'.format(art_url) + data['headline'] + '</h1>'
cat = subhead = lede = auth = caption = ''
@ -129,7 +133,7 @@ class Bloomberg(BasicNewsRecipe):
cat = '<p class="cat">' + data['primaryCategory'] + '</p>'
if len(data['abstract']) != 0 and len(data['abstract']) == 2:
subhead = '<div class="subhead"><p>' + data['abstract'][0] + '</p><p>' + data['abstract'][1] + '</p></div>'
subhead = '<div class="subhead"><p>' + data['abstract'][0] + ' </p><p>' + data['abstract'][1] + '</p></div>'
else:
if 'summary' in data:
subhead = '<div class="subhead"><p>' + data['summary'] + '</p></div>'
@ -175,3 +179,9 @@ class Bloomberg(BasicNewsRecipe):
for img in soup.findAll('img', attrs={'src':lambda x: x and x.endswith(('-1x-1.jpg', '-1x-1.png'))}):
img['src'] = img['src'].replace('-1x-1', '750x-1')
return soup
def populate_article_metadata(self, article, soup, first):
    """Fill in article metadata after the HTML has been downloaded.

    The canonical article URL was stashed in the <h1> 'title' attribute
    during preprocessing; the subhead <div> doubles as the summary, and
    the ' - Bloomberg' suffix is stripped from the fetched title.
    """
    article.url = soup.find('h1')['title']
    # Look the subhead up once and reuse it for both summary fields.
    subhead = soup.find('div', attrs={'class': 'subhead'})
    blurb = self.tag_to_string(subhead)
    article.summary = blurb
    article.text_summary = blurb
    article.title = article.title.replace(' - Bloomberg', '')

View File

@ -26,15 +26,13 @@ class BusinessStandard(BasicNewsRecipe):
ignore_duplicate_articles = {'title', 'url'}
remove_empty_feeds = True
resolve_internal_links = True
simultaneous_downloads = 1
max_articles_per_feed = 20
extra_css = '''
img {display:block; margin:0 auto;}
.auth, .cat { font-size:small; color:#202020; }
.cap { font-size:small; text-align:center; }
'''
art_url = ''
art_desc = ''
articles_are_obfuscated = True
@ -47,7 +45,6 @@ class BusinessStandard(BasicNewsRecipe):
]
if any(x in link['href'] for x in skip_sections):
self.abort_article('skipping video links ', link['href'])
self.art_url = link['href']
self.log('Found ', link['href'])
html = br.open(link['href']).read()
pt = PersistentTemporaryFile('.html')
@ -72,9 +69,16 @@ class BusinessStandard(BasicNewsRecipe):
m = root.xpath('//script[@id="__NEXT_DATA__"]')
data = json.loads(m[0].text)
img_url = None
if 'articleImageUrl' in data['props']['pageProps']['articleSchema']:
img_url = data['props']['pageProps']['articleSchema']['articleImageUrl']
art_url = 'https://www.business-standard.com' + data['props']['pageProps']['url']
data = data['props']['pageProps']['data']
title = '<h1>' + data['pageTitle'] + '</h1>'
title = '<h1 title=art_url>' + data['pageTitle'] + '</h1>'
cat = subhead = lede = auth = caption = ''
@ -92,7 +96,10 @@ class BusinessStandard(BasicNewsRecipe):
auth = '<div><p class="auth">' + data['multiple_authors_name'] + ' | ' + data['placeName'] + ' | ' + date + '</p></div>'
if data['featuredImageObj'] and 'url' in data['featuredImageObj']:
lede = '<p class="cap"><img src="{}">'.format(data['featuredImageObj']['url'])
if img_url is not None:
lede = '<p class="cap"><img src="{}">'.format(img_url)
else:
lede = '<p class="cap"><img src="{}">'.format(data['featuredImageObj']['url'])
if 'alt_text' in data['featuredImageObj']:
caption = '<span>' + data['featuredImageObj']['alt_text'] + '</span></p>'
@ -101,7 +108,7 @@ class BusinessStandard(BasicNewsRecipe):
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div><p></p>' + body + '</div></body></html>'
def populate_article_metadata(self, article, soup, first):
    """Fill in article metadata after the HTML has been downloaded.

    Defects fixed: the stale assignments from ``self.art_url`` /
    ``self.art_desc`` were dead stores — they were immediately
    overwritten by the values recovered from the page — and
    ``tag_to_string(soup.find('h3'))`` was computed twice for the two
    summary fields.

    The article URL travels in the <h1> 'title' attribute (written
    during preprocessing); the first <h3> (the subhead) serves as the
    summary, and the ' - Business Standard' suffix is stripped from the
    fetched title.
    """
    article.url = soup.find('h1')['title']
    blurb = self.tag_to_string(soup.find('h3'))
    article.summary = blurb
    article.text_summary = blurb
    article.title = article.title.replace(' - Business Standard', '')