diff --git a/recipes/business_standard.recipe b/recipes/business_standard.recipe index 4e961240f6..48d8c57c51 100644 --- a/recipes/business_standard.recipe +++ b/recipes/business_standard.recipe @@ -17,12 +17,7 @@ class BusinessStandard(BasicNewsRecipe): no_stylesheets = True remove_javascript = True - remove_attributes = ['width', 'height', 'float', 'style'] - - def get_cover_url(self): - soup = self.index_to_soup('https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/') - for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): - return citem['content'] + remove_attributes = ['width', 'height', 'style'] def get_browser(self): return BasicNewsRecipe.get_browser(self, user_agent='common_words/based') @@ -32,6 +27,14 @@ class BusinessStandard(BasicNewsRecipe): resolve_internal_links = True max_articles_per_feed = 50 oldest_article = 1.15 + browser_type = 'webengine' + + extra_css = ''' + img {display:block; margin:0 auto;} + .sub { font-style:italic; color:#202020; } + .auth, .cat { font-size:small; color:#202020; } + .cap { font-size:small; text-align:center; } + ''' recipe_specific_options = { 'days': { @@ -41,18 +44,23 @@ class BusinessStandard(BasicNewsRecipe): } } + def get_cover_url(self): + d = self.recipe_specific_options.get('date') + if not (d and isinstance(d, str)): + soup = self.index_to_soup( + 'https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/' + ) + for citem in soup.findAll( + 'meta', content=lambda s: s and s.endswith('view/3.jpg') + ): + return citem['content'] + def __init__(self, *args, **kwargs): BasicNewsRecipe.__init__(self, *args, **kwargs) d = self.recipe_specific_options.get('days') if d and isinstance(d, str): self.oldest_article = float(d) - extra_css = ''' - img {display:block; margin:0 auto;} - .auth, .cat { font-size:small; color:#202020; } - .cap { font-size:small; text-align:center; } - ''' - # https://www.business-standard.com/rss-feeds/listing feeds = [ ('Top Stories', 'https://www.business-standard.com/rss/home_page_top_stories.rss'), @@ -88,30 +96,69 @@ class BusinessStandard(BasicNewsRecipe): cat = subhead = lede = auth = caption = '' if 'defaultArticleCat' in data and data['defaultArticleCat'] is not None: - if 'h1_tag' in data['defaultArticleCat'] and data['defaultArticleCat']['h1_tag'] is not None: - cat = '

' + data['defaultArticleCat']['h1_tag'] + '

' + if ( + 'h1_tag' in data['defaultArticleCat'] + and data['defaultArticleCat']['h1_tag'] is not None + ): + cat = '
' + data['defaultArticleCat']['h1_tag'] + '
' if 'metaDescription' in data and data['metaDescription'] is not None: - subhead = '

' + data['metaDescription'] + '

' + subhead = '

' + data['metaDescription'] + '

' self.art_desc = data['metaDescription'] - date = (datetime.fromtimestamp(int(data['publishDate']))).strftime('%b %d, %Y | %I:%M %p') + date = (datetime.fromtimestamp(int(data['publishDate']))).strftime( + '%b %d, %Y | %I:%M %p' + ) authors = [] if 'articleMappedMultipleAuthors' in data: for aut in data['articleMappedMultipleAuthors']: authors.append(data['articleMappedMultipleAuthors'][str(aut)]) - auth = '

' + ', '.join(authors) + ' | ' + data['placeName'] + ' | ' + date + '

' + auth = ( + '

' + + ', '.join(authors) + + ' | ' + + data['placeName'] + + ' | ' + + date + + '

' + ) if 'featuredImageObj' in data: if 'url' in data['featuredImageObj']: if img_url is not None: lede = '

'.format(img_url) else: - lede = '

'.format(data['featuredImageObj']['url']) + lede = '

'.format( + data['featuredImageObj']['url'] + ) if 'alt_text' in data['featuredImageObj']: caption = '' + data['featuredImageObj']['alt_text'] + '

' body = data['htmlContent'] - return '' + cat + title + subhead + auth + lede + caption + '

' + body + '
' + return ( + '' + + cat + + title + + subhead + + auth + + lede + + caption + + '

' + + body + + '
' + ) + + def preprocess_html(self, soup): + for img in soup.findAll('img'): + img.attrs = {'src': img.get('src', '')} + for x in soup.findAll('div', 'p'): + x.attrs = {'class': x.get('class', '')} + for attr in self.remove_attributes: + for x in soup.findAll(attrs={attr: True}): + del x[attr] + for br in soup.findAll('small', attrs={'class': 'brtag'}): + br.name = 'br' + br.clear() + return soup diff --git a/recipes/business_standard_print.recipe b/recipes/business_standard_print.recipe index 439284b28d..10fe56a0e2 100644 --- a/recipes/business_standard_print.recipe +++ b/recipes/business_standard_print.recipe @@ -38,15 +38,19 @@ class BusinessStandardPrint(BasicNewsRecipe): recipe_specific_options = { 'date': { 'short': 'The date of the print edition to download (DD-MM-YYYY format)', - 'long': 'For example, 20-09-2023' + 'long': 'For example, 20-09-2023', } } def get_cover_url(self): d = self.recipe_specific_options.get('date') if not (d and isinstance(d, str)): - soup = self.index_to_soup('https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/') - for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): + soup = self.index_to_soup( + 'https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/' + ) + for citem in soup.findAll( + 'meta', content=lambda s: s and s.endswith('view/3.jpg') + ): return citem['content'] def parse_index(self): @@ -81,7 +85,7 @@ class BusinessStandardPrint(BasicNewsRecipe): desc = article['sub_heading'] url = 'https://www.business-standard.com' + article['article_url'] self.log('\t', title, '\n\t', desc, '\n\t\t', url) - articles.append({'title': title, 'description':desc, 'url': url}) + articles.append({'title': title, 'description': desc, 'url': url}) if articles: feeds.append((section, articles)) return feeds @@ -105,33 +109,59 @@ class BusinessStandardPrint(BasicNewsRecipe): cat = subhead = lede = auth = caption = '' if 'defaultArticleCat' in data and data['defaultArticleCat'] is not None: - if 'h1_tag' in data['defaultArticleCat'] and data['defaultArticleCat']['h1_tag'] is not None: + if ( + 'h1_tag' in data['defaultArticleCat'] + and data['defaultArticleCat']['h1_tag'] is not None + ): cat = '
' + data['defaultArticleCat']['h1_tag'] + '
' if 'metaDescription' in data and data['metaDescription'] is not None: subhead = '

' + data['metaDescription'] + '

' self.art_desc = data['metaDescription'] - date = (datetime.fromtimestamp(int(data['publishDate']))).strftime('%b %d, %Y | %I:%M %p') + date = (datetime.fromtimestamp(int(data['publishDate']))).strftime( + '%b %d, %Y | %I:%M %p' + ) authors = [] if 'articleMappedMultipleAuthors' in data: for aut in data['articleMappedMultipleAuthors']: authors.append(data['articleMappedMultipleAuthors'][str(aut)]) - auth = '

' + ', '.join(authors) + ' | ' + data['placeName'] + ' | ' + date + '

' + auth = ( + '

' + + ', '.join(authors) + + ' | ' + + data['placeName'] + + ' | ' + + date + + '

' + ) if 'featuredImageObj' in data: if 'url' in data['featuredImageObj']: if img_url is not None: lede = '

'.format(img_url) else: - lede = '

'.format(data['featuredImageObj']['url']) + lede = '

'.format( + data['featuredImageObj']['url'] + ) if 'alt_text' in data['featuredImageObj']: caption = '' + data['featuredImageObj']['alt_text'] + '

' body = data['htmlContent'] - return '' + cat + title + subhead + auth + lede + caption + '

' + body + '
' + return ( + '' + + cat + + title + + subhead + + auth + + lede + + caption + + '

' + + body + + '
' + ) def preprocess_html(self, soup): for img in soup.findAll('img'): @@ -141,4 +171,7 @@ class BusinessStandardPrint(BasicNewsRecipe): for attr in self.remove_attributes: for x in soup.findAll(attrs={attr: True}): del x[attr] + for br in soup.findAll('small', attrs={'class': 'brtag'}): + br.name = 'br' + br.clear() return soup