From 07db492808c5fba756271fdcfe203750d27d312b Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sat, 10 May 2025 14:21:38 +0530
Subject: [PATCH 1/2] Update business_standard_print.recipe
---
recipes/business_standard_print.recipe | 51 +++++++++++++++++++++-----
1 file changed, 42 insertions(+), 9 deletions(-)
diff --git a/recipes/business_standard_print.recipe b/recipes/business_standard_print.recipe
index 439284b28d..10fe56a0e2 100644
--- a/recipes/business_standard_print.recipe
+++ b/recipes/business_standard_print.recipe
@@ -38,15 +38,19 @@ class BusinessStandardPrint(BasicNewsRecipe):
recipe_specific_options = {
'date': {
'short': 'The date of the print edition to download (DD-MM-YYYY format)',
- 'long': 'For example, 20-09-2023'
+ 'long': 'For example, 20-09-2023',
}
}
def get_cover_url(self):
d = self.recipe_specific_options.get('date')
if not (d and isinstance(d, str)):
- soup = self.index_to_soup('https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/')
- for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
+ soup = self.index_to_soup(
+ 'https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/'
+ )
+ for citem in soup.findAll(
+ 'meta', content=lambda s: s and s.endswith('view/3.jpg')
+ ):
return citem['content']
def parse_index(self):
@@ -81,7 +85,7 @@ class BusinessStandardPrint(BasicNewsRecipe):
desc = article['sub_heading']
url = 'https://www.business-standard.com' + article['article_url']
self.log('\t', title, '\n\t', desc, '\n\t\t', url)
- articles.append({'title': title, 'description':desc, 'url': url})
+ articles.append({'title': title, 'description': desc, 'url': url})
if articles:
feeds.append((section, articles))
return feeds
@@ -105,33 +109,59 @@ class BusinessStandardPrint(BasicNewsRecipe):
cat = subhead = lede = auth = caption = ''
if 'defaultArticleCat' in data and data['defaultArticleCat'] is not None:
- if 'h1_tag' in data['defaultArticleCat'] and data['defaultArticleCat']['h1_tag'] is not None:
+ if (
+ 'h1_tag' in data['defaultArticleCat']
+ and data['defaultArticleCat']['h1_tag'] is not None
+ ):
                cat = '<div class="cat">' + data['defaultArticleCat']['h1_tag'] + '</div>'
if 'metaDescription' in data and data['metaDescription'] is not None:
            subhead = '<h3 class="sub">' + data['metaDescription'] + '</h3>'
self.art_desc = data['metaDescription']
- date = (datetime.fromtimestamp(int(data['publishDate']))).strftime('%b %d, %Y | %I:%M %p')
+ date = (datetime.fromtimestamp(int(data['publishDate']))).strftime(
+ '%b %d, %Y | %I:%M %p'
+ )
authors = []
if 'articleMappedMultipleAuthors' in data:
for aut in data['articleMappedMultipleAuthors']:
authors.append(data['articleMappedMultipleAuthors'][str(aut)])
-        auth = '<p class="auth">' + ', '.join(authors) + ' | ' + data['placeName'] + ' | ' + date + '</p>'
+        auth = (
+            '<p class="auth">'
+            + ', '.join(authors)
+            + ' | '
+            + data['placeName']
+            + ' | '
+            + date
+            + '</p>'
+        )
if 'featuredImageObj' in data:
if 'url' in data['featuredImageObj']:
if img_url is not None:
                    lede = '<p class="cap"><img src="{}">'.format(img_url)
                else:
-                    lede = '<p class="cap"><img src="{}">'.format(data['featuredImageObj']['url'])
+                    lede = '<p class="cap"><img src="{}">'.format(
+                        data['featuredImageObj']['url']
+                    )
if 'alt_text' in data['featuredImageObj']:
                caption = '<span>' + data['featuredImageObj']['alt_text'] + '</span></p>'
body = data['htmlContent']
-        return '<html><body>' + cat + title + subhead + auth + lede + caption + '<br>' + body + '</body></html>'
+        return (
+            '<html><body>'
+            + cat
+            + title
+            + subhead
+            + auth
+            + lede
+            + caption
+            + '<br>'
+            + body
+            + '</body></html>'
+        )
def preprocess_html(self, soup):
for img in soup.findAll('img'):
@@ -141,4 +171,7 @@ class BusinessStandardPrint(BasicNewsRecipe):
for attr in self.remove_attributes:
for x in soup.findAll(attrs={attr: True}):
del x[attr]
+ for br in soup.findAll('small', attrs={'class': 'brtag'}):
+ br.name = 'br'
+ br.clear()
return soup
From 81eb5e951bc0bfb220863a2d6f598e5782b3e02f Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sat, 10 May 2025 14:22:19 +0530
Subject: [PATCH 2/2] Update business_standard.recipe
---
recipes/business_standard.recipe | 85 +++++++++++++++++++++++++-------
1 file changed, 66 insertions(+), 19 deletions(-)
diff --git a/recipes/business_standard.recipe b/recipes/business_standard.recipe
index 4e961240f6..48d8c57c51 100644
--- a/recipes/business_standard.recipe
+++ b/recipes/business_standard.recipe
@@ -17,12 +17,7 @@ class BusinessStandard(BasicNewsRecipe):
no_stylesheets = True
remove_javascript = True
- remove_attributes = ['width', 'height', 'float', 'style']
-
- def get_cover_url(self):
- soup = self.index_to_soup('https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/')
- for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
- return citem['content']
+ remove_attributes = ['width', 'height', 'style']
def get_browser(self):
return BasicNewsRecipe.get_browser(self, user_agent='common_words/based')
@@ -32,6 +27,14 @@ class BusinessStandard(BasicNewsRecipe):
resolve_internal_links = True
max_articles_per_feed = 50
oldest_article = 1.15
+ browser_type = 'webengine'
+
+ extra_css = '''
+ img {display:block; margin:0 auto;}
+ .sub { font-style:italic; color:#202020; }
+ .auth, .cat { font-size:small; color:#202020; }
+ .cap { font-size:small; text-align:center; }
+ '''
recipe_specific_options = {
'days': {
@@ -41,18 +44,23 @@ class BusinessStandard(BasicNewsRecipe):
}
}
+ def get_cover_url(self):
+ d = self.recipe_specific_options.get('date')
+ if not (d and isinstance(d, str)):
+ soup = self.index_to_soup(
+ 'https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/'
+ )
+ for citem in soup.findAll(
+ 'meta', content=lambda s: s and s.endswith('view/3.jpg')
+ ):
+ return citem['content']
+
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
d = self.recipe_specific_options.get('days')
if d and isinstance(d, str):
self.oldest_article = float(d)
- extra_css = '''
- img {display:block; margin:0 auto;}
- .auth, .cat { font-size:small; color:#202020; }
- .cap { font-size:small; text-align:center; }
- '''
-
# https://www.business-standard.com/rss-feeds/listing
feeds = [
('Top Stories', 'https://www.business-standard.com/rss/home_page_top_stories.rss'),
@@ -88,30 +96,69 @@ class BusinessStandard(BasicNewsRecipe):
cat = subhead = lede = auth = caption = ''
if 'defaultArticleCat' in data and data['defaultArticleCat'] is not None:
- if 'h1_tag' in data['defaultArticleCat'] and data['defaultArticleCat']['h1_tag'] is not None:
- cat = '' + data['defaultArticleCat']['h1_tag'] + '
'
+ if (
+ 'h1_tag' in data['defaultArticleCat']
+ and data['defaultArticleCat']['h1_tag'] is not None
+ ):
+ cat = '' + data['defaultArticleCat']['h1_tag'] + '
'
if 'metaDescription' in data and data['metaDescription'] is not None:
-            subhead = '<h3>' + data['metaDescription'] + '</h3>'
+            subhead = '<h3 class="sub">' + data['metaDescription'] + '</h3>'
self.art_desc = data['metaDescription']
- date = (datetime.fromtimestamp(int(data['publishDate']))).strftime('%b %d, %Y | %I:%M %p')
+ date = (datetime.fromtimestamp(int(data['publishDate']))).strftime(
+ '%b %d, %Y | %I:%M %p'
+ )
authors = []
if 'articleMappedMultipleAuthors' in data:
for aut in data['articleMappedMultipleAuthors']:
authors.append(data['articleMappedMultipleAuthors'][str(aut)])
-        auth = '<p class="auth">' + ', '.join(authors) + ' | ' + data['placeName'] + ' | ' + date + '</p>'
+        auth = (
+            '<p class="auth">'
+            + ', '.join(authors)
+            + ' | '
+            + data['placeName']
+            + ' | '
+            + date
+            + '</p>'
+        )
if 'featuredImageObj' in data:
if 'url' in data['featuredImageObj']:
if img_url is not None:
                    lede = '<p class="cap"><img src="{}">'.format(img_url)
                else:
-                    lede = '<p class="cap"><img src="{}">'.format(data['featuredImageObj']['url'])
+                    lede = '<p class="cap"><img src="{}">'.format(
+                        data['featuredImageObj']['url']
+                    )
if 'alt_text' in data['featuredImageObj']:
                caption = '<span>' + data['featuredImageObj']['alt_text'] + '</span></p>'
body = data['htmlContent']
-        return '<html><body>' + cat + title + subhead + auth + lede + caption + '</body></html>'
+        return (
+            '<html><body>'
+            + cat
+            + title
+            + subhead
+            + auth
+            + lede
+            + caption
+            + '<br>'
+            + body
+            + '</body></html>'
+        )
+
+ def preprocess_html(self, soup):
+ for img in soup.findAll('img'):
+ img.attrs = {'src': img.get('src', '')}
+ for x in soup.findAll('div', 'p'):
+ x.attrs = {'class': x.get('class', '')}
+ for attr in self.remove_attributes:
+ for x in soup.findAll(attrs={attr: True}):
+ del x[attr]
+ for br in soup.findAll('small', attrs={'class': 'brtag'}):
+ br.name = 'br'
+ br.clear()
+ return soup