This commit is contained in:
Kovid Goyal 2024-07-23 19:23:49 +05:30
commit ab5ff807af
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 51 additions and 38 deletions

View File

@ -1,7 +1,8 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
from datetime import datetime
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse
@ -28,7 +29,22 @@ class BusinessStandard(BasicNewsRecipe):
ignore_duplicate_articles = {'title', 'url'}
remove_empty_feeds = True
resolve_internal_links = True
max_articles_per_feed = 20
max_articles_per_feed = 50
# Age cutoff for downloaded articles, expressed in days (see the 'days'
# option text below).
oldest_article = 1.15
# Declares the 'days' recipe option: 'short' and 'long' are its descriptive
# texts and 'default' is the string form of oldest_article above.
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
    """Initialise the recipe, then apply a 'days' override if one is set.

    All arguments are forwarded unchanged to ``BasicNewsRecipe.__init__``.
    Afterwards the 'days' entry of ``self.recipe_specific_options`` is
    consulted: only a non-empty string is treated as an override, in which
    case it is converted with ``float`` and stored in
    ``self.oldest_article``. Any other value (missing, empty, or not a
    string) leaves ``oldest_article`` untouched.

    Raises:
        ValueError: if the override string is not a valid number.
    """
    BasicNewsRecipe.__init__(self, *args, **kwargs)
    days_override = self.recipe_specific_options.get('days')
    # Nothing to apply unless the option holds a non-empty string.
    if not days_override or not isinstance(days_override, str):
        return
    self.oldest_article = float(days_override)
extra_css = '''
img {display:block; margin:0 auto;}
@ -36,35 +52,21 @@ class BusinessStandard(BasicNewsRecipe):
.cap { font-size:small; text-align:center; }
'''
articles_are_obfuscated = True
def get_obfuscated_article(self, url):
br = self.get_browser()
soup = self.index_to_soup(url)
link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.business-standard.com')})
skip_sections =[ # add sections you want to skip
'/video/', '/videos/', '/multimedia/',
# RSS feeds to fetch, as (section title, feed URL) pairs. Feed listing:
# https://www.business-standard.com/rss-feeds/listing
feeds = [
('Top Stories', 'https://www.business-standard.com/rss/home_page_top_stories.rss'),
('Todays Paper', 'https://www.business-standard.com/rss/todays-paper.rss'),
('Budget', 'https://www.business-standard.com/rss/budget-110.rss'),
('Economy', 'https://www.business-standard.com/rss/economy-102.rss'),
('Opinion', 'https://www.business-standard.com/rss/opinion-105.rss'),
('Companies', 'https://www.business-standard.com/rss/companies-101.rss'),
('Industries', 'https://www.business-standard.com/rss/industry-217.rss'),
('Market', 'https://www.business-standard.com/rss/markets-106.rss'),
# NOTE(review): 'Politics' uses the same URL as 'Budget' (budget-110.rss) —
# looks like a copy-paste slip; confirm the intended politics feed.
('Politics', 'https://www.business-standard.com/rss/budget-110.rss'),
# NOTE(review): 'World' uses the same URL as 'Industries' (industry-217.rss) —
# looks like a copy-paste slip; confirm the intended world-news feed.
('World', 'https://www.business-standard.com/rss/industry-217.rss'),
('Technology', 'https://www.business-standard.com/rss/technology-108.rss'),
('Latest', 'https://www.business-standard.com/rss/latest.rss')
]
if any(x in link['href'] for x in skip_sections):
self.abort_article('skipping video links ', link['href'])
self.log('Found ', link['href'])
html = br.open(link['href']).read()
pt = PersistentTemporaryFile('.html')
pt.write(html)
pt.close()
return pt.name
# Build the feed list from Google News RSS search URLs, one feed per site
# section listed in `sections`.
feeds = []
sections = [
'india-news', 'economy', 'opinion', 'markets', 'companies', 'industry', 'finance', 'world-news',
# 'politics', 'cricket', 'sports', 'technology', 'book', 'education', 'specials'
]
for sec in sections:
# Search URL template; the {} slot receives the section wrapped in '%2F'
# (the URL-encoded form of '/'), so the query targets that section path.
a = 'https://news.google.com/rss/search?q=when:27h+allinurl:business-standard.com{}&hl=en-IN&gl=IN&ceid=IN:en'
# The feed title is the section name with its first letter capitalised.
feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
# feeds.append(('Others', a.format('')))
def preprocess_raw_html(self, raw, *a):
root = parse(raw)
@ -112,9 +114,3 @@ class BusinessStandard(BasicNewsRecipe):
body = data['htmlContent']
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div><p></p>' + body + '</div></body></html>'
def populate_article_metadata(self, article, soup, first):
    """Fill in article metadata from the downloaded article markup.

    Args:
        article: Article object whose ``url``, ``summary``, ``text_summary``
            and ``title`` attributes are updated in place.
        soup: Parsed article document. The ``title`` attribute of its first
            ``<h1>`` supplies the article URL and its first ``<h3>``
            supplies the summary text.
        first: Not used by this implementation.
    """
    # Only replace the URL when the heading actually carries one. A page
    # without an <h1>, or whose <h1> has no title attribute, keeps the URL
    # the article already had instead of raising.
    heading = soup.find('h1')
    if heading is not None:
        heading_url = heading.get('title')
        if heading_url:
            article.url = heading_url

    # Look up and flatten the sub-heading once; both summary fields share it.
    summary = self.tag_to_string(soup.find('h3'))
    article.summary = summary
    article.text_summary = summary

    # Drop the publisher suffix from the title.
    article.title = article.title.replace(' - Business Standard', '')

View File

@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe, classes
@ -29,6 +31,14 @@ class BT(BasicNewsRecipe):
'banner_content'
)
]
# Declares the 'date' recipe option: 'short' and 'long' are its descriptive
# texts. No 'default' is given for this option.
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYY-MM-DD format)',
'long': 'For example, 2024-07-07'
}
}
extra_css = '''
img {display:block; margin:0 auto;}
em { color:#202020; }
@ -43,8 +53,14 @@ class BT(BasicNewsRecipe):
)
soup = self.index_to_soup('https://www.businesstoday.in')
a = soup.findAll('a', attrs={'class':'mag_sld_img'})[1]
self.cover_url = a.img['data-src'].split('?')[0]
url = a['href']
d = self.recipe_specific_options.get('date')
if d and isinstance(d, str):
url = 'https://www.businesstoday.in/magazine/issue/' + d
else:
self.cover_url = a.img['data-src'].split('?')[0]
self.log('issue =', url)
self.timefmt = ' [' + url.split('/')[-1] + ']'
soup = self.index_to_soup(url)

View File

@ -113,7 +113,8 @@ class LiveMint(BasicNewsRecipe):
'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider'
' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn'
)
),
dict(attrs={'class':lambda x: x and x.startswith('storyPage_alsoRead__')})
]
feeds = [