This commit is contained in:
Kovid Goyal 2023-04-28 08:27:38 +05:30
commit e80efaea86
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 40 additions and 38 deletions

View File

@ -1,7 +1,7 @@
import json import json
import re import re
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.web.feeds.news import BasicNewsRecipe, classes
@ -26,9 +26,10 @@ class TheHindu(BasicNewsRecipe):
extra_css = ''' extra_css = '''
.caption {font-size:small; text-align:center;} .caption {font-size:small; text-align:center;}
.author {font-size:small; font-weight:bold;} .author, .dateLine {font-size:small; font-weight:bold;}
.subhead, .subhead_lead {font-weight:bold;} .subhead, .subhead_lead {font-weight:bold;}
img {display:block; margin:0 auto;} img {display:block; margin:0 auto;}
.italic {font-style:italic; color:#202020;}
''' '''
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
@ -52,20 +53,22 @@ class TheHindu(BasicNewsRecipe):
BasicNewsRecipe.__init__(self, *args, **kwargs) BasicNewsRecipe.__init__(self, *args, **kwargs)
if self.output_profile.short_name.startswith('kindle'): if self.output_profile.short_name.startswith('kindle'):
if not past_edition: if not past_edition:
self.title = 'The Hindu ' + datetime.today().strftime('%b %d, %Y') self.title = 'The Hindu ' + date.today().strftime('%b %d, %Y')
def parse_index(self): def parse_index(self):
global local_edition global local_edition
if local_edition or past_edition: if local_edition or past_edition:
if local_edition is None: if local_edition is None:
local_edition = 'th_chennai' local_edition = 'th_chennai'
today = datetime.today().strftime('%Y-%m-%d') today = date.today().strftime('%Y-%m-%d')
if past_edition: if past_edition:
today = past_edition today = past_edition
self.log('Downloading past edition of', local_edition + ' from ' + today) self.log('Downloading past edition of', local_edition + ' from ' + today)
url = absurl('/todays-paper/' + today + '/' + local_edition + '/') url = absurl('/todays-paper/' + today + '/' + local_edition + '/')
else: else:
url = 'https://www.thehindu.com/todays-paper/' url = 'https://www.thehindu.com/todays-paper/'
raw = self.index_to_soup(url, raw=True) raw = self.index_to_soup(url, raw=True)
soup = self.index_to_soup(raw) soup = self.index_to_soup(raw)
ans = self.hindu_parse_index(soup) ans = self.hindu_parse_index(soup)
@ -83,8 +86,8 @@ class TheHindu(BasicNewsRecipe):
if not self.tag_to_string(script).strip().startswith('let grouped_articles = {}'): if not self.tag_to_string(script).strip().startswith('let grouped_articles = {}'):
continue continue
if script is not None: if script is not None:
art = re.search(r'grouped_articles = ({\"[^<]+?]})', self.tag_to_string(script)) art = re.search(r'grouped_articles = ({\".*)', self.tag_to_string(script))
data = json.loads(art.group(1)) data = json.JSONDecoder().raw_decode(art.group(1))[0]
feeds_dict = defaultdict(list) feeds_dict = defaultdict(list)

View File

@ -12,35 +12,26 @@ class IrishIndependent(BasicNewsRecipe):
description = 'Irish and World news from Irelands Bestselling Daily Broadsheet' description = 'Irish and World news from Irelands Bestselling Daily Broadsheet'
__author__ = 'Neil Grogan' __author__ = 'Neil Grogan'
language = 'en_IE' language = 'en_IE'
oldest_article = 7 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_tags_before = dict(id='article')
remove_tags_after = [dict(name='div', attrs={'class': 'toolsBottom'})]
no_stylesheets = True no_stylesheets = True
keep_only_tags = [ keep_only_tags = [
classes('n-content1 n-content2 n-content3'), dict(name='div', attrs={'class':lambda x: x and '_contentwrapper' in x})
] ]
remove_tags_after = classes('quick-subscribe')
remove_tags = [ remove_tags = [
classes('icon1 icon-close c-lightbox1-side c-socials1 social-embed-consent-wall n-split1-side c-footer1'), dict(name='div', attrs={'data-testid':['article-share', 'embed-video']})
dict(attrs={'data-ad-slot': True}),
dict(attrs={'data-lightbox': True}),
dict(name='form'),
dict(attrs={'data-urn': lambda x: x and ':video:' in x}),
] ]
feeds = [ feeds = [
(u'Frontpage News', u'http://www.independent.ie/rss'), ('News', 'http://www.independent.ie/rss'),
(u'World News', u'http://www.independent.ie/world-news/rss'), ('Opinion', 'http://www.independent.ie/opinion/rss'),
(u'Technology', u'http://www.independent.ie/business/technology/rss'), ('Business', 'http://www.independent.ie/business/rss'),
(u'Sport', u'http://www.independent.ie/sport/rss'), ('Sport', 'http://www.independent.ie/sport/rss'),
(u'Entertainment', u'http://www.independent.ie/entertainment/rss'), ('Life', 'http://www.independent.ie/life/rss'),
(u'Independent Woman', u'http://www.independent.ie/lifestyle/independent-woman/rss'), ('Style', 'http://www.independent.ie/style/rss'),
(u'Education', u'http://www.independent.ie/education/rss'), ('Entertainment', 'http://www.independent.ie/business/rss'),
(u'Lifestyle', u'http://www.independent.ie/lifestyle/rss'),
(u'Travel', u'http://www.independent.ie/travel/rss'),
(u'Letters', u'http://www.independent.ie/opinion/letters/rss'),
(u'Weather', u'http://www.independent.ie/weather/rss')
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):

View File

@ -33,11 +33,11 @@ class IrishTimes(BasicNewsRecipe):
temp_files = [] temp_files = []
keep_only_tags = [ keep_only_tags = [
dict(name=['h1', 'h2']), dict(name=['h1', 'h2']),
classes('lead-art-wrapper article-body-wrapper'), classes('lead-art-wrapper article-body-wrapper byline-text'),
] ]
remove_tags = [ remove_tags = [
dict(name='button'), dict(name='button'),
classes('sm-promo-headline'), classes('sm-promo-headline top-table-list-container'),
] ]
remove_attributes = ['width', 'height'] remove_attributes = ['width', 'height']

View File

@ -21,16 +21,17 @@ class LiveMint(BasicNewsRecipe):
remove_empty_feeds = True remove_empty_feeds = True
resolve_internal_links = True resolve_internal_links = True
def get_cover_url(self):
soup = self.index_to_soup(
'https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Mint-Mumbai/Newspaper/'
)
for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
return citem['content']
if is_saturday: if is_saturday:
def get_cover_url(self):
soup = self.index_to_soup('https://lifestyle.livemint.com/')
self.title = 'Mint Lounge'
if citem := soup.find('div', attrs={'class':'headLatestIss_cover'}):
return citem.img['src'].replace('_tn.jpg', '_mr.jpg')
masthead_url = 'https://lifestyle.livemint.com/mintlounge/static-images/lounge-logo.svg'
oldest_article = 6 # days oldest_article = 6.5 # days
extra_css = ''' extra_css = '''
#story-summary-0 {font-style:italic; color:#202020;} #story-summary-0 {font-style:italic; color:#202020;}
@ -63,6 +64,13 @@ class LiveMint(BasicNewsRecipe):
img['src'] = img['data-img'] img['src'] = img['data-img']
return soup return soup
else: else:
def get_cover_url(self):
soup = self.index_to_soup(
'https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Mint-Mumbai/Newspaper/'
)
for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')):
return citem['content']
extra_css = ''' extra_css = '''
img {display:block; margin:0 auto;} img {display:block; margin:0 auto;}

View File

@ -42,7 +42,7 @@ class PsychologyToday(BasicNewsRecipe):
self.cover_url = absurl(a.img['src']) self.cover_url = absurl(a.img['src'])
soup = self.index_to_soup(absurl(a['href'])) soup = self.index_to_soup(absurl(a['href']))
articles = [] articles = []
for article in soup.find('div', role='article').findAll('article'): for article in soup.findAll('div', attrs={'class':'article-text'}):
title = self.tag_to_string(article.find(['h2','h3'])).strip() title = self.tag_to_string(article.find(['h2','h3'])).strip()
url = absurl(article.find(['h2','h3']).a['href']) url = absurl(article.find(['h2','h3']).a['href'])
self.log('\n', title, 'at', url) self.log('\n', title, 'at', url)