import json
import re
from collections import defaultdict
from datetime import date

from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url):
    if url.startswith('/'):
        url = 'https://www.thehindu.com' + url
    return url


# Chennai is the default edition; for other editions use 'th_hyderabad',
# 'th_bangalore', 'th_delhi', 'th_kolkata' etc.
local_edition = None

# For past editions, set the date to, for example, '2023-01-28'.
past_edition = None

is_monday = date.today().weekday() == 0
is_friday = date.today().weekday() == 4
is_saturday = date.today().weekday() == 5
is_sunday = date.today().weekday() == 6

if past_edition:
    year, month, day = (int(x) for x in past_edition.split('-'))
    dt = date(year, month, day)
    # Recompute all weekday flags for the requested date, including Friday,
    # so supplement URLs are not built from today's weekday.
    is_monday = dt.weekday() == 0
    is_friday = dt.weekday() == 4
    is_saturday = dt.weekday() == 5
    is_sunday = dt.weekday() == 6


class TheHindu(BasicNewsRecipe):
    title = 'The Hindu'
    __author__ = 'unkn0wn'
    description = "Articles from The Hindu, Today's Paper."
    language = 'en_IN'
    no_stylesheets = True
    masthead_url = 'https://www.thehindu.com/theme/images/th-online/thehindu-logo.svg'
    remove_attributes = ['style', 'height', 'width']
    extra_css = '''
        .caption {font-size:small; text-align:center;}
        .author, .dateLine {font-size:small; font-weight:bold;}
        .subhead, .subhead_lead, .bold {font-weight:bold;}
        img {display:block; margin:0 auto;}
        .italic {font-style:italic; color:#202020;}
    '''

    ignore_duplicate_articles = {'url'}

    keep_only_tags = [
        classes('article-section')
    ]

    remove_tags = [
        classes('hide-mobile comments-shares share-page editiondetails')
    ]

    def preprocess_html(self, soup):
        # Turn caption paragraphs into proper figcaption elements.
        for cap in soup.findAll('p', attrs={'class': 'caption'}):
            cap.name = 'figcaption'
        # Lazy-loaded images keep the real URL in data-original.
        for img in soup.findAll('img', attrs={'data-original': True}):
            img['src'] = img['data-original']
        return soup

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            # Kindle groups periodicals by title, so stamp the date into it.
            if not past_edition:
                self.title = 'The Hindu ' + date.today().strftime('%b %d, %Y')
            else:
                self.title = 'The Hindu ' + dt.strftime('%b %d, %Y')

    def parse_index(self):
        mag_url = None
        global local_edition
        if local_edition or past_edition:
            if local_edition is None:
                local_edition = 'th_chennai'
            today = date.today().strftime('%Y-%m-%d')
            if past_edition:
                today = past_edition
                self.log('Downloading past edition of', local_edition + ' from ' + today)
            url = absurl('/todays-paper/' + today + '/' + local_edition + '/')
            if is_monday:
                mag_url = url + '?supplement=' + local_edition + '-epbs'
            if is_friday:
                mag_url = url + '?supplement=' + local_edition + '-fr'
            if is_saturday:
                mag_url = url + '?supplement=' + local_edition + '-mp'
            if is_sunday:
                mag_url = url + '?supplement=' + local_edition + '-sm'
        else:
            url = 'https://www.thehindu.com/todays-paper/'
            if is_monday:
                mag_url = url + '?supplement=th_chennai-epbs'
            if is_friday:
                mag_url = url + '?supplement=th_chennai-fr'
            if is_saturday:
                mag_url = url + '?supplement=th_chennai-mp'
            if is_sunday:
                mag_url = url + '?supplement=th_chennai-sm'

        raw = self.index_to_soup(url, raw=True)
        soup = self.index_to_soup(raw)
        ans = self.hindu_parse_index(soup)
        cover = soup.find(attrs={'class': 'hindu-ad'})
        if cover:
            self.cover_url = cover.img['src']
        if not ans:
            raise ValueError('The Hindu newspaper is not published today.')
        if mag_url:
            self.log('\nFetching Magazine')
            soup = self.index_to_soup(mag_url)
            ans2 = self.hindu_parse_index(soup)
            if ans2:
                return ans + ans2
            self.log('\tMagazine not Found')
        return ans

    def hindu_parse_index(self, soup):
        # The index page embeds its article list as a JS object literal,
        # grouped_articles = {"TH_<SECTION>": [{"articleheadline": ...,
        # "href": ..., "pageno": ..., "teaser_text": ...}, ...], ...};
        # find the <script> tag that carries it and decode the JSON.
        for script in soup.findAll('script'):
            text = self.tag_to_string(script)
            if 'grouped_articles = {"' not in text:
                continue
            art = re.search(r'grouped_articles = ({\".*)', text)
            data = json.JSONDecoder().raw_decode(art.group(1))[0]
            feeds_dict = defaultdict(list)
            for sec in data:
                section = sec.replace('TH_', '')
                for item in data[sec]:
                    title = item['articleheadline']
                    url = absurl(item['href'])
                    # Parenthesise the fallback: without the parentheses,
                    # `a + b or ''` parses as `(a + b) or ''`, so an empty
                    # teaser never actually falls back to ''.
                    desc = 'Page no.' + item['pageno'] + ' | ' + (item['teaser_text'] or '')
                    self.log('\t', title, '\n\t\t', url)
                    feeds_dict[section].append({'title': title, 'url': url, 'description': desc})
            return list(feeds_dict.items())
        return []
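
# A quick way to dry-run this recipe outside the calibre GUI is calibre's
# ebook-convert CLI (a sketch, assuming calibre's command-line tools are on
# PATH and this file is saved as TheHindu.recipe; the output filename is
# arbitrary):
#
#   ebook-convert TheHindu.recipe TheHindu.epub --test -vv
#
# --test downloads only a couple of articles from the first few feeds, which
# is enough to verify parse_index() and preprocess_html() without fetching
# the whole paper.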