#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
from collections import defaultdict
from datetime import date

from calibre.web.feeds.news import BasicNewsRecipe

index = 'https://epaper.hindustantimes.com'


class ht(BasicNewsRecipe):
    title = 'Hindustan Times Print Edition'
    language = 'en_IN'
    __author__ = 'unkn0wn'
    masthead_url = 'https://www.htmedia.in/wp-content/uploads/2020/08/HT-dot-com-logo-product.png'
    description = 'Articles from the Hindustan Times epaper, digital edition'
    encoding = 'utf-8'
    delay = 1

    ignore_duplicate_articles = {'title'}

    extra_css = '''
        .cap { text-align:center; font-size:small; }
        img { display:block; margin:0 auto; }
    '''

    recipe_specific_options = {
        'location': {
            'short': 'The name of the local edition',
            'long': ('If The Hindustan Times is available in your local town/city, '
                     'set this to your location, for example, Delhi\nAvailable Editions: '
                     'Delhi, Mumbai, Chandigarh, Lucknow, Patna, Bengaluru, Pune, Gurgaon, '
                     'Ludhiana, Rajasthan, Amritsar,\nEast UP, Haryana, Jammu, Navi Mumbai, '
                     'Noida, Punjab, Ranchi, Thane, Uttarakhand, West UP'),
            'default': 'Delhi'
        },
        'date': {
            'short': 'The date of the edition to download (DD/MM/YYYY format)',
            'long': 'For example, 22/12/2023'
        }
    }

    def parse_index(self):
        self.log(
            '\n***\nif this recipe fails, report it on: '
            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
        )

        local_edition = 'Delhi'
        d = self.recipe_specific_options.get('location')
        if d and isinstance(d, str):
            local_edition = d

        today = date.today().strftime('%d/%m/%Y')
        p = self.recipe_specific_options.get('date')
        if p and isinstance(p, str):
            today = p

        day, month, year = (int(x) for x in today.split('/'))
        dt = date(year, month, day)
        self.timefmt = ' [%s]' % dt.strftime('%b %d, %Y')

        today = today.replace('/', '%2F')

        # List the editions available for the chosen date.
        get_edition = index + '/Home/GetEditionSupplementHierarchy?EditionDate=' + today
        edi_data = json.loads(self.index_to_soup(get_edition, raw=True))

        cities = []
        for edi in edi_data:
            cities.append(edi['EditionName'])
        self.log(
            '## For your local_edition, modify this recipe to match your city from the names below\n(',
            ', '.join(cities), ')\n'
        )

        for edi in edi_data:
            if edi['EditionName'] == local_edition:
                edi_name = edi['EditionName']
                edi_id = str(edi['EditionId'])

        self.log('Downloading', edi_name, 'Edition', self.timefmt)

        # Fetch every page of the selected edition, then collect the stories on each page.
        url = index + '/Home/GetAllpages?editionid=' + edi_id + '&editiondate=' + today
        main_data = json.loads(self.index_to_soup(url, raw=True))

        feeds_dict = defaultdict(list)

        for page in main_data:
            page_no = page['PageNumber']
            sec_name = page['NewsProPageTitle']
            if sec_name == 'Full Page Ad':
                continue
            if sec_name.startswith('Front'):
                self.cover_url = page['HighResolution']
            art = index + '/Home/getingRectangleObject?pageid=' + str(page['PageId'])
            self.log(sec_name, ' ', page_no)
            art_data = json.loads(self.index_to_soup(art, raw=True))
            for snaps in art_data:
                section = sec_name
                url = str(snaps['OrgId'])
                title = ' '.join(snaps['StoryTitle'].split()[:15])
                if not title:
                    continue
                desc = page_no
                self.log('\t', title, ' ', desc)
                feeds_dict[section].append({"title": title, "description": desc, "url": url})
        return [(section, articles) for section, articles in feeds_dict.items()]

    def preprocess_raw_html(self, raw, *a):
        # Each article arrives as JSON; rebuild it as simple HTML.
        data = json.loads(raw)
        body = ''
        for x in data['StoryContent']:
            if x['Headlines']:
                if len(x['Headlines']) > 0:
                    body += '<h1>' + x['Headlines'][0] + '</h1>'
        for x in data['StoryContent']:
            if x['Body']:
                body += x['Body']
        # if data['filepathstorypic']:  # this gives you a snap image of the article from the page
        #     body += '<img src="{}">'.format(data['filepathstorypic'])
        return '<html><body><div>' + body + '</div></body></html>'
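
# A minimal way to test this recipe from the command line, assuming the calibre
# CLI tools are installed and this file is saved as hindustan_times_print.recipe
# (the filename here is only an example):
#
#   ebook-convert hindustan_times_print.recipe .epub --test -vv
#
# --test fetches just a couple of articles per section, and -vv prints the
# self.log() output above, including the list of available edition names to use
# for the 'location' option.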