This commit is contained in:
Kovid Goyal 2023-11-08 11:56:41 +05:30
commit 3eb7f21fde
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 27 additions and 23 deletions

View File

@@ -79,11 +79,10 @@ class TheEconomicTimes(BasicNewsRecipe):
for h3 in section.findAll(("h1", "h3", "h4", "h5")):
span = h3.find(
'span',
href=lambda x: x and x.startswith('/epaper/'),
href=lambda x: x and x.startswith('https://economictimes.indiatimes.com/epaper/'),
attrs={'class': 'banner'}
)
url = span['href']
url = 'https://economictimes.indiatimes.com' + url
title = self.tag_to_string(span)
div = h3.find_next_sibling('div', attrs={'class': 'dsc'})
if div is not None:

View File

@@ -1,6 +1,7 @@
from calibre.web.feeds.news import BasicNewsRecipe
import json
from datetime import date
from collections import defaultdict
# default edition is Delhi i.e., 'cap'
@@ -54,33 +55,32 @@ class toiprint(BasicNewsRecipe):
url = index + '/DayIndex/' + date_ + '_' + le + '.json'
raw = self.index_to_soup(url, raw=True)
data = json.loads(raw)
if 'DigitalIndex' not in data:
if 'DayIndex' not in data:
raise ValueError(
'The Times of India Newspaper is not published today.'
)
data = data['DigitalIndex']
feeds = []
data = data['DayIndex']
feeds_dict = defaultdict(list)
for link in data:
sec_name = link['PageTitle']
if sec_name == 'Advertisement':
continue
self.log(sec_name)
articles = []
if 'Views' in link:
for sec in link['Views']:
if 'Articles' in sec:
for art in sec['Articles']:
if 'ArticleName' not in art:
continue
url = art['ArticleName']
title = art.get('ArticleTitle', 'unknown').replace('<br>', '')
if art.get('ColumnTitle', '') == '':
desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ArticleBody', '')
else:
desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ColumnTitle', '')
self.log('\t', title, '\n\t', desc, '\n\t\t', url)
articles.append({'title': title, 'description':desc, 'url': url})
if articles:
feeds.append((sec_name, articles))
return feeds
if 'Articles' in link:
for art in link['Articles']:
section = sec_name
if 'ArticleName' not in art:
continue
url = art['ArticleName']
title = art.get('ArticleTitle', 'unknown').replace('<br>', '')
if art.get('ColumnTitle', '') == '':
desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ArticleBody', '')
else:
desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ColumnTitle', '')
self.log('\t', title, '\n\t', desc.replace('\n', ''))
feeds_dict[section].append({"title": title, "url": url, "description": desc})
return [(section, articles) for section, articles in feeds_dict.items()]
def preprocess_raw_html(self, raw, *a):
data = json.loads(raw)
@@ -107,8 +107,11 @@ class toiprint(BasicNewsRecipe):
elif 'ZoneText' in x:
body += '<p><i>' + x['ZoneText'] + '</i></p>'
return '<html><body><div>' \
+ body.replace('<br>', '<p>').replace('<br/>', '<p>').replace('&lt;br&gt;', '<p>').replace('\n', '<div>') \
+ body.replace('<br>', '<p>').replace('<br/>', '<p>').replace('&lt;br&gt;', '<p>').replace('\n', '<br>') \
+ '</div></body></html>'
def print_version(self, url):
    """Map an article name to its per-article JSON endpoint on the e-paper site.

    The page number is the third-from-last underscore-separated token of the
    article name; ``index`` is the module-level base URL of the edition.
    """
    page_number = url.split('_')[-3]
    return '/'.join([index, 'ArticleZoneJson', page_number, url]) + '.json'
def populate_article_metadata(self, article, soup, first):
    """Mask the article URL so the internal JSON endpoint is not shown
    in the generated e-book output."""
    placeholder = '***'
    article.url = placeholder

View File

@@ -35,6 +35,7 @@ class WSJ(BasicNewsRecipe):
__author__ = 'Kovid Goyal'
description = 'News and current affairs'
language = 'en'
masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'
compress_news_images = True
compress_news_images_auto_size = 7

View File

@@ -35,6 +35,7 @@ class WSJ(BasicNewsRecipe):
__author__ = 'Kovid Goyal'
description = 'News and current affairs'
language = 'en'
masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'
compress_news_images = True
compress_news_images_auto_size = 7