This commit is contained in:
Kovid Goyal 2024-10-29 12:15:18 +05:30
commit adc64d7378
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 74 additions and 26 deletions

View File

@@ -99,16 +99,22 @@ class IndianExpress(BasicNewsRecipe):
def articles_from_soup(self, soup):
ans = []
div = soup.find('div', attrs={'class':['nation', 'o-opin']})
for art in div.findAll(attrs={'class':['articles', 'o-opin-article']}):
div = soup.find('div', attrs={'class': ['nation', 'o-opin', 'myie-nation']})
for art in div.findAll(
attrs={'class': ['articles', 'o-opin-article', 'myie-articles']}
):
for a in art.findAll('a', href=True):
if not a.find('img') and not ('/profile/' in a['href'] or '/agency/' in a['href']):
if not a.find('img') and not any(
x in a['href'] for x in ['/profile/', '/agency/', '/section/']
):
url = a['href']
title = self.tag_to_string(a)
desc = ''
if p:= art.find('p'):
if p := art.find('p'):
desc = self.tag_to_string(p)
if da := art.find('div', attrs={'class':['date', 'o-opin-date']}):
if da := art.find(
'div', attrs={'class': ['date', 'o-opin-date', 'my-time']}
):
date = parse_date(self.tag_to_string(da)).replace(tzinfo=None)
today = datetime.now()
if (today - date) > timedelta(self.oldest_article):

View File

@@ -4,7 +4,6 @@ import json
import time
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
@@ -12,6 +11,7 @@ def p_dt(x):
dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
return dt.strftime('%b %d, %Y, %I:%M %p')
class Reuters(BasicNewsRecipe):
title = 'Reuters'
__author__ = 'unkn0wn'
@@ -20,28 +20,35 @@ class Reuters(BasicNewsRecipe):
'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
'news to professionals via desktop terminals, the worlds media organizations, industry events and directly to consumers.'
)
masthead_url = 'https://www.reutersagency.com/wp-content/uploads/2024/06/reuters-logo.png'
cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
masthead_url = (
'https://upload.wikimedia.org/wikipedia/commons/9/9e/Reuters_logo_2024.svg'
)
cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
language = 'en'
encoding = 'utf-8'
oldest_article = 1.2 # days
oldest_article = 1.2 # days
no_javascript = True
no_stylesheets = True
remove_attributes = ['style', 'height', 'width']
resolve_internal_links = True
ignore_duplicate_articles = {'url', 'title'}
extra_css = '''
extra_css = """
.label, .auth { font-size:small; color:#202020; }
.figc { font-size:small; }
img {display:block; margin:0 auto;}
'''
"""
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
'default': str(oldest_article),
},
'res': {
'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
'long': 'This is useful for non e-ink devices',
'default': '480'
}
}
@@ -54,11 +61,22 @@ class Reuters(BasicNewsRecipe):
def parse_index(self):
index = 'https://www.reuters.com'
today = datetime.now()
feed_api = index + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
feed_api = (
index
+ '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
)
path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
sections = [
'world', 'business', 'markets','sustainability', 'legal',
'breakingviews', 'technology', 'sports', 'science', 'lifestyle'
'world',
'business',
'markets',
'sustainability',
'legal',
'breakingviews',
'technology',
# 'sports',
'science',
# 'lifestyle',
]
feeds = []
@@ -69,7 +87,9 @@ class Reuters(BasicNewsRecipe):
articles = []
data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))['wireitems']
data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))[
'wireitems'
]
for x in data:
if x.get('wireitem_type', '') == 'story':
@@ -77,7 +97,9 @@ class Reuters(BasicNewsRecipe):
if y.get('type', '') == 'story':
title = y['story']['hed']
date = datetime.fromisoformat(y['story']['updated_at'][:-1]) + timedelta(seconds=time.timezone)
date = datetime.fromisoformat(
y['story']['updated_at'][:-1]
) + timedelta(seconds=time.timezone)
if (today - date) > timedelta(self.oldest_article):
continue
@@ -86,12 +108,18 @@ class Reuters(BasicNewsRecipe):
if path.get('type', '') == 'article':
url = path_api.format(path['api_path_native'])
self.log(' ', title, '\n\t', desc)
articles.append({'title': title, 'description':desc, 'url': url})
articles.append(
{'title': title, 'description': desc, 'url': url}
)
if articles:
feeds.append((section, articles))
return feeds
def preprocess_raw_html(self, raw, url):
res = '&width=480'
w = self.recipe_specific_options.get('res')
if w and isinstance(w, str):
res = '&width=' + w
js = json.loads(raw)
data = js['wireitems']
body = ''
@@ -103,19 +131,30 @@ class Reuters(BasicNewsRecipe):
break
for y in x['templates']:
if 'title' in y['cid']:
body += '<h1 title="{}">'.format(js['share_url']) + y['content'] + '</h1>'
body += (
'<h1 title="{}">'.format(js['share_url'])
+ y['content']
+ '</h1>'
)
break
for y in x['templates']:
if 'author' in y['cid']:
body += '<p>'
auths = [x for x in y.get('authors_names', [])]
if auths:
body += '<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
body += (
'<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
)
break
for y in x['templates']:
if 'datetime' in y['cid']:
body += '<div class="auth">' + str(y['read_minutes']) \
+ ' minute read | ' + p_dt(y['display_time']) + '</div>'
body += (
'<div class="auth">'
+ str(y['read_minutes'])
+ ' minute read | '
+ p_dt(y['display_time'])
+ '</div>'
)
body += '</p>'
break
for y in x['templates']:
@@ -126,7 +165,8 @@ class Reuters(BasicNewsRecipe):
if 'image' in y['cid']:
if 'renditions' in y['image']:
body += '<img src="{}"><div class="figc">{}</div>'.format(
y['image']['url'].split('&')[0] + '&width=480', y['image']['caption']
y['image']['url'].split('&')[0] + res,
y['image']['caption'],
)
else:
body += '<img src="{}"><div class="figc">{}</div>'.format(
@@ -136,7 +176,8 @@ class Reuters(BasicNewsRecipe):
for imgs in y['images']:
if 'renditions' in imgs:
body += '<img src="{}"><div class="figc">{}</div>'.format(
imgs['url'].split('&')[0] + '&width=480', imgs['caption']
imgs['url'].split('&')[0] + res,
imgs['caption'],
)
else:
body += '<img src="{}"><div class="figc">{}</div>'.format(
@@ -144,9 +185,10 @@ class Reuters(BasicNewsRecipe):
)
if 'video' in y['cid']:
body += '<img src="{}"><div class="figc">{}</div>'.format(
y['video']['thumbnail']['url'], y['video']['thumbnail']['caption']
y['video']['thumbnail']['url'],
y['video']['thumbnail']['caption'],
)
return BeautifulSoup('<html><body><div>' + body + '</div></body></html>').prettify()
return '<html><body><div>' + body + '</div></body></html>'
def populate_article_metadata(self, article, soup, first):
article.url = soup.find('h1')['title']