mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update reuters.recipe
This commit is contained in:
parent
cdc5810486
commit
b06be72a99
@ -1,114 +1,134 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# vim:fileencoding=utf-8
|
|
||||||
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
country = 'us'
|
def p_dt(x):
|
||||||
country_defs = {
|
dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
|
||||||
'us': ('www.reuters.com', {
|
return dt.strftime('%b %d, %Y, %I:%M %p')
|
||||||
'World': 'world',
|
|
||||||
'Business': 'business',
|
|
||||||
'Markets': 'markets',
|
|
||||||
'Tech': 'technology',
|
|
||||||
# 'Sports': 'lifestyle/sports',
|
|
||||||
'Wealth': 'markets/wealth',
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def prefixed_classes(classes):
|
|
||||||
q = frozenset(classes.split(' '))
|
|
||||||
|
|
||||||
def matcher(x):
|
|
||||||
if x:
|
|
||||||
for candidate in frozenset(x.split()):
|
|
||||||
for x in q:
|
|
||||||
if candidate.startswith(x):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
return {'attrs': {'class': matcher}}
|
|
||||||
|
|
||||||
|
|
||||||
def extract_article_list(raw):
|
|
||||||
if isinstance(raw, bytes):
|
|
||||||
raw = raw.decode('utf-8')
|
|
||||||
# open('/t/raw.html', 'w').write(raw)
|
|
||||||
idx = raw.index(';Fusion.globalContent={')
|
|
||||||
d = raw[idx:]
|
|
||||||
d = d[d.index('{'):]
|
|
||||||
data = json.JSONDecoder().raw_decode(d)[0]
|
|
||||||
# from pprint import pformat
|
|
||||||
# print(pformat(data), file=open('/t/raw.py', 'w'))
|
|
||||||
k = 'arcResult' if 'arcResult' in data else 'result'
|
|
||||||
for article in data[k]['articles']:
|
|
||||||
yield {'title': article['title'], 'description': article['description'], 'url': article['canonical_url']}
|
|
||||||
|
|
||||||
|
|
||||||
# if __name__ == '__main__':
|
|
||||||
# print(list(extract_article_list(open('/t/World News _ Latest Top Stories _ Reuters.html').read())))
|
|
||||||
|
|
||||||
|
|
||||||
class Reuters(BasicNewsRecipe):
|
class Reuters(BasicNewsRecipe):
|
||||||
title = 'Reuters'
|
title = 'Reuters'
|
||||||
description = 'News from all over'
|
__author__ = 'unkn0wn'
|
||||||
__author__ = 'Kovid Goyal'
|
description = (
|
||||||
|
'Reuters, the news and media division of Thomson Reuters, is the world’s largest multimedia news provider, '
|
||||||
|
'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
|
||||||
|
'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
|
||||||
|
)
|
||||||
|
masthead_url = 'https://www.reutersprofessional.com/wp-content/uploads/2024/03/primary-logo.svg'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
oldest_article = 2 # days
|
||||||
keep_only_tags = [
|
no_javascript = True
|
||||||
prefixed_classes('article-body__container__ article-header__container__'),
|
no_stylesheets = True
|
||||||
]
|
|
||||||
remove_tags = [
|
|
||||||
prefixed_classes(
|
|
||||||
'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__'
|
|
||||||
' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
|
|
||||||
' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__'
|
|
||||||
),
|
|
||||||
dict(name=['button', 'link', 'svg']),
|
|
||||||
]
|
|
||||||
remove_attributes = ['style', 'height', 'width']
|
remove_attributes = ['style', 'height', 'width']
|
||||||
|
resolve_internal_links = True
|
||||||
|
ignore_duplicate_articles = {'url', 'title'}
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
img { max-width: 100%; }
|
.label, .auth { font-size:small; color:#202020; }
|
||||||
[class^="article-header__tags__"],
|
.figc { font-size:small; text-align:center; }
|
||||||
[class^="author-bio__author-card__"],
|
img {display:block; margin:0 auto;}
|
||||||
[class^="article-header__author-date__"] {
|
|
||||||
font-size:small;
|
|
||||||
}
|
|
||||||
[data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; }
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
base, sections = country_defs[country]
|
index = 'https://www.reuters.com'
|
||||||
ans = []
|
today = datetime.now()
|
||||||
|
feed_api = index + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
|
||||||
|
path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
|
||||||
|
sections = [
|
||||||
|
'world', 'business', 'markets','sustainability', 'legal',
|
||||||
|
'breakingviews', 'technology', 'sports', 'science', 'lifestyle'
|
||||||
|
]
|
||||||
|
|
||||||
for section_title in sections:
|
feeds = []
|
||||||
slug = sections[section_title]
|
|
||||||
self.log(section_title)
|
for sec in sections:
|
||||||
articles = list(self.parse_reuters_section(base, slug))
|
section = sec.capitalize()
|
||||||
|
self.log(section)
|
||||||
|
|
||||||
|
articles = []
|
||||||
|
|
||||||
|
data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))['wireitems']
|
||||||
|
|
||||||
|
for x in data:
|
||||||
|
if x.get('wireitem_type', '') == 'story':
|
||||||
|
for y in x['templates']:
|
||||||
|
if y.get('type', '') == 'story':
|
||||||
|
title = y['story']['hed']
|
||||||
|
|
||||||
|
date = datetime.fromisoformat(y['story']['updated_at'][:-1]) + timedelta(seconds=time.timezone)
|
||||||
|
if (today - date) > timedelta(self.oldest_article):
|
||||||
|
continue
|
||||||
|
|
||||||
|
desc = y['story']['lede']
|
||||||
|
path = y['template_action']
|
||||||
|
if path.get('type', '') == 'article':
|
||||||
|
url = path_api.format(path['api_path_native'])
|
||||||
|
self.log(' ', title, '\n\t', desc)
|
||||||
|
articles.append({'title': title, 'description':desc, 'url': url})
|
||||||
if articles:
|
if articles:
|
||||||
ans.append((section_title, articles))
|
feeds.append((section, articles))
|
||||||
if self.test and len(ans) >= self.test[0]:
|
return feeds
|
||||||
break
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def parse_reuters_section(self, base, slug):
|
def preprocess_raw_html(self, raw, url):
|
||||||
url = 'https://' + base + '/' + slug
|
js = json.loads(raw)
|
||||||
raw = self.index_to_soup(url, raw=True)
|
data = js['wireitems']
|
||||||
for article in extract_article_list(raw):
|
body = ''
|
||||||
article['url'] = 'https://{}{}'.format(base, article['url'])
|
for x in data:
|
||||||
yield article
|
if x.get('wireitem_type', '') == 'story':
|
||||||
self.log('\t', article['title'], article['url'])
|
for y in x['templates']:
|
||||||
|
if 'label' in y['cid']:
|
||||||
|
body += '<div class="label">' + y['title'] + '</div>'
|
||||||
|
break
|
||||||
|
for y in x['templates']:
|
||||||
|
if 'title' in y['cid']:
|
||||||
|
body += '<h1 title="{}">'.format(js['share_url']) + y['content'] + '</h1>'
|
||||||
|
break
|
||||||
|
for y in x['templates']:
|
||||||
|
if 'author' in y['cid']:
|
||||||
|
body += '<p>'
|
||||||
|
auths = [x for x in y.get('authors_names', [])]
|
||||||
|
if auths:
|
||||||
|
body += '<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
|
||||||
|
break
|
||||||
|
for y in x['templates']:
|
||||||
|
if 'datetime' in y['cid']:
|
||||||
|
body += '<div class="auth">' + str(y['read_minutes']) \
|
||||||
|
+ ' minute read | ' + p_dt(y['display_time']) + '</div>'
|
||||||
|
body += '</p>'
|
||||||
|
break
|
||||||
|
for y in x['templates']:
|
||||||
|
if 'paragraph' in y['cid']:
|
||||||
|
body += '<p>' + y['content'] + '</p>'
|
||||||
|
if 'header' in y['cid']:
|
||||||
|
body += '<h4>' + y['content'] + '</h4>'
|
||||||
|
if 'image' in y['cid']:
|
||||||
|
if 'renditions' in y['image']:
|
||||||
|
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
||||||
|
y['image']['url'].split('&')[0] + '&width=480', y['image']['caption']
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
||||||
|
y['image']['url'], y['image']['caption']
|
||||||
|
)
|
||||||
|
if 'gallery' in y['cid']:
|
||||||
|
for imgs in y['images']:
|
||||||
|
if 'renditions' in imgs:
|
||||||
|
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
||||||
|
imgs['url'].split('&')[0] + '&width=480', imgs['caption']
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
||||||
|
imgs['url'], imgs['caption']
|
||||||
|
)
|
||||||
|
if 'video' in y['cid']:
|
||||||
|
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
||||||
|
y['video']['thumbnail']['url'], y['video']['thumbnail']['caption']
|
||||||
|
)
|
||||||
|
return BeautifulSoup('<html><body><div>' + body + '</div></body></html>').prettify()
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def populate_article_metadata(self, article, soup, first):
|
||||||
for noscript in soup.findAll('noscript'):
|
article.url = soup.find('h1')['title']
|
||||||
if noscript.findAll('img'):
|
|
||||||
noscript.name = 'div'
|
|
||||||
for img in soup.findAll('img', attrs={'srcset':True}):
|
|
||||||
img['src'] = img['srcset'].split()[0]
|
|
||||||
return soup
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user