#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import json

from calibre.web.feeds.news import BasicNewsRecipe

country = 'us'
country_defs = {
    'us': ('www.reuters.com', {
        'World': 'world',
        'Business': 'business',
        'Markets': 'markets',
        'Tech': 'technology',
        'Sports': 'lifestyle/sports',
        'Wealth': 'markets/wealth',
    })
}
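
# country_defs maps a country code to (domain, {section title: URL path}).
# A second edition could plausibly be added the same way, e.g. (hypothetical
# entry, assuming the same Fusion page structure on that domain):
# 'uk': ('uk.reuters.com', {'World': 'world'}),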


def prefixed_classes(classes):
    # Build a calibre tag-matching rule that accepts any element whose class
    # names merely start with one of the given prefixes: Reuters appends a
    # build-specific hash after the trailing underscores of each class name.
    q = frozenset(classes.split(' '))

    def matcher(x):
        if x:
            for candidate in frozenset(x.split()):
                for prefix in q:
                    if candidate.startswith(prefix):
                        return True
        return False
    return {'attrs': {'class': matcher}}
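

# Illustrative usage (the hash suffix below is made up): the rule returned by
# prefixed_classes('article-body__content___') matches an element such as
# <div class="article-body__content___2gQn">, because only the hash after the
# trailing underscores changes between site builds.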


def extract_article_list(raw):
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    # open('/t/raw.html', 'w').write(raw)
    # The section page embeds its article list as JSON assigned to
    # Fusion.globalContent in an inline <script>; find that assignment and
    # decode only the object that follows it.
    idx = raw.index(';Fusion.globalContent={')
    d = raw[idx:]
    d = d[d.index('{'):]
    data = json.JSONDecoder().raw_decode(d)[0]
    # from pprint import pformat
    # print(pformat(data), file=open('/t/raw.py', 'w'))
    k = 'arcResult' if 'arcResult' in data else 'result'
    for article in data[k]['articles']:
        yield {'title': article['title'], 'description': article['description'], 'url': article['canonical_url']}
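

# raw_decode() is used above instead of json.loads() because the JSON object
# is followed by more JavaScript; it returns (object, end_index) and ignores
# trailing text, e.g. json.JSONDecoder().raw_decode('{"a": 1};var x=2')[0]
# evaluates to {'a': 1}.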


# if __name__ == '__main__':
#     print(list(extract_article_list(open('/t/World News _ Latest Top Stories _ Reuters.html').read())))


class Reuters(BasicNewsRecipe):
    title = 'Reuters'
    description = 'News from all over'
    __author__ = 'Kovid Goyal'
    language = 'en'

    keep_only_tags = [
        prefixed_classes('article-header__heading___ article-header__author___ article-body__content___'),
    ]
    remove_tags = [
        prefixed_classes(
            'context-widget__tabs___'
            ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
            ' SocialEmbed__inner___'
        ),
        dict(name=['button', 'link']),
    ]
    remove_attributes = ['style']
    extra_css = '''
        img { max-width: 100%; }
    '''

    def preprocess_html(self, soup, *a):
        # Reuters serves images inside <noscript> fallbacks; rename those
        # tags to <div> so the images they contain are kept and downloaded.
        for noscript in soup.findAll('noscript'):
            if noscript.findAll('img'):
                noscript.name = 'div'
        return soup

    def parse_index(self):
        base, sections = country_defs[country]
        ans = []

        for section_title in sections:
            slug = sections[section_title]
            self.log(section_title)
            articles = list(self.parse_reuters_section(base, slug))
            if articles:
                ans.append((section_title, articles))
            # In test mode (the --test flag), self.test is a tuple whose
            # first element caps the number of sections to download.
            if self.test and len(ans) >= self.test[0]:
                break
        return ans
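
    # parse_index() returns calibre's standard index structure, a list of
    # (section title, list of article dicts) pairs; a sketch with made-up
    # values: [('World', [{'title': '...', 'description': '...',
    #                      'url': 'https://www.reuters.com/world/...'}])]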

    def parse_reuters_section(self, base, slug):
        url = 'https://' + base + '/' + slug
        raw = self.index_to_soup(url, raw=True)
        for article in extract_article_list(raw):
            # canonical_url is site-relative, so prefix the country's domain
            article['url'] = 'https://{}{}'.format(base, article['url'])
            yield article
            self.log('\t', article['title'], article['url'])
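

# To preview the recipe while developing it, calibre's standard test workflow
# can be used (file and output names here are arbitrary):
#   ebook-convert Reuters.recipe reuters.epub --test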