#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import json

from calibre.web.feeds.news import BasicNewsRecipe

country = 'us'

country_defs = {
    'us': ('www.reuters.com', {
        'World': 'world',
        'Business': 'business',
        'Markets': 'markets',
        'Tech': 'technology',
        'Sports': 'lifestyle/sports',
        'Wealth': 'markets/wealth',
    })
}


def prefixed_classes(classes):
    # Build a BeautifulSoup attribute matcher that accepts any element whose
    # class attribute contains a token starting with one of the given prefixes.
    q = frozenset(classes.split(' '))

    def matcher(x):
        if x:
            for candidate in frozenset(x.split()):
                for prefix in q:
                    if candidate.startswith(prefix):
                        return True
        return False
    return {'attrs': {'class': matcher}}


def extract_article_list(raw):
    # The article metadata is embedded in the section page as a JavaScript
    # assignment of the form ";Fusion.globalContent={...}"; locate it and
    # decode the JSON object that follows the first brace.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    # open('/t/raw.html', 'w').write(raw)
    idx = raw.index(';Fusion.globalContent={')
    d = raw[idx:]
    d = d[d.index('{'):]
    data = json.JSONDecoder().raw_decode(d)[0]
    # from pprint import pformat
    # print(pformat(data), file=open('/t/raw.py', 'w'))
    k = 'arcResult' if 'arcResult' in data else 'result'
    for article in data[k]['articles']:
        yield {'title': article['title'], 'description': article['description'], 'url': article['canonical_url']}


# if __name__ == '__main__':
#     print(list(extract_article_list(open('/t/World News _ Latest Top Stories _ Reuters.html').read())))


class Reuters(BasicNewsRecipe):
    title = 'Reuters'
    description = 'News from all over'
    __author__ = 'Kovid Goyal'
    language = 'en'

    keep_only_tags = [
        prefixed_classes('article-header__heading___ article-header__author___ article-body__content___'),
    ]
    remove_tags = [
        prefixed_classes(
            'context-widget__tabs___'
            ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
            ' SocialEmbed__inner___'
        ),
        dict(name=['button', 'link']),
    ]
    remove_attributes = ['style']
    extra_css = '''
        img { max-width: 100%; }
    '''

    def preprocess_html(self, soup, *a):
        # Images are delivered inside <noscript> tags; turn those into <div>s
        # so the images survive conversion.
        for noscript in soup.findAll('noscript'):
            if noscript.findAll('img'):
                noscript.name = 'div'
        return soup

    def parse_index(self):
        base, sections = country_defs[country]
        ans = []

        for section_title in sections:
            slug = sections[section_title]
            self.log(section_title)
            articles = list(self.parse_reuters_section(base, slug))
            if articles:
                ans.append((section_title, articles))
            if self.test and len(ans) >= self.test[0]:
                break
        return ans

    def parse_reuters_section(self, base, slug):
        url = 'https://' + base + '/' + slug
        raw = self.index_to_soup(url, raw=True)
        for article in extract_article_list(raw):
            article['url'] = 'https://{}{}'.format(base, article['url'])
            yield article
            self.log('\t', article['title'], article['url'])
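
# A minimal sketch of the input extract_article_list() expects, kept commented
# out in the same spirit as the debug helpers above. The payload below is
# made-up illustrative data, not real Reuters markup; only the
# ";Fusion.globalContent={...}" shape matches what the function looks for.
#
# sample = (
#     '<script>;Fusion.globalContent={"result": {"articles": ['
#     '{"title": "T", "description": "D", "canonical_url": "/world/example/"}'
#     ']}};</script>'
# )
# assert list(extract_article_list(sample)) == [
#     {'title': 'T', 'description': 'D', 'url': '/world/example/'}
# ]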