#!/usr/bin/env python # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2020, Kovid Goyal from __future__ import absolute_import, division, print_function, unicode_literals import json from calibre.web.feeds.news import BasicNewsRecipe country = 'us' country_defs = { 'us': ('www.reuters.com', { 'World': 'world', 'Business': 'business', 'Markets': 'markets', 'Tech': 'technology', # 'Sports': 'lifestyle/sports', 'Wealth': 'markets/wealth', }) } def prefixed_classes(classes): q = frozenset(classes.split(' ')) def matcher(x): if x: for candidate in frozenset(x.split()): for x in q: if candidate.startswith(x): return True return False return {'attrs': {'class': matcher}} def extract_article_list(raw): if isinstance(raw, bytes): raw = raw.decode('utf-8') # open('/t/raw.html', 'w').write(raw) idx = raw.index(';Fusion.globalContent={') d = raw[idx:] d = d[d.index('{'):] data = json.JSONDecoder().raw_decode(d)[0] # from pprint import pformat # print(pformat(data), file=open('/t/raw.py', 'w')) k = 'arcResult' if 'arcResult' in data else 'result' for article in data[k]['articles']: yield {'title': article['title'], 'description': article['description'], 'url': article['canonical_url']} # if __name__ == '__main__': # print(list(extract_article_list(open('/t/World News _ Latest Top Stories _ Reuters.html').read()))) class Reuters(BasicNewsRecipe): title = 'Reuters' description = 'News from all over' __author__ = 'Kovid Goyal' language = 'en' keep_only_tags = [ prefixed_classes('article-body__container__ article-header__container__'), ] remove_tags = [ prefixed_classes( 'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__' ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___' ' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__' ), dict(name=['button', 'link', 'svg']), ] remove_attributes = ['style', 'height', 'width'] extra_css = ''' img { max-width: 100%; } [class^="article-header__tags__"], [class^="author-bio__author-card__"], [class^="article-header__author-date__"] { font-size:small; } [data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; } ''' def parse_index(self): base, sections = country_defs[country] ans = [] for section_title in sections: slug = sections[section_title] self.log(section_title) articles = list(self.parse_reuters_section(base, slug)) if articles: ans.append((section_title, articles)) if self.test and len(ans) >= self.test[0]: break return ans def parse_reuters_section(self, base, slug): url = 'https://' + base + '/' + slug raw = self.index_to_soup(url, raw=True) for article in extract_article_list(raw): article['url'] = 'https://{}{}'.format(base, article['url']) yield article self.log('\t', article['title'], article['url']) def preprocess_html(self, soup): for noscript in soup.findAll('noscript'): if noscript.findAll('img'): noscript.name = 'div' for img in soup.findAll('img', attrs={'srcset':True}): img['src'] = img['srcset'].split()[0] return soup