From 3f16b5ac618aa9f8f69ab5670ff58a2ea56e7c1d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 1 Jan 2022 14:34:27 +0530 Subject: [PATCH] Update Reuters --- recipes/reuters.recipe | 70 +++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/recipes/reuters.recipe b/recipes/reuters.recipe index a85a703ea5..602cb4566c 100644 --- a/recipes/reuters.recipe +++ b/recipes/reuters.recipe @@ -4,17 +4,18 @@ from __future__ import absolute_import, division, print_function, unicode_literals -from calibre.web.feeds.news import BasicNewsRecipe, classes +import json +from calibre.web.feeds.news import BasicNewsRecipe country = 'us' country_defs = { 'us': ('www.reuters.com', { - 'Business': 'finance', - 'Markets': 'finance/markets', 'World': 'world', - 'Politics': 'politics', - 'Tech': 'news/technology', - 'Wealth': 'finance/wealth', + 'Business': 'business', + 'Markets': 'markets', + 'Tech': 'technology', + 'Sports': 'lifestyle/sports', + 'Wealth': 'markets/wealth', }) } @@ -32,6 +33,25 @@ def prefixed_classes(classes): return {'attrs': {'class': matcher}} +def extract_article_list(raw): + if isinstance(raw, bytes): + raw = raw.decode('utf-8') + # open('/t/raw.html', 'w').write(raw) + idx = raw.index(';Fusion.globalContent={') + d = raw[idx:] + d = d[d.index('{'):] + data = json.JSONDecoder().raw_decode(d)[0] + # from pprint import pformat + # print(pformat(data), file=open('/t/raw.py', 'w')) + k = 'arcResult' if 'arcResult' in data else 'result' + for article in data[k]['articles']: + yield {'title': article['title'], 'description': article['description'], 'url': article['canonical_url']} + + +# if __name__ == '__main__': +# print(list(extract_article_list(open('/t/World News _ Latest Top Stories _ Reuters.html').read()))) + + class Reuters(BasicNewsRecipe): title = 'Reuters' description = 'News from all over' @@ -39,29 +59,28 @@ class Reuters(BasicNewsRecipe): language = 'en' keep_only_tags = [ - prefixed_classes('ArticlePage-article-header ArticlePage-article-body'), + prefixed_classes('ArticleHeader__heading___ ArticleHeader__author___ ArticleBody__container___ ArticlePage-article-header ArticlePage-article-body'), ] remove_tags = [ - prefixed_classes('ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer-'), + prefixed_classes( + 'ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___' + ' SocialEmbed__inner___' + ), dict(name=['button', 'link']), ] + remove_attributes = ['style'] def preprocess_html(self, soup, *a): - meta = soup.find(attrs={'name': "sailthru.image.full"}) - if meta is not None: - url = meta['content'] - body = soup.find(**prefixed_classes('ArticlePage-article-body')) - if body is not None: - div = soup.new_tag('div') - div.append(soup.new_tag('img', src=url)) - body.insert(0, div) + for noscript in soup.findAll('noscript'): + if noscript.findAll('img'): + noscript.name = 'div' return soup def parse_index(self): base, sections = country_defs[country] ans = [] - for section_title in sorted(sections): + for section_title in sections: slug = sections[section_title] self.log(section_title) articles = list(self.parse_reuters_section(base, slug)) @@ -73,15 +92,8 @@ class Reuters(BasicNewsRecipe): def parse_reuters_section(self, base, slug): url = 'https://' + base + '/' + slug - try: - soup = self.index_to_soup(url) - except Exception: - self.log.error('Failed to load Reuters section:', url) - return - for div in soup.findAll(**classes('news-headline-list')): - h3 = div.find(**classes('story-title')) - a = h3.parent - title = self.tag_to_string(h3) - url = 'https://{}{}'.format(base, a['href']) - self.log('\t', title, url) - yield {'title': title, 'url': url} + raw = self.index_to_soup(url, raw=True) + for article in extract_article_list(raw): + article['url'] = 'https://{}{}'.format(base, article['url']) + yield article + self.log('\t', article['title'], article['url'])