Update Reuters

2025-07-09 03:04:10 -04:00 · 2020-10-11 13:01:02 +05:30 · 2020-10-11 13:01:02 +05:30 · 77e14bff20
commit 77e14bff20
parent fcf8822020
1 changed file with 27 additions and 21 deletions
--- a/recipes/reuters.recipe
+++ b/recipes/reuters.recipe
@ -4,8 +4,6 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import re
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 country = 'us'
@ -21,6 +19,19 @@ country_defs = {
 }
 def prefixed_classes(classes):
    """Build an attribute filter that matches CSS classes by prefix.

    ``classes`` is a whitespace separated list of class-name prefixes. The
    returned dict has the shape ``{'attrs': {'class': matcher}}`` so it can
    be unpacked as keyword arguments (``soup.find(**prefixed_classes(...))``)
    or listed in ``keep_only_tags`` / ``remove_tags``.

    ``matcher(value)`` returns True when any whitespace separated token of
    ``value`` starts with one of the prefixes, and False otherwise
    (including when ``value`` is None or empty).
    """
    # split() with no argument discards empty tokens. An empty prefix would
    # match every class name, because s.startswith('') is always True.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        # x is the class attribute value being tested; it may be None/empty.
        if not x or not prefixes:
            return False
        # str.startswith accepts a tuple and tests all prefixes in one call.
        return any(
            candidate.startswith(prefixes) for candidate in x.split()
        )

    return {'attrs': {'class': matcher}}
 class Reuters(BasicNewsRecipe):
    title = 'Reuters'
    description = 'News from all over'
@ -28,13 +39,24 @@ class Reuters(BasicNewsRecipe):
    language = 'en'
    keep_only_tags = [
-        classes('ArticleHeader_content-container StandardArticleBody_body')
+        prefixed_classes('ArticlePage-article-header ArticlePage-article-body'),
    ]
    remove_tags = [
-        classes('Image_expand-button RelatedCoverage_related-coverage-module'),
+        prefixed_classes('ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer-'),
-        dict(name='link'),
+        dict(name=['button', 'link']),
    ]
    def preprocess_html(self, soup, *a):
        """Insert the article's lead image at the top of the article body.

        Looks up the element whose ``name`` attribute is
        ``sailthru.image.full`` and reads the image URL from its ``content``
        attribute. If that URL and an element whose class starts with
        ``ArticlePage-article-body`` are both found, a ``<div>`` wrapping an
        ``<img src=url>`` is inserted as the first child of that element.

        :param soup: parsed article document; it must provide ``find()`` and
            ``new_tag()`` (presumably a BeautifulSoup object -- confirm
            against the caller)
        :param a: extra positional arguments, accepted and ignored
        :return: the same ``soup`` object, possibly modified in place
        """
        meta = soup.find(attrs={'name': "sailthru.image.full"})
        if meta is None:
            return soup
        # Use .get() so a meta element without a content attribute (or with
        # an empty one) is skipped instead of raising KeyError.
        url = meta.get('content')
        if not url:
            return soup
        body = soup.find(**prefixed_classes('ArticlePage-article-body'))
        if body is None:
            return soup
        div = soup.new_tag('div')
        div.append(soup.new_tag('img', src=url))
        body.insert(0, div)
        return soup
    def parse_index(self):
        base, sections = country_defs[country]
        ans = []
@ -59,19 +81,3 @@ class Reuters(BasicNewsRecipe):
            url = 'https://{}{}'.format(base, a['href'])
            self.log('\t', title, url)
            yield {'title': title, 'url': url}
    def preprocess_html(self, soup):
        """Point lazily loaded images at their real URL.

        For every element that has a ``style`` attribute and the
        ``LazyImage_image`` class, the first ``url(...)`` value in the style
        is extracted. Protocol-relative URLs (``//...``) are prefixed with
        ``https:`` and the substring ``&w=20`` is removed. The result is
        written to the ``src`` and ``data-modified`` attributes of the
        preceding sibling ``<img>``.

        Elements with no ``url(...)`` in their style, or with no preceding
        ``<img>`` sibling, are reported via ``self.warn`` and skipped.

        :param soup: parsed article document providing ``findAll()``
        :return: the same ``soup`` object, modified in place
        """
        url_pat = re.compile(r'url\((.+?)\)')
        for div in soup.findAll(style=True, **classes('LazyImage_image')):
            m = url_pat.search(div['style'])
            if m is None:
                self.warn('Failed to find lazy image url in:', div['style'])
                continue
            img = div.findPreviousSibling('img')
            if img is None:
                # No <img> to update; skip rather than fail on None['src'].
                self.warn('Failed to find img tag for lazy image in:', div['style'])
                continue
            url = m.group(1)
            if url.startswith('//'):
                url = 'https:' + url
            url = url.replace('&w=20', '')
            img['src'] = url
            img['data-modified'] = url
        return soup