From 77e14bff2021f1c5c18cefd1b7cac075b0cb6932 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 11 Oct 2020 13:01:02 +0530
Subject: [PATCH] Update Reuters

---
 recipes/reuters.recipe | 48 ++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/recipes/reuters.recipe b/recipes/reuters.recipe
index 6d02150d07..2d174cd068 100644
--- a/recipes/reuters.recipe
+++ b/recipes/reuters.recipe
@@ -4,8 +4,6 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import re
-
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 country = 'us'
@@ -21,6 +19,19 @@ country_defs = {
 }
 
 
+def prefixed_classes(classes):
+    # Build find() kwargs matching tags with a class starting with any given prefix
+    prefixes = tuple(classes.split(' '))
+
+    def matcher(x):
+        # x: class attribute value under test; None or empty never matches
+        if x:
+            # str.startswith accepts a tuple, so no inner loop rebinding x
+            return any(c.startswith(prefixes) for c in x.split())
+        return False
+    return {'attrs': {'class': matcher}}
+
+
 class Reuters(BasicNewsRecipe):
     title = 'Reuters'
     description = 'News from all over'
@@ -28,13 +39,24 @@ class Reuters(BasicNewsRecipe):
     language = 'en'
 
     keep_only_tags = [
-        classes('ArticleHeader_content-container StandardArticleBody_body')
+        prefixed_classes('ArticlePage-article-header ArticlePage-article-body'),
     ]
     remove_tags = [
-        classes('Image_expand-button RelatedCoverage_related-coverage-module'),
-        dict(name='link'),
+        prefixed_classes('ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer-'),
+        dict(name=['button', 'link']),
     ]
 
+    def preprocess_html(self, soup, *a):
+        # Prepend the sailthru lead image to the body; skip if its URL is missing
+        meta = soup.find(attrs={'name': "sailthru.image.full"})
+        url = meta.get('content') if meta is not None else None
+        body = soup.find(**prefixed_classes('ArticlePage-article-body')) if url else None
+        if body is not None:
+            div = soup.new_tag('div')
+            div.append(soup.new_tag('img', src=url))
+            body.insert(0, div)
+        return soup
+
     def parse_index(self):
         base, sections = country_defs[country]
         ans = []
@@ -59,19 +81,3 @@ class Reuters(BasicNewsRecipe):
             url = 'https://{}{}'.format(base, a['href'])
             self.log('\t', title, url)
             yield {'title': title, 'url': url}
-
-    def preprocess_html(self, soup):
-        url_pat = re.compile(r'url\((.+?)\)')
-        for div in soup.findAll(style=True, **classes('LazyImage_image')):
-            m = url_pat.search(div['style'])
-            if m is None:
-                self.warn('Failed to find lazy image url in:', div['style'])
-                continue
-            url = m.group(1)
-            if url.startswith('//'):
-                url = 'https:' + url
-            url = url.replace('&w=20', '')
-            img = div.findPreviousSibling('img')
-            img['src'] = url
-            img['data-modified'] = url
-        return soup