From aa1482d782afb42bce3fff581d10c3e167326433 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 13 Sep 2020 21:39:44 +0530
Subject: [PATCH] Update ProPublica

Fixes #1895457 [Pro Publica news only creates Table of Contents, no articles.](https://bugs.launchpad.net/calibre/+bug/1895457)
---
 recipes/propublica.recipe | 70 +++++++++++++++------------------------
 1 file changed, 26 insertions(+), 44 deletions(-)

diff --git a/recipes/propublica.recipe b/recipes/propublica.recipe
index 19d63d4215..f19e4928b2 100644
--- a/recipes/propublica.recipe
+++ b/recipes/propublica.recipe
@@ -2,6 +2,12 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
+def classes(classes):  # build an attrs matcher from a space-separated list of class names
+    q = frozenset(classes.split(' '))  # the wanted class names
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})  # falsy for a missing/empty class value, else the names shared with q
+
+
 class ProPublicaRecipe(BasicNewsRecipe):
     __license__ = 'GPL v3'
     __author__ = 'kwetal'
@@ -21,49 +27,25 @@ class ProPublicaRecipe(BasicNewsRecipe):
     no_stylesheets = True
     remove_javascript = True
 
-    keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'class': 'article'}))
+    keep_only_tags = [  # matchers built by classes(); presumably the elements calibre keeps -- confirm in BasicNewsRecipe docs
+        classes('title-wrapper content-wrapper article-header lead-art article-body')
+    ]
+    remove_tags = [  # matchers for elements to strip (per the option name -- confirm in BasicNewsRecipe docs)
+        classes('email-signup story-tools newsletter topics-list')
+    ]
+    remove_tags_after = classes('article-body')  # single matcher, not a list
 
-    remove_tags = []
-    remove_tags.append(dict(name='div', attrs={'id': 'rollups'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'follow_info'}))
-    remove_tags.append(dict(name='ul', attrs={'class': 'long-tools-top'}))
-    remove_tags.append(dict(name='ul', attrs={'id': 'share-box'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'tags'}))
-    remove_tags.append(dict(name='ul', attrs={'class': 'long-tools'}))
-    remove_tags.append(dict(name='ul', attrs={'id': 'share-box2'}))
-    remove_tags.append(dict(name='p', attrs={'id': 'original-url'}))
+    feeds = [  # (title, feed URL) tuples
+        (u'Top Stories', u'http://feeds.propublica.org/propublica/main'),
+    ]
 
-    feeds = []
-    feeds.append(
-        (u'Top Stories', u'http://feeds.propublica.org/propublica/main'))
-    feeds.append(
-        (u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'))
-    feeds.append(
-        (u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'))
-    feeds.append(
-        (u'Business', u'http://feeds.propublica.org/propublica/business-money'))
-    feeds.append(
-        (u'Justice', u'http://feeds.propublica.org/propublica/justice-law'))
-    feeds.append((u'Energy & Environment',
-                  u'http://feeds.propublica.org/propublica/energy-environment'))
-    feeds.append((u'Government & Politics',
-                  u'http://feeds.propublica.org/propublica/government-politics'))
-    feeds.append(
-        (u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'))
-    feeds.append((u'Media & Technology',
-                  u'http://feeds.propublica.org/propublica/media-technology'))
-    feeds.append((u'National Security',
-                  u'http://feeds.propublica.org/propublica/national-security'))
-
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher}
-
-    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                img {float: left; margin-right: 0.5em;}
-                h1 {text-align: left;}
-                a, a[href] {text-decoration: none; color: blue;}
-                div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
-                div.info {font-size: small; color: #696969;}
-                '''
+    def preprocess_html(self, soup):  # rewrite <img> src attributes in place, then return the same soup
+        for img in soup.findAll('img', **classes('lazyload')):  # images whose class list includes 'lazyload'
+            source = img.findParent('picture')  # enclosing <picture>, or None
+            if source is not None:
+                source = source.find('source')  # name reused: now the first <source> inside that <picture>
+                if source is not None:
+                    img['src'] = source['data-srcset']  # value copied whole; NOTE(review): not a single URL if it lists several candidates -- confirm
+        for img in soup.findAll('img', srcset=True):  # every <img> that has a srcset attribute
+            img['src'] = img['srcset'].split()[0]  # first whitespace-separated token; NOTE(review): IndexError if srcset is empty -- confirm it cannot be
+        return soup