Update ProPublica

Fixes #1895457 [Pro Publica news only creates Table of Contents, no articles.](https://bugs.launchpad.net/calibre/+bug/1895457)
2026-01-06 20:20:30 -05:00 · 2020-09-13 21:39:44 +05:30 · 2020-09-13 21:39:44 +05:30 · aa1482d782
commit aa1482d782
parent 4b58b7e550
1 changed file with 26 additions and 44 deletions
--- a/recipes/propublica.recipe
+++ b/recipes/propublica.recipe
@@ -2,6 +2,12 @@
 from calibre.web.feeds.news import BasicNewsRecipe


+def classes(classes):  # build keyword args (used below as findAll(..., **classes('x'))) that match by CSS class
+    q = frozenset(classes.split(' '))  # the wanted class names; the argument is split on single spaces
+    return dict(attrs={  # NOTE(review): the lambda is presumably called with an element's class attribute value - confirm
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})  # truthy only when x is non-empty and shares a name with q
+
+
 class ProPublicaRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
@@ -21,49 +27,25 @@ class ProPublicaRecipe(BasicNewsRecipe):
    no_stylesheets = True
    remove_javascript = True

-    keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'class': 'article'}))
+    keep_only_tags = [  # one class-name matcher; how the list is consumed is defined by BasicNewsRecipe (not shown here)
+        classes('title-wrapper content-wrapper article-header lead-art article-body')
+    ]
+    remove_tags = [  # one class-name matcher; consumed by BasicNewsRecipe (not shown here)
+        classes('email-signup story-tools newsletter topics-list')
+    ]
+    remove_tags_after = classes('article-body')  # a single matcher dict rather than a list

-    remove_tags = []
-    remove_tags.append(dict(name='div', attrs={'id': 'rollups'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'follow_info'}))
-    remove_tags.append(dict(name='ul', attrs={'class': 'long-tools-top'}))
-    remove_tags.append(dict(name='ul', attrs={'id': 'share-box'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'tags'}))
-    remove_tags.append(dict(name='ul', attrs={'class': 'long-tools'}))
-    remove_tags.append(dict(name='ul', attrs={'id': 'share-box2'}))
-    remove_tags.append(dict(name='p', attrs={'id': 'original-url'}))
+    feeds = [  # (title, URL) tuples; this commit leaves a single entry
+        (u'Top Stories', u'http://feeds.propublica.org/propublica/main'),
+    ]

-    feeds = []
-    feeds.append(
-        (u'Top Stories', u'http://feeds.propublica.org/propublica/main'))
-    feeds.append(
-        (u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'))
-    feeds.append(
-        (u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'))
-    feeds.append(
-        (u'Business', u'http://feeds.propublica.org/propublica/business-money'))
-    feeds.append(
-        (u'Justice', u'http://feeds.propublica.org/propublica/justice-law'))
-    feeds.append((u'Energy & Environment',
-                  u'http://feeds.propublica.org/propublica/energy-environment'))
-    feeds.append((u'Government & Politics',
-                  u'http://feeds.propublica.org/propublica/government-politics'))
-    feeds.append(
-        (u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'))
-    feeds.append((u'Media & Technology',
-                  u'http://feeds.propublica.org/propublica/media-technology'))
-    feeds.append((u'National Security',
-                  u'http://feeds.propublica.org/propublica/national-security'))
-
-    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
-                          'publisher': publisher}
-
-    extra_css = '''
-                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
-                img {float: left; margin-right: 0.5em;}
-                h1 {text-align: left;}
-                a, a[href] {text-decoration: none; color: blue;}
-                div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
-                div.info {font-size: small; color: #696969;}
-                '''
+    def preprocess_html(self, soup):  # rewrite <img> src attributes in place, then return the same soup object
+        for img in soup.findAll('img', **classes('lazyload')):  # pass 1: images carrying the 'lazyload' class
+            source = img.findParent('picture')  # enclosing <picture>, if any
+            if source is not None:
+                source = source.find('source')  # name reused: now the first <source> inside that <picture>
+                if source is not None:
+                    img['src'] = source['data-srcset']  # copied unmodified; NOTE(review): assumes the attribute exists and holds one URL - confirm
+        for img in soup.findAll('img', srcset=True):  # pass 2: any <img> with a srcset attribute (may overwrite a src set in pass 1)
+            img['src'] = img['srcset'].split()[0]  # first whitespace-separated token of srcset
+        return soup