Update Naked Capitalism

2026-05-27 09:12:34 -04:00 · 2026-03-21 17:25:54 +05:30
parent f23369204d
commit 2c9af589ed
1 changed files with 55 additions and 32 deletions
@@ -1,43 +1,66 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
 from calibre.web.feeds.news import BasicNewsRecipe


 class NakedCapitalism(BasicNewsRecipe):
+    """Calibre news recipe for Naked Capitalism, built from the site's /feed URL."""
    title = 'Naked Capitalism'
-    __author__ = 'PaulB223'
    language = 'en_US'
    oldest_article = 7
-    max_articles_per_feed = 50
+    max_articles_per_feed = 100
+    # Use calibre's automatic article extraction in place of hand-written
+    # keep_only_tags / remove_tags selectors.
+    auto_cleanup = True

-    simultaneous_downloads = 1
-    delay = 5.0
-
-    disable_header = True
-    fetch_masthead = False
-    no_stylesheets = True
-    remove_javascript = True
-
-    keep_only_tags = [
-        dict(name='article'),
-        dict(attrs={'class': lambda x: x and 'post-content' in x.split()}),
-        dict(attrs={'class': 'entry-content'}),
-        dict(attrs={'class': 'post-content'}),
-        dict(id='content')
-    ]
-
-    remove_tags = [
-        dict(name=['nav', 'header', 'footer', 'aside', 'svg', 'button', 'script', 'style']),
-        dict(attrs={'class': lambda x: x and any(c in x.lower() for c in ['sidebar', 'ads', 'ad-', 'share', 'donation', 'related', 'comments'])})
-    ]
-
-    def get_feeds(self):
-        return [
-            ('Naked Capitalism', 'https://www.nakedcapitalism.com/feed'),
-            ('Naked Capitalism (p2)', 'https://www.nakedcapitalism.com/feed?paged=2'),
-        ]
+    # Force a serif font and black text, and make div/article/section elements
+    # auto-sized, visible block boxes regardless of earlier style rules.
+    extra_css = '''
+        body { font-family: serif !important; color: black !important; }
+        p { display: block !important; margin-bottom: 1em !important; line-height: 1.4 !important; }
+        div, article, section {
+            width: auto !important;
+            height: auto !important;
+            overflow: visible !important;
+            display: block !important;
+        }
+    '''

    def preprocess_html(self, soup):
-        for link in soup.findAll('a', text=lambda x: x and 'Read more' in x):
-            link.decompose()
-        for link in soup.findAll('a', text=lambda x: x and 'Continue reading' in x):
-            link.decompose()
+        """Remove scripts, style sheets, iframes and inline style/srcset attributes.
+
+        :param soup: BeautifulSoup tree of a downloaded article page.
+        :return: the same soup object, modified in place.
+        """
+        for tag in soup.findAll(['script', 'style', 'iframe']):
+            tag.decompose()
+        for tag in soup.findAll(True):
+            if tag.has_attr('style'):
+                del tag['style']
+            # NOTE(review): presumably dropped so readers fall back to the plain src image.
+            if tag.has_attr('srcset'):
+                del tag['srcset']
        return soup
+
+    def get_browser(self, *args, **kwargs):
+        """Return the recipe browser with robots.txt checks off and browser-like headers.
+
+        Any arguments are forwarded unchanged to BasicNewsRecipe.get_browser().
+        """
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.set_handle_robots(False)
+        # Replaces the browser's whole default header list.
+        br.addheaders = [
+            ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36 Edg/145.0.0.0'),
+            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
+            ('Accept-Language', 'en-US,en;q=0.5'),
+            # Advertise gzip only: a 'br' or 'deflate' body the browser cannot
+            # decode would reach the parser still compressed (NOTE: confirm).
+            ('Accept-Encoding', 'gzip'),
+            ('Connection', 'keep-alive'),
+        ]
+        return br
+
+    feeds = [
+        ('Naked Capitalism', 'https://www.nakedcapitalism.com/feed'),
+    ]