Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-07-07 18:24:30 -04:00 · 2023-10-14 11:46:05 +05:30 · 2023-10-14 11:46:05 +05:30 · 88fbe7424e
commit 88fbe7424e
parent 16a6fcd15e 0841f08d22
3 changed files with 53 additions and 12 deletions
--- a/recipes/icons/newslaundry.png
+++ b/recipes/icons/newslaundry.png
--- a/recipes/newslaundry.recipe
+++ b/recipes/newslaundry.recipe
@ -0,0 +1,36 @@
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 class newslaundry(BasicNewsRecipe):
    # Recipe that builds an e-book from the Newslaundry stories RSS feed.

    # --- Identity / metadata ---
    title = 'Newslaundry'
    __author__ = 'unkn0wn'
    language = 'en_IN'
    description = (
        'Newslaundry is a reader-supported, independent news media company.'
        ' In an industry driven by corporate and government interests, we'
        ' strongly believe in the need for an independent news model, and a'
        ' free and accountable press.'
    )
    masthead_url = 'https://images.assettype.com/newslaundry/2020-01/d91cad07-9650-47e9-8bdc-9a6247354d95/Header_logo_NL__2_New.png'

    # --- Source ---
    # The feed URL itself asks for the last 7 days of stories, which lines up
    # with the oldest_article window below.
    feeds = [
        ('Articles', 'https://www.newslaundry.com/stories.rss?time-period=last-7-days')
    ]
    oldest_article = 7  # days
    ignore_duplicate_articles = {'url'}

    # --- Download / clean-up options ---
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    resolve_internal_links = True

    # Disabled alternative: restrict output to specific article containers.
    # keep_only_tags = [classes('headline subheadline authorWithTimeStamp story-card')]

    # Disabled alternative: promote the headline / subheadline elements to
    # real heading tags.
    # def preprocess_html(self, soup):
    #     if h1 := soup.find(**classes('headline')):
    #         h1.name = 'h1'
    #     if h3 := soup.find(**classes('subheadline')):
    #         h3.name = 'h3'
    #     return soup

    def print_version(self, url):
        """Return ``url`` unchanged, aborting articles from the Hindi site.

        Any URL containing ``hindi.newslaundry`` is passed to
        ``self.abort_article``; every other URL is returned as is.
        """
        # Remove this check if you want Hindi articles.
        is_hindi = 'hindi.newslaundry' in url
        if is_hindi:
            self.abort_article('Skipping hindi article')
        return url
--- a/recipes/reuters.recipe
+++ b/recipes/reuters.recipe
@ -58,27 +58,29 @@ class Reuters(BasicNewsRecipe):
    __author__ = 'Kovid Goyal'
    language = 'en'
    keep_only_tags = [
        prefixed_classes('article-body__container__ article-header__container__'),
    ]
    remove_tags = [
        prefixed_classes(
-            'context-widget__tabs___'
+            'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__'
            ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
-            ' SocialEmbed__inner___'
+            ' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__'
        ),
-        dict(name=['button', 'link']),
+        dict(name=['button', 'link', 'svg']),
    ]
-    remove_attributes = ['style']
+    remove_attributes = ['style', 'height', 'width']
    extra_css = '''
    img { max-width: 100%; }
    '''
-    def preprocess_html(self, soup, *a):
+    extra_css = '''
-        for noscript in soup.findAll('noscript'):
+        img { max-width: 100%; }
-            if noscript.findAll('img'):
+        [class^="article-header__tags__"],
-                noscript.name = 'div'
+        [class^="author-bio__author-card__"],
-        return soup
+        [class^="article-header__author-date__"] {
            font-size:small;
        }
        [data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; }
    '''
    def parse_index(self):
        base, sections = country_defs[country]
@ -103,6 +105,9 @@ class Reuters(BasicNewsRecipe):
            self.log('\t', article['title'], article['url'])
    def preprocess_html(self, soup):
        """Make the images of a downloaded article usable before conversion.

        Two in-place adjustments are applied to ``soup``:

        * every ``<noscript>`` element that contains an ``<img>`` is renamed
          to ``<div>``;
        * every ``<img>`` that carries a ``srcset`` attribute gets its ``src``
          set to the first URL listed in that ``srcset``.

        :param soup: parsed article markup; it must provide ``findAll`` and
            its tags must support item access for attributes.
        :return: the same ``soup`` object, modified in place.
        """
        for noscript in soup.findAll('noscript'):
            if noscript.findAll('img'):
                # Presumably done so the fallback image is kept as ordinary
                # content rather than being dropped with the <noscript>.
                noscript.name = 'div'
        for img in soup.findAll('img', attrs={'srcset': True}):
            candidates = img['srcset'].split()
            if not candidates:
                # Empty or whitespace-only srcset: keep whatever src the tag
                # already has instead of failing with IndexError.
                continue
            # A candidate without a width/density descriptor is immediately
            # followed by the separating comma ("a.jpg, b.jpg 2x"); that comma
            # is not part of the URL.
            first_url = candidates[0].rstrip(',')
            if first_url:
                img['src'] = first_url
        return soup