...

2025-09-29 15:31:08 -04:00 · 2024-07-25 12:49:25 +05:30 · 2024-07-25 12:49:25 +05:30 · 76684b3a2b
commit 76684b3a2b
parent b9c50b071a
4 changed files with 49 additions and 6 deletions
--- a/recipes/al_jazeera.recipe
+++ b/recipes/al_jazeera.recipe
@ -1,3 +1,5 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 __license__ = 'GPL v3'
 __copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
@ -35,6 +37,20 @@ class AlJazeera(BasicNewsRecipe):
                   'meta', 'base', 'iframe', 'embed']),
    ]
    # Recipe-specific option declarations. The single 'days' entry carries a
    # short and a long description plus a default, which is the string form of
    # the class-level oldest_article value. The __init__ of this class reads
    # the 'days' value back and, when it is a non-empty string, converts it to
    # a float for self.oldest_article.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }
    def __init__(self, *args, **kwargs):
        """Initialise the recipe and apply the optional 'days' override.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.__init__``. Afterwards, if
        ``self.recipe_specific_options`` holds a non-empty string under
        'days', it is parsed as a float and stored in ``self.oldest_article``.

        A 'days' string that is not a valid number is reported through
        ``self.log`` and ignored, leaving ``oldest_article`` at its existing
        value instead of aborting recipe construction with ``ValueError``.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        days = self.recipe_specific_options.get('days')
        if days and isinstance(days, str):
            try:
                self.oldest_article = float(days)
            except ValueError:
                # A typo in the option should not prevent the download from
                # running; fall back to the value already configured.
                self.log(
                    'Ignoring invalid value for the "days" option:', repr(days)
                )
    feeds = [(u'Al Jazeera English',
              u'http://www.aljazeera.com/xml/rss/all.xml')]
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@ -40,6 +40,20 @@ class TheIndependentNew(BasicNewsRecipe):
    encoding = 'utf-8'
    compress_news_images = True
    # Recipe-specific option declarations. The single 'days' entry carries a
    # short and a long description plus a default, which is the string form of
    # the class-level oldest_article value. The __init__ of this class reads
    # the 'days' value back and, when it is a non-empty string, converts it to
    # a float for self.oldest_article.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }
    def __init__(self, *args, **kwargs):
        """Initialise the recipe and apply the optional 'days' override.

        All positional and keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.__init__``. Afterwards, if
        ``self.recipe_specific_options`` holds a non-empty string under
        'days', it is parsed as a float and stored in ``self.oldest_article``.

        A 'days' string that is not a valid number is reported through
        ``self.log`` and ignored, leaving ``oldest_article`` at its existing
        value instead of aborting recipe construction with ``ValueError``.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        days = self.recipe_specific_options.get('days')
        if days and isinstance(days, str):
            try:
                self.oldest_article = float(days)
            except ValueError:
                # A typo in the option should not prevent the download from
                # running; fall back to the value already configured.
                self.log(
                    'Ignoring invalid value for the "days" option:', repr(days)
                )
    keep_only_tags = [
        dict(id=['articleHeader', 'main']),
        classes('headline sub-headline breadcrumb author publish-date hero-image body-content'),
--- a/recipes/livemint.recipe
+++ b/recipes/livemint.recipe
@ -143,9 +143,7 @@ class LiveMint(BasicNewsRecipe):
            # remove empty p tags
            raw = re.sub(
                r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', '\g<2>', re.sub(
-                    r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)', '', re.sub(
+                    r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)', '', raw
                        r'(?=<h2>\s*Also\s*Read).*?(?<=</h2>)', '', raw
                    )
                )
            )
            if '<script>var wsjFlag=true;</script>' in raw:
@ -186,10 +184,11 @@ class LiveMint(BasicNewsRecipe):
            for span in soup.findAll('span', attrs={'class':'exclusive'}):
                span.extract()
            for al in soup.findAll('a', attrs={'class':'manualbacklink'}):
-                pa = al.findParent('p')
+                pa = al.findParent(['p', 'h2', 'h3', 'h4'])
                if pa:
                    pa.extract()
-            if wa := soup.find(**classes('autobacklink-topic')):
+            wa = soup.find(**classes('autobacklink-topic'))
            if wa:
                p = wa.findParent('p')
                if p:
                    p.extract()
--- a/recipes/new_scientist_mag.recipe
+++ b/recipes/new_scientist_mag.recipe
@ -1,3 +1,5 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 '''
 newscientist.com
 '''
@ -70,8 +72,20 @@ class NewScientist(BasicNewsRecipe):
        classes('ArticleHeader__SocialWrapper AdvertWrapper ReadMoreWithImage ArticleTopics')
    ]
    # Recipe-specific option declarations. The single 'issue' entry carries a
    # short and a long description and declares no default. parse_index in
    # this class reads the 'issue' value and, when it is a non-empty string,
    # appends it to 'https://www.newscientist.com/issue/' to pick the issue
    # page; otherwise the current-issue URL is used.
    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download ',
            'long': 'For example, 3498'
        }
    }
    def parse_index(self):
-        soup = self.index_to_soup('https://www.newscientist.com/issues/current/')
+        issue_url = 'https://www.newscientist.com/issues/current/'
        d = self.recipe_specific_options.get('issue')
        if d and isinstance(d, str):
            issue_url = 'https://www.newscientist.com/issue/' + d
        soup = self.index_to_soup(issue_url)
        div = soup.find('div', attrs={'class':'ThisWeeksMagazineHero__CoverInfo'})
        tme = div.find(**classes('ThisWeeksMagazineHero__MagInfoHeading'))
        self.log('Downloading issue:', self.tag_to_string(tme))