Update Outlook Magazine

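Fix images (set each img's src from its data-src attribute) and remove some tags (adds the story-mag-issue-section class to remove_tags).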
2026-04-05 00:31:59 -04:00 · 2025-08-24 20:14:55 +05:30 · 2025-08-24 20:14:55 +05:30 · 33fc94b96d
commit 33fc94b96d
parent 7676b281ae
2 changed files with 27 additions and 14 deletions
--- a/recipes/hbr.recipe
+++ b/recipes/hbr.recipe
@ -177,7 +177,7 @@ class HBR(BasicNewsRecipe):
            div.name = 'blockquote'
        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
            sidebar.name = 'blockquote'
-        for img in soup.findAll(attrs={'srcset': True}):
+        for img in soup.findAll('img', attrs={'srcset': True}):
            split = img['srcset'].split(',')
            for x in split:
                if '700w' in x:
--- a/recipes/outlook_india.recipe
+++ b/recipes/outlook_india.recipe
@ -33,23 +33,25 @@ class outlook(BasicNewsRecipe):

    remove_tags = [
        dict(name='svg'),
-        dict(name='a', attrs={'href':lambda x: x and x.startswith('https://www.whatsapp.com/')}),
-        classes('ads-box info-img-absolute mobile-info-id story-dec-time-mobile sb-also-read ads-box1')
+        dict(
+            name='a',
+            attrs={'href': lambda x: x and x.startswith('https://www.whatsapp.com/')},
+        ),
+        classes(
+            'ads-box info-img-absolute mobile-info-id story-dec-time-mobile sb-also-read ads-box1 story-mag-issue-section'
+        ),
    ]

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (DD-Month-YYYY format)',
-            'long': 'For example, 10-june-2024'
+            'long': 'For example, 10-june-2024',
        }
    }

-    def get_browser(self):
-        return BasicNewsRecipe.get_browser(self, user_agent='common_words/based', verify_ssl_certificates=False)
-
    def parse_index(self):
        self.log(
-            '\n***\nif this recipe fails, report it on: '
+            'try again and again\n***\nif this recipe fails, report it on: '
            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
        )

@ -58,18 +60,18 @@ class outlook(BasicNewsRecipe):
            url = 'https://www.outlookindia.com/magazine/' + d
        else:
            soup = self.index_to_soup('https://www.outlookindia.com/magazine')
-            a = soup.find('a', attrs={'aria-label':'magazine-cover-image'})
+            a = soup.find('a', attrs={'aria-label': 'magazine-cover-image'})
            url = a['href']

        self.log('Downloading issue:', url)

        soup = self.index_to_soup(url)
-        cov = soup.find(attrs={'aria-label':'magazine-cover-image'})
+        cov = soup.find(attrs={'aria-label': 'magazine-cover-image'})
        self.cover_url = cov.img['src'].split('?')[0]
-        summ = soup.find(attrs={'data-test-id':'magazine-summary'})
+        summ = soup.find(attrs={'data-test-id': 'magazine-summary'})
        if summ:
            self.description = self.tag_to_string(summ)
-        tme = soup.find(attrs={'class':'arr__timeago'})
+        tme = soup.find(attrs={'class': 'arr__timeago'})
        if tme:
            self.timefmt = ' [' + self.tag_to_string(tme).split('-')[-1].strip() + ']'

@ -80,10 +82,12 @@ class outlook(BasicNewsRecipe):
            url = a['href']
            title = self.tag_to_string(a)
            desc = ''
-            p = div.find_next_sibling('p', attrs={'class':lambda x: x and 'article-desc' in x.split()})
+            p = div.find_next_sibling(
+                'p', attrs={'class': lambda x: x and 'article-desc' in x.split()}
+            )
            if p:
                desc = self.tag_to_string(p)
-            auth = div.find_next_sibling('p', attrs={'class':'author'})
+            auth = div.find_next_sibling('p', attrs={'class': 'author'})
            if auth:
                desc = self.tag_to_string(auth) + ' | ' + desc
            self.log('\t', title)
@ -91,3 +95,12 @@ class outlook(BasicNewsRecipe):
            self.log('\t\t', url)
            ans.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', ans)]
+
+    def preprocess_html(self, soup):
+        if sub := soup.find(**classes('subcap-story')):
+            sub.name = 'p'
+        for h2 in soup.findAll(['h2', 'h3']):
+            h2.name = 'h4'
+        for img in soup.findAll('img', attrs={'data-src': True}):
+            img['src'] = img['data-src'].split('?')[0] + '?w=600'
+        return soup