Update Outlook Magazine

fix images and remove some tags
This commit is contained in:
unkn0w7n 2025-08-24 20:14:55 +05:30
parent 7676b281ae
commit 33fc94b96d
2 changed files with 27 additions and 14 deletions

View File

@ -177,7 +177,7 @@ class HBR(BasicNewsRecipe):
div.name = 'blockquote'
for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
sidebar.name = 'blockquote'
for img in soup.findAll(attrs={'srcset': True}):
for img in soup.findAll('img', attrs={'srcset': True}):
split = img['srcset'].split(',')
for x in split:
if '700w' in x:

View File

@ -33,23 +33,25 @@ class outlook(BasicNewsRecipe):
remove_tags = [
dict(name='svg'),
dict(name='a', attrs={'href':lambda x: x and x.startswith('https://www.whatsapp.com/')}),
classes('ads-box info-img-absolute mobile-info-id story-dec-time-mobile sb-also-read ads-box1')
dict(
name='a',
attrs={'href': lambda x: x and x.startswith('https://www.whatsapp.com/')},
),
classes(
'ads-box info-img-absolute mobile-info-id story-dec-time-mobile sb-also-read ads-box1 story-mag-issue-section'
),
]
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (DD-Month-YYYY format)',
'long': 'For example, 10-june-2024'
'long': 'For example, 10-june-2024',
}
}
def get_browser(self):
    '''
    Build the browser used for downloads by delegating to
    BasicNewsRecipe.get_browser with a fixed user agent string and
    verify_ssl_certificates set to False.
    '''
    browser_options = {
        'user_agent': 'common_words/based',
        'verify_ssl_certificates': False,
    }
    return BasicNewsRecipe.get_browser(self, **browser_options)
def parse_index(self):
self.log(
'\n***\nif this recipe fails, report it on: '
'try again and again\n***\nif this recipe fails, report it on: '
'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
)
@ -58,18 +60,18 @@ class outlook(BasicNewsRecipe):
url = 'https://www.outlookindia.com/magazine/' + d
else:
soup = self.index_to_soup('https://www.outlookindia.com/magazine')
a = soup.find('a', attrs={'aria-label':'magazine-cover-image'})
a = soup.find('a', attrs={'aria-label': 'magazine-cover-image'})
url = a['href']
self.log('Downloading issue:', url)
soup = self.index_to_soup(url)
cov = soup.find(attrs={'aria-label':'magazine-cover-image'})
cov = soup.find(attrs={'aria-label': 'magazine-cover-image'})
self.cover_url = cov.img['src'].split('?')[0]
summ = soup.find(attrs={'data-test-id':'magazine-summary'})
summ = soup.find(attrs={'data-test-id': 'magazine-summary'})
if summ:
self.description = self.tag_to_string(summ)
tme = soup.find(attrs={'class':'arr__timeago'})
tme = soup.find(attrs={'class': 'arr__timeago'})
if tme:
self.timefmt = ' [' + self.tag_to_string(tme).split('-')[-1].strip() + ']'
@ -80,10 +82,12 @@ class outlook(BasicNewsRecipe):
url = a['href']
title = self.tag_to_string(a)
desc = ''
p = div.find_next_sibling('p', attrs={'class':lambda x: x and 'article-desc' in x.split()})
p = div.find_next_sibling(
'p', attrs={'class': lambda x: x and 'article-desc' in x.split()}
)
if p:
desc = self.tag_to_string(p)
auth = div.find_next_sibling('p', attrs={'class':'author'})
auth = div.find_next_sibling('p', attrs={'class': 'author'})
if auth:
desc = self.tag_to_string(auth) + ' | ' + desc
self.log('\t', title)
@ -91,3 +95,12 @@ class outlook(BasicNewsRecipe):
self.log('\t\t', url)
ans.append({'title': title, 'url': url, 'description': desc})
return [('Articles', ans)]
def preprocess_html(self, soup):
    '''
    Tidy up the parsed article markup and return the same soup object.

    - The first element matching the 'subcap-story' class is renamed to <p>.
    - Every <h2> and <h3> is renamed to <h4>.
    - Every <img> carrying a data-src attribute gets its src set to that
      URL with any query string dropped and '?w=600' appended.
    '''
    subcaption = soup.find(**classes('subcap-story'))
    if subcaption:
        subcaption.name = 'p'

    for heading in soup.findAll(['h2', 'h3']):
        heading.name = 'h4'

    for image in soup.findAll('img', attrs={'data-src': True}):
        # Keep only the part of the URL before the first '?'.
        base_url = image['data-src'].partition('?')[0]
        image['src'] = base_url + '?w=600'

    return soup