minor recipe updates

This commit is contained in:
Kovid Goyal 2022-11-15 10:40:56 +05:30
parent 423fbbed4a
commit 708155f2d6
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 33 additions and 36 deletions

View File

@ -11,7 +11,7 @@ def absurl(url):
return url return url
local_edition = None local_edition = 'th_hyderabad'
# Chennai is default edition, for other editions use 'th_hyderabad', 'th_bangalore', 'th_delhi', 'th_kolkata' etc # Chennai is default edition, for other editions use 'th_hyderabad', 'th_bangalore', 'th_delhi', 'th_kolkata' etc
@ -23,7 +23,8 @@ class TheHindu(BasicNewsRecipe):
masthead_url = 'https://www.thehindu.com/theme/images/th-online/thehindu-logo.svg' masthead_url = 'https://www.thehindu.com/theme/images/th-online/thehindu-logo.svg'
remove_attributes = ['style', 'height', 'width'] remove_attributes = ['style', 'height', 'width']
extra_css = '.caption{font-size:small; text-align:center;}'\ extra_css = '.caption{font-size:small; text-align:center;}'\
'.author{font-size:small;}' '.author{font-size:small;}'\
'.subhead{font-weight:bold;}'
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
@ -36,27 +37,17 @@ class TheHindu(BasicNewsRecipe):
] ]
def preprocess_html(self, soup): def preprocess_html(self, soup):
for cap in soup.findAll('p', attrs={'class':'caption'}):
cap.name = 'span'
for img in soup.findAll('img', attrs={'data-original':True}): for img in soup.findAll('img', attrs={'data-original':True}):
img['src'] = img['data-original'] img['src'] = img['data-original']
return soup return soup
def get_cover_url(self): def populate_article_metadata(self, article, soup, first):
cover = 'https://img.kiosko.net/' + str( if first and hasattr(self, 'add_toc_thumbnail'):
date.today().year image = soup.find('img')
) + '/' + date.today().strftime('%m') + '/' + date.today( if image is not None:
).strftime('%d') + '/in/hindu.750.jpg' self.add_toc_thumbnail(article, image['src'])
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
except:
index = 'https://en.kiosko.net/in/np/hindu.html'
soup = self.index_to_soup(index)
for image in soup.findAll('img', src=True):
if image['src'].endswith('750.jpg'):
return image['src']
self.log("\nCover unavailable")
cover = None
return cover
def parse_index(self): def parse_index(self):
if local_edition: if local_edition:
@ -69,6 +60,9 @@ class TheHindu(BasicNewsRecipe):
raw = self.index_to_soup(url, raw=True) raw = self.index_to_soup(url, raw=True)
soup = self.index_to_soup(raw) soup = self.index_to_soup(raw)
ans = self.hindu_parse_index(soup) ans = self.hindu_parse_index(soup)
cover = soup.find(attrs={'class':'hindu-ad'})
if cover:
self.cover_url = cover.img['src']
if not ans: if not ans:
raise ValueError( raise ValueError(
'The Hindu Newspaper is not published Today.' 'The Hindu Newspaper is not published Today.'

View File

@ -79,7 +79,7 @@ class MitTechnologyReview(BasicNewsRecipe):
feeds = OrderedDict() feeds = OrderedDict()
classNamePrefixes = [ classNamePrefixes = [
"teaserItem__title", "teaserItem--aside__title" "magazineHero__letter--", "teaserItem__title", "teaserItem--aside__title"
] ]
for div in soup.findAll( for div in soup.findAll(
attrs={ attrs={
@ -92,7 +92,8 @@ class MitTechnologyReview(BasicNewsRecipe):
a = div.find('a', href=True) a = div.find('a', href=True)
title = self.tag_to_string(a).strip() title = self.tag_to_string(a).strip()
href = absurl(a['href']) href = absurl(a['href'])
desc = ''
section_title = 'Letter from the editor'
d = div.findParent( d = div.findParent(
attrs={ attrs={
'class': 'class':
@ -100,24 +101,26 @@ class MitTechnologyReview(BasicNewsRecipe):
startswith(('teaserItem__wrapper', 'teaserItem--aside__wrapper')) startswith(('teaserItem__wrapper', 'teaserItem--aside__wrapper'))
} }
) )
desc = self.tag_to_string( if d:
d.find(
excerpt = d.find(
attrs={
'class':
lambda x: x and x.startswith(
('teaserItem__excerpt', 'teaserItem--aside__excerpt')
)
}
)
if excerpt:
desc = self.tag_to_string(excerpt).strip()
sec = d.find(
attrs={ attrs={
'class': 'class': lambda x: x and x.startswith('teaserItem__eyebrowText')
lambda x: x and x.startswith(
('teaserItem__excerpt', 'teaserItem--aside__excerpt')
)
} }
) )
).strip() if sec:
section_title = self.tag_to_string(sec).replace('Categorized in ',
sec = d.find(
attrs={
'class': lambda x: x and x.startswith('teaserItem__eyebrowText')
}
)
section_title = self.tag_to_string(sec).replace('Categorized in ',
'').strip() '').strip()
if not href or not title: if not href or not title: