From 708155f2d69222a97f6156fabeae3d7fc7502770 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Nov 2022 10:40:56 +0530 Subject: [PATCH] minor recipe updates --- recipes/hindu.recipe | 32 ++++++++++-------------- recipes/mit_technology_review.recipe | 37 +++++++++++++++------------- 2 files changed, 33 insertions(+), 36 deletions(-) diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe index c33e3f6e6c..81ea342c1d 100644 --- a/recipes/hindu.recipe +++ b/recipes/hindu.recipe @@ -11,7 +11,7 @@ def absurl(url): return url -local_edition = None +local_edition = 'th_hyderabad' # Chennai is default edition, for other editions use 'th_hyderabad', 'th_bangalore', 'th_delhi', 'th_kolkata' etc @@ -23,7 +23,8 @@ class TheHindu(BasicNewsRecipe): masthead_url = 'https://www.thehindu.com/theme/images/th-online/thehindu-logo.svg' remove_attributes = ['style', 'height', 'width'] extra_css = '.caption{font-size:small; text-align:center;}'\ - '.author{font-size:small;}' + '.author{font-size:small;}'\ + '.subhead{font-weight:bold;}' ignore_duplicate_articles = {'url'} @@ -36,27 +37,17 @@ class TheHindu(BasicNewsRecipe): ] def preprocess_html(self, soup): + for cap in soup.findAll('p', attrs={'class':'caption'}): + cap.name = 'span' for img in soup.findAll('img', attrs={'data-original':True}): img['src'] = img['data-original'] return soup - def get_cover_url(self): - cover = 'https://img.kiosko.net/' + str( - date.today().year - ) + '/' + date.today().strftime('%m') + '/' + date.today( - ).strftime('%d') + '/in/hindu.750.jpg' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - index = 'https://en.kiosko.net/in/np/hindu.html' - soup = self.index_to_soup(index) - for image in soup.findAll('img', src=True): - if image['src'].endswith('750.jpg'): - return image['src'] - self.log("\nCover unavailable") - cover = None - return cover + def populate_article_metadata(self, article, soup, first): + if first and hasattr(self, 'add_toc_thumbnail'): + image = soup.find('img') + if image is not None: + self.add_toc_thumbnail(article, image['src']) def parse_index(self): if local_edition: @@ -69,6 +60,9 @@ class TheHindu(BasicNewsRecipe): raw = self.index_to_soup(url, raw=True) soup = self.index_to_soup(raw) ans = self.hindu_parse_index(soup) + cover = soup.find(attrs={'class':'hindu-ad'}) + if cover: + self.cover_url = cover.img['src'] if not ans: raise ValueError( 'The Hindu Newspaper is not published Today.' diff --git a/recipes/mit_technology_review.recipe b/recipes/mit_technology_review.recipe index e749850fad..70fedadf41 100644 --- a/recipes/mit_technology_review.recipe +++ b/recipes/mit_technology_review.recipe @@ -79,7 +79,7 @@ class MitTechnologyReview(BasicNewsRecipe): feeds = OrderedDict() classNamePrefixes = [ - "teaserItem__title", "teaserItem--aside__title" + "magazineHero__letter--", "teaserItem__title", "teaserItem--aside__title" ] for div in soup.findAll( attrs={ @@ -92,7 +92,8 @@ class MitTechnologyReview(BasicNewsRecipe): a = div.find('a', href=True) title = self.tag_to_string(a).strip() href = absurl(a['href']) - + desc = '' + section_title = 'Letter from the editor' d = div.findParent( attrs={ 'class': @@ -100,24 +101,26 @@ class MitTechnologyReview(BasicNewsRecipe): startswith(('teaserItem__wrapper', 'teaserItem--aside__wrapper')) } ) - desc = self.tag_to_string( - d.find( + if d: + + excerpt = d.find( + attrs={ + 'class': + lambda x: x and x.startswith( + ('teaserItem__excerpt', 'teaserItem--aside__excerpt') + ) + } + ) + if excerpt: + desc = self.tag_to_string(excerpt).strip() + + sec = d.find( attrs={ - 'class': - lambda x: x and x.startswith( - ('teaserItem__excerpt', 'teaserItem--aside__excerpt') - ) + 'class': lambda x: x and x.startswith('teaserItem__eyebrowText') } ) - ).strip() - - sec = d.find( - attrs={ - 'class': lambda x: x and x.startswith('teaserItem__eyebrowText') - } - ) - - section_title = self.tag_to_string(sec).replace('Categorized in ', + if sec: + section_title = self.tag_to_string(sec).replace('Categorized in ', '').strip() if not href or not title: