From fe60dd936f6afbde8e38f37fa74e7bf895c2e78f Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 27 Jan 2024 12:55:10 +0530 Subject: [PATCH] Update Business Today --- recipes/business_today.recipe | 46 ++++++++++++++++++++++++++++------- recipes/toiprint.recipe | 33 ++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/recipes/business_today.recipe b/recipes/business_today.recipe index ca6077648b..44c5705ba3 100644 --- a/recipes/business_today.recipe +++ b/recipes/business_today.recipe @@ -7,6 +7,8 @@ class BT(BasicNewsRecipe): __author__ = 'unkn0wn' no_stylesheets = True use_embedded_content = False + remove_javascript = True + encoding = 'utf-8' remove_attributes = ['style', 'height', 'width'] ignore_duplicate_articles = {'url'} description = ( @@ -16,24 +18,36 @@ class BT(BasicNewsRecipe): masthead_url = 'https://akm-img-a-in.tosshub.com/businesstoday/resource/img/logo.png' keep_only_tags = [ - classes('story-heading sab-head-tranlate-sec user-detial-left main-img field--name-body'), + classes('story-heading sab-head-tranlate-sec brand-detial-main main-img field--name-body'), ] + remove_tags = [ + dict(name=['link', 'meta', 'svg', 'button', 'script']), dict(name='a', attrs={'title': 'videos'}), - classes('tranding-topics-main newsltter-iframe hedlineteg') + classes( + 'tranding-topics-main newsltter-iframe hedlineteg stoybday-ad story-recommended-chunk ' + 'banner_content' + ) ] - extra_css = 'a[href^="https://www.businesstoday.in/videos"]{display: none;}' + extra_css = ''' + img {display:block; margin:0 auto;} + em { color:#202020; } + .main-img { font-size:small; text-align:center; } + .summary {font-style:italic; color:#202020; } + ''' def parse_index(self): - soup = self.index_to_soup('https://www.businesstoday.in/magazine') - issue = soup.find(attrs={'class': 'swiper-wrapper'}) - a = issue.findAll('a', href=lambda x: x and '/magazine/issue/' in x)[1] + self.log( + '\n***\nif this recipe fails, report it on: ' + 'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n' + ) + soup = self.index_to_soup('https://www.businesstoday.in') + a = soup.findAll('a', attrs={'class':'mag_sld_img'})[1] + self.cover_url = a.img['data-src'].split('?')[0] url = a['href'] self.log('issue =', url) + self.timefmt = ' [' + url.split('/')[-1] + ']' soup = self.index_to_soup(url) - tag = soup.find(attrs={'class': 'issue-image'}) - if tag: - self.cover_url = tag.find('img')['src'] section = None sections = {} @@ -78,6 +92,20 @@ class BT(BasicNewsRecipe): return feeds def preprocess_html(self, soup): + auth = soup.find(**classes('brand-detial-main')) + if auth: + ul = auth.find('ul') + if ul: + ul.decompose() + for vid in soup.findAll('a', attrs={ + 'href': lambda x: x and 'businesstoday.in/videos' in x + }): + vid.decompose() + summ = soup.find(**classes('summary')) + if summ: + h2 = summ.find('h2') + if h2: + h2.name = 'p' for img in soup.findAll('img', attrs={'data-src': True}): img['src'] = img['data-src'].split('?')[0] return soup diff --git a/recipes/toiprint.recipe b/recipes/toiprint.recipe index d901a85c64..3f9f503501 100644 --- a/recipes/toiprint.recipe +++ b/recipes/toiprint.recipe @@ -23,6 +23,20 @@ date_ = dt.strftime('%d_%m_%Y') index = 'https://asset.harnscloud.com/PublicationData/TOI/' + le + '/' + date0 img_index = 'https://cmsimages.timesgroup.com/image-resizer?epaper_s3_path=PublicationData/TOI/' + le + '/' + date0 +def handle_images(x, soup): + img = soup.find('img') + if img: + img_div = img.findParent('div') + cap = img_div.next_sibling + if cap and cap.has_attr('class') and 'cap' in cap['class']: + x.insert_after(img_div) + img_div.insert_after(cap) + else: + x.insert_after(img_div) + lead = soup.find('div', attrs={'class':'lead'}) + if lead: + x.insert_after(lead) + class toiprint(BasicNewsRecipe): title = 'TOI Print Edition' language = 'en_IN' @@ -43,6 +57,7 @@ class toiprint(BasicNewsRecipe): .cap { text-align:center; font-size:small; } img { display:block; margin:0 auto; } .info { font-size:small; color:#404040; } + .lead { color:#404040; } ''' def get_cover_url(self): @@ -111,7 +126,7 @@ class toiprint(BasicNewsRecipe): elif x['TagName'] == 'Author': body += '

' + x['ZoneText'].replace('
', '') + '

' elif x['TagName'] in 'ArticleBody': - body += x['ZoneText'] + body += '' + x['ZoneText'] + '' elif x['TagName'] in 'Information': body += '

' + x['ZoneText'] + '

' elif x['TagName'] in {'LinkTo', 'LinkFrom'}: @@ -122,12 +137,28 @@ class toiprint(BasicNewsRecipe): + x['ZoneID'] + '.jpg&bucket=andre-toi-out&q=50') elif x['TagName'] == 'ImageCaption': body += '
' + x['ZoneText'] + '

' + elif x['TagName'] == 'Lead': + body += '

' + x['ZoneText'] + '

' elif 'ZoneText' in x: body += '

' + x['ZoneText'] + '

' return '
' \ + body.replace('
', '

').replace('
', '

').replace('<br>', '

').replace('\n', '
') \ + '

' + def preprocess_html(self, soup): + h1 = soup.find('h1') + if h1: + h2 = h1.next_sibling + if h2 and h2.has_attr('class') and 'sub' in h2['class']: + h3 = h2.next_sibling + if h3 and h3.has_attr('class') and 'sub' in h3['class']: + handle_images(h3, soup) + else: + handle_images(h2, soup) + else: + handle_images(h1, soup) + return soup + def print_version(self, url): return index + '/ArticleZoneJson/' + url.split('_')[-3] + '/' + url + '.json'