# Recipe recovered from a git patch:
#   commit:  c11113f37bc330f67c222c55c75aa459b66ddb50
#   From:    Kovid Goyal
#   Date:    Tue, 24 May 2022 20:22:38 +0530
#   Subject: [PATCH] Outlook Business Magazine by unkn0wn
#   Adds:    recipes/outlook_business_magazine.recipe (new file, 70 insertions)

# NOTE: json and re were only used by the disabled code path described in
# preprocess_raw_html(); the imports are kept as they appear in the patch.
import json
import re

from calibre.web.feeds.news import BasicNewsRecipe, classes


class outlook(BasicNewsRecipe):
    # Recipe metadata.
    title = 'Outlook Business Magazine'
    __author__ = 'unkn0wn'
    description = (
        'Outlook Business (Monthly) Magazine produces Business, Market, Startup and Leadership'
        ' content that is differentiated to offer a deeper understanding of trends shaping India. Read to hone your leadership skills.'
    )
    language = 'en_IN'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['height', 'width', 'style']
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://imgnew.outlookindia.com/uploadimage/library/free_files/jpg/logo_2022_04_30_092331.jpg'
    resolve_internal_links = True

    # Keep only the story container, then drop sharing/navigation widgets.
    keep_only_tags = [classes('__story_detail')]
    remove_tags = [
        classes(
            'social_sharing_article left_trending left-sticky __tag_links next_prev_stories downarrow uparrow more_from_author_links next prev'
        )
    ]

    def parse_index(self):
        """Build the article list for the issue linked from the home page.

        Fetches the site home page, follows the first link whose href starts
        with ``/magazine/issue/``, sets ``self.cover_url`` from the lead-story
        image when one is present, and collects every ``h3``/``h4`` heading
        with class ``tk-kepler-std-condensed-subhead`` as an article.

        Returns:
            ``[('Articles', articles)]`` where each article is a dict with
            ``title``, ``url`` and ``description`` keys.

        Raises:
            ValueError: if the home page contains no magazine issue link.
        """
        base = 'https://business.outlookindia.com'
        soup = self.index_to_soup(base)
        issue_link = soup.find(
            'a', href=lambda x: x and x.startswith('/magazine/issue/')
        )
        if issue_link is None:
            # Fail with a clear message instead of "'NoneType' object is not
            # subscriptable" when the site layout changes.
            raise ValueError(
                'No /magazine/issue/ link found on ' + base
            )
        url = issue_link['href']
        self.log('Downloading issue:', url)
        soup = self.index_to_soup(base + url)

        # The cover is optional: a missing lead-story image must not abort
        # the whole download.
        cover = soup.find(**classes('listingPage_lead_story'))
        cover_img = None
        if cover is not None:
            cover_img = cover.find('img', attrs={'src': True})
        if cover_img is not None:
            self.cover_url = cover_img['src']
        else:
            self.log('No cover image found for this issue')

        ans = []
        for heading in soup.findAll(
            ['h3', 'h4'], attrs={'class': 'tk-kepler-std-condensed-subhead'}
        ):
            link = heading.find('a', href=lambda x: x)
            if link is None:
                # Heading without a usable link: nothing to download.
                continue
            url = link['href']
            # Site-relative links (like the issue link above) need the host
            # prepended; absolute and protocol-relative URLs are left alone.
            if url.startswith('/') and not url.startswith('//'):
                url = base + url
            title = self.tag_to_string(link)
            desc = ''
            p = heading.find_next_sibling('p')
            if p:
                desc = self.tag_to_string(p)
            self.log('\t', title)
            self.log('\t', desc)
            self.log('\t\t', url)
            ans.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', ans)]

    def preprocess_raw_html(self, raw, *a):
        """Return the downloaded article HTML unchanged.

        Args:
            raw: the raw HTML text of the article page.
            *a: extra positional arguments, ignored.

        Returns:
            ``raw`` as received.
        """
        # NOTE(review): the original body continued after this unconditional
        # return with unreachable code that rebuilt the page from embedded
        # JSON. It located a script tag with re.search(..., flags=re.DOTALL),
        # decoded the JSON that follows it with
        # json.JSONDecoder().raw_decode(), and concatenated data['headline'],
        # data['description'] (optional), the data['author'] names joined with
        # ' and ', data['image']['url'] (optional) and data['articleBody']
        # (with '\r\n' replaced) into an HTML string. The HTML markup inside
        # its string literals was lost in the copy this file was recovered
        # from, so that dead code is described here rather than reproduced.
        return raw