From 7d37665a8372445ffc39534b6eebf0c8faa5580e Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Jul 2023 23:59:05 +0530 Subject: [PATCH 1/3] Update bloomberg-business-week.recipe --- recipes/bloomberg-business-week.recipe | 58 ++++++++++++++++++++------ 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index 941604dfb0..8b241b1dee 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -1,5 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre import browser +from html5_parser import parse import json import re @@ -19,7 +20,7 @@ class Bloomberg(BasicNewsRecipe): #auth {font-size:small; font-weight:bold;} #time, .chart {font-size:small;} #subhead {font-style:italic; color:#404040;} - .news-figure-caption-text, #cap {font-size:small; text-align:center;} + .news-figure-caption-text, #cap, #img {font-size:small; text-align:center;} .news-figure-credit {font-size:small; text-align:center; color:#202020;} ''' @@ -70,17 +71,22 @@ class Bloomberg(BasicNewsRecipe): return feeds def preprocess_raw_html(self, raw, *a): - m = re.search('data-component-props="ArticleBody">', raw) + root = parse(raw) + m = root.xpath('//script[@data-component-props="ArticleBody"]') if not m: - m = re.search('data-component-props="FeatureBody">', raw) + m = root.xpath('//script[@data-component-props="FeatureBody"]') if not m: - return raw - - raw = raw[m.start():] - raw = raw.split('>', 1)[1] - data = json.JSONDecoder().raw_decode(raw)[0] - data = data['story'] + m2 = root.xpath('//script[@id="__NEXT_DATA__"]') + if not m2: + return raw + if m: + data = json.loads(m[0].text) + data = data['story'] + else: + data = json.loads(m2[0].text) + data = data['props']['pageProps']['story'] + title = '

' + data['headline'] + '

' cat = subhead = lede = auth = caption = '' @@ -104,11 +110,39 @@ class Bloomberg(BasicNewsRecipe): if 'ledeImageUrl' in data: if data['ledeImageUrl'] is not None: lede = '

'.format(data['ledeImageUrl']) + + if 'ledeDescription' in data: + if data['ledeDescription'] is not None: + caption = '' + data['ledeDescription'] + '' + else: + if 'lede' in data: + if data['lede'] is not None: + if 'alt' in data['lede']: + if data['lede']['alt'] is not None: + caption = '' + data['lede']['alt'] + '' - if data['ledeDescription'] is not None: - caption = '' + data['ledeDescription'] + '' + if m: + body = data['body'] + else: + body = '' + body_data = data['body']['content'] + for objects in body_data: + if objects['type'] == 'media': + if objects['subType'] == 'photo': + body += '

'.format(objects['data']['photo']['src']) + body += '' + objects['data']['photo']['caption'] + '

' + if objects['type'] == 'paragraph' or 'heading': + body += '

' + if not 'content' in objects: + continue + for content in objects['content']: + if 'value' in content: + body += content['value'] + elif 'content' in content: + for val_cont in content['content']: + if 'value' in val_cont: + body += val_cont['value'] - body = data['body'] html = '' + cat + title + subhead + auth + lede + caption + '

' + body return html From 65fd5b915a31a10eae3cc4a7a6ef128ea65a5d3c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Jul 2023 23:59:13 +0530 Subject: [PATCH 2/3] Update bloomberg.recipe --- recipes/bloomberg.recipe | 63 ++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe index 8ac2aca586..d24e2f9696 100644 --- a/recipes/bloomberg.recipe +++ b/recipes/bloomberg.recipe @@ -1,5 +1,6 @@ from calibre.web.feeds.news import BasicNewsRecipe from calibre import browser +from html5_parser import parse from calibre.ptempfile import PersistentTemporaryFile import json import re @@ -21,7 +22,7 @@ class Bloomberg(BasicNewsRecipe): #time, .chart {font-size:small;} #subhead {font-style:italic; color:#404040;} #cat {font-size:small; color:gray;} - .news-figure-caption-text, #cap {font-size:small; text-align:center;} + .news-figure-caption-text, #cap, #img {font-size:small; text-align:center;} .news-figure-credit {font-size:small; text-align:center; color:#202020;} ''' @@ -35,7 +36,10 @@ class Bloomberg(BasicNewsRecipe): url = e.hdrs.get('location') soup = self.index_to_soup(url) link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.bloomberg.com')}) - if '/videos/' in link['href']: + skip_sections =[ # add sections you want to skip + '/video/', '/videos/', '/media/', 'podcast' + ] + if any(x in link['href'] for x in skip_sections): self.abort_article('Aborting Video article') self.log('Found link: ', link['href']) html = br.open(link['href']).read() @@ -58,18 +62,25 @@ class Bloomberg(BasicNewsRecipe): 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'), ('News', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'), - ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en') + ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2F&hl=en-US&gl=US&ceid=US:en') ] def preprocess_raw_html(self, raw, *a): - m = re.search('data-component-props="ArticleBody">', raw) + root = parse(raw) + m = root.xpath('//script[@data-component-props="ArticleBody"]') if not m: - m = re.search('data-component-props="FeatureBody">', raw) + m = root.xpath('//script[@data-component-props="FeatureBody"]') + if not m: + m2 = root.xpath('//script[@id="__NEXT_DATA__"]') + if not m2: + return raw + if m: + data = json.loads(m[0].text) + data = data['story'] - raw = raw[m.start():] - raw = raw.split('>', 1)[1] - data = json.JSONDecoder().raw_decode(raw)[0] - data = data['story'] + else: + data = json.loads(m2[0].text) + data = data['props']['pageProps']['story'] title = '

' + data['headline'] + '

' @@ -95,10 +106,38 @@ class Bloomberg(BasicNewsRecipe): if data['ledeImageUrl'] is not None: lede = '

'.format(data['ledeImageUrl']) - if data['ledeDescription'] is not None: - caption = '' + data['ledeDescription'] + '' + if 'ledeDescription' in data: + if data['ledeDescription'] is not None: + caption = '' + data['ledeDescription'] + '' + else: + if 'lede' in data: + if data['lede'] is not None: + if 'alt' in data['lede']: + if data['lede']['alt'] is not None: + caption = '' + data['lede']['alt'] + '' + + if m: + body = data['body'] + else: + body = '' + body_data = data['body']['content'] + for objects in body_data: + if objects['type'] == 'media': + if objects['subType'] == 'photo': + body += '

'.format(objects['data']['photo']['src']) + body += '' + objects['data']['photo']['caption'] + '

' + if objects['type'] == 'paragraph' or 'heading': # lists are missed :( + body += '

' + if not 'content' in objects: + continue + for content in objects['content']: + if 'value' in content: + body += content['value'] + elif 'content' in content: + for val_cont in content['content']: + if 'value' in val_cont: + body += val_cont['value'] - body = data['body'] html = '' + cat + title + subhead + auth + lede + caption + '

' + body return html From 2ea7c1d6c3b3043a6eb1ef2d286871eb0f6ff40c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 16 Jul 2023 23:59:17 +0530 Subject: [PATCH 3/3] Update livemint.recipe --- recipes/livemint.recipe | 3 +++ 1 file changed, 3 insertions(+) diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe index 75a6a5e514..3418fc249a 100644 --- a/recipes/livemint.recipe +++ b/recipes/livemint.recipe @@ -138,6 +138,9 @@ class LiveMint(BasicNewsRecipe): return raw def preprocess_html(self, soup): + for embed in soup.findAll('div', attrs={'class':'embed'}): + if nos := embed.find('noscript'): + nos.name = 'span' for span in soup.findAll('figcaption'): span['id'] = 'img-cap' for auth in soup.findAll('span', attrs={'class':lambda x: x and 'articleInfo' in x.split()}):