from calibre.web.feeds.news import BasicNewsRecipe from calibre import browser from calibre.ptempfile import PersistentTemporaryFile import json import re class Bloomberg(BasicNewsRecipe): title = u'Bloomberg' language = 'en' __author__ = 'unkn0wn' no_stylesheets = True use_embedded_content = False remove_attributes = ['style', 'height', 'width'] ignore_duplicate_articles = {'url', 'title'} resolve_internal_links = True oldest_article = 2 # days delay = 1.5 extra_css = ''' #auth {font-size:small; font-weight:bold;} #time {font-size:small;} #subhead {font-style:italic; color:#404040;} #cat {font-size:small; color:gray;} .news-figure-caption-text, #cap {font-size:small; text-align:center;} .news-figure-credit {font-size:small; text-align:center; color:#202020;} ''' articles_are_obfuscated = True def get_obfuscated_article(self, url): br = self.get_browser() try: br.open(url) except Exception as e: url = e.hdrs.get('location') soup = self.index_to_soup(url) link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.bloomberg.com')}) if '/videos/' in link['href']: self.abort_article('Aborting Video article') self.log('Found link: ', link['href']) html = br.open(link['href']).read() pt = PersistentTemporaryFile('.html') pt.write(html) pt.close() return pt.name def get_browser(self): br = browser() br.set_handle_redirect(False) return br feeds = [ ('Features', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Ffeatures%2F&hl=en-US&gl=US&ceid=US:en'), ('News', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'), ('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fopinion%2F&hl=en-US&gl=US&ceid=US:en'), ('Newsletters', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'), ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en') ] def preprocess_raw_html(self, raw, *a): m = re.search('data-component-props="ArticleBody">', raw) if not m: m = re.search('data-component-props="FeatureBody">', raw) raw = raw[m.start():] raw = raw.split('>', 1)[1] data = json.JSONDecoder().raw_decode(raw)[0] data = data['story'] title = '
' + data['primaryCategory'] + '
' if len(data['abstract']) != 0: if len(data['abstract']) == 2: subhead = '' + data['abstract'][0] + '
' + data['abstract'][1] + '
'.format(data['ledeImageUrl'])
if data['ledeDescription'] is not None:
caption = '' + data['ledeDescription'] + ''
body = data['body']
html = '