import json
import re
from urllib.parse import quote

from calibre.web.feeds.news import BasicNewsRecipe


class ft(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the Financial Times RSS feeds.

    Article pages are not parsed as HTML. Instead the ``NewsArticle`` JSON-LD
    blob embedded in each page is decoded and a minimal HTML document is
    rebuilt from its headline, description, author, image and body fields.
    """

    title = 'Financial Times'
    language = 'en'
    __author__ = "Kovid Goyal"
    description = 'The Financial Times is one of the world’s leading news organisations, recognised internationally for its authority, integrity and accuracy.'
    oldest_article = 1.15
    max_articles_per_feed = 50
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
    # '#fig-cap' is the id that preprocess_html() assigns to caption paragraphs.
    extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'

    def get_cover_url(self):
        """Return the URL of today's front-page image, or None if unavailable.

        The dated kiosko.net image URL is tried first. If opening it fails,
        the kiosko.net index page is scanned for the first ``<img>`` whose
        ``src`` ends with ``750.jpg``.
        """
        from datetime import date

        # Read the clock once so year, month and day cannot disagree when the
        # call happens to straddle midnight.
        today = date.today()
        cover = (
            'http://img.kiosko.net/'
            + str(today.year)
            + today.strftime('/%m/%d')
            + '/uk/ft_uk.750.jpg'
        )
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            # Best-effort fallback: any failure to fetch the dated image
            # (HTTP error, network error, ...) sends us to the index page.
            index = 'https://en.kiosko.net/uk/np/ft_uk.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover

    feeds = [
        ('World', 'https://www.ft.com/world?format=rss'),
        ('US', 'https://www.ft.com/us?format=rss'),
        ('Companies', 'https://www.ft.com/companies?format=rss'),
        ('Tech', 'https://www.ft.com/technology?format=rss'),
        ('Markets', 'https://www.ft.com/markets?format=rss'),
        ('Climate', 'https://www.ft.com/climate-capital?format=rss'),
        ('Opinion', 'https://www.ft.com/opinion?format=rss'),
        ('Life & Arts', 'https://www.ft.com/life-arts?format=rss'),
        ('How to spend it', 'https://www.ft.com/htsi?format=rss'),
    ]

    @staticmethod
    def _resize_img(img):
        """Wrap an image URL in the FT image-service URL, requesting width 400."""
        prefix = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
        # The source URL becomes a single path segment, so every reserved
        # character (including '/') has to be percent-encoded.
        encoded = quote(img, safe='')
        # use width = 200, 300, 400,.. 700...
        params = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
        return prefix + encoded + params

    def preprocess_raw_html(self, raw, *a):
        """Rebuild an article page from its embedded ``NewsArticle`` JSON-LD.

        :param raw: the downloaded page source as text.
        :param a: extra positional arguments; ignored.
        :return: an HTML string with headline, optional description, author,
            optional lead image and the article body.
        :raises ValueError: if no ``NewsArticle`` JSON-LD block is present.

        NOTE(review): the HTML tags inside the string literals of this method
        were missing from the reviewed copy (the ``.format()`` calls had no
        placeholder, so image URLs were dropped). The markup below is
        reconstructed from how the code uses it; confirm against the
        upstream recipe.
        """
        m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
        if m is None:
            # Without this guard the failure was an opaque AttributeError on None.
            raise ValueError('No NewsArticle JSON-LD metadata found in page')

        # Skip past the '>' closing the <script> tag; raw_decode() then parses
        # the leading JSON object and ignores whatever follows it.
        raw = raw[m.start():].split('>', 1)[1]
        data = json.JSONDecoder().raw_decode(raw)[0]

        title = data['headline']
        body = data['articleBody']
        # articleBody is plain text; blank lines separate paragraphs.
        body = body.replace('\n\n', '<p>')

        author = ''
        if 'author' in data:
            try:
                author = data['author']['name']
            except TypeError:
                # 'author' is a list of objects rather than a single object.
                author = ' and '.join(x['name'] for x in data['author'])

        image = desc = ''

        if data.get('image'):
            image_url = data['image']['url']
            # Only add a lead image when the body does not already embed it.
            if image_url not in body:
                image = '<p><img src="{}">'.format(self._resize_img(image_url))

        # embedded image links
        def insert_image(match):
            # Strip the surrounding '[' and ']' from the matched marker.
            url = match.group()[1:-1]
            if 'studio' not in url:
                url = self._resize_img(url)
            # The <span> lets preprocess_html() find the enclosing paragraph
            # and tag it as a caption.
            return '<span><img src="{}"></span></p><p>'.format(url)

        body = re.sub(r'\[https://\S+?\]', insert_image, body)

        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'

        return (
            '<html><body><h1>' + title + '</h1>'
            + desc
            + '<h3>' + author + '</h3>'
            + image
            + '<p>' + body
        )

    def preprocess_html(self, soup):
        """Give every ``<p>`` that contains a ``<span>`` the id ``fig-cap``."""
        for span in soup.findAll('span'):
            p = span.findParent('p')
            if p:
                p['id'] = 'fig-cap'
        return soup