import json
import re
from urllib.parse import quote

from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe, classes
from html5_parser import parse


class ft(BasicNewsRecipe):
    title = 'Financial Times'
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = (
        'The Financial Times is one of the world’s leading news organisations, '
        'recognised internationally for its authority, integrity and accuracy.'
    )
    oldest_article = 1.15
    max_articles_per_feed = 50
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
    simultaneous_downloads = 1

    extra_css = '''
        .article-info__time-byline {font-size:small; font-weight:bold; }
        .o-topper__visual, #fig, .main-image, .n-content-image { text-align:center; font-size:small; }
        blockquote, i { color:#5c5c5c; }
        .o-topper__standfirst { font-style:italic; color:#202020; }
        .o-topper__topic { font-size:small; color:#5c5c5c; }
    '''

    keep_only_tags = [
        classes(
            'body_json o-topper__topic o-topper__headline o-topper__standfirst'
            ' o-topper__visual article-info__time-byline main-image'
        ),
        dict(name='article', attrs={'id': 'article-body'}),
    ]

    remove_tags = [
        dict(name=['source', 'svg', 'button', 'aside']),
        dict(name='aside', attrs={'class': 'n-content-recommended--single-story'}),
        dict(attrs={'data-layout-name': 'card'}),
        classes('in-article-advert flourish-disclaimer'),
    ]

    def get_cover_url(self):
        soup = self.index_to_soup('https://www.frontpages.com/financial-times/')
        return 'https://www.frontpages.com' + soup.find('img', attrs={'id': 'giornale-img'})['src']

    # needs_subscription = 'optional'
    #
    # def get_browser(self, *args, **kw):
    #     br = super().get_browser(*args, **kw)
    #     if self.username and self.password:
    #         # ft.com uses a CAPTCHA on its login page so this sadly doesn't work
    #         br.open('https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com')
    #         br.select_form(id='email-form')
    #         br['email'] = self.username
    #         br.submit()
    #         br.select_form(id='login-form')
    #         br['password'] = self.password
    #         br.submit()
    #     return br

    # Download pages pretending to be Googlebot, which gets served full
    # article content: spoof Google's crawler user agent, referer and an IP
    # from its crawl range.
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        br = browser(user_agent='Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
        br.addheaders += [
            ('Referer', 'https://www.google.com/'),
            ('X-Forwarded-For', '66.249.66.1'),
        ]
        return br.open_novisit(*args, **kwargs)

    open = open_novisit

    feeds = [
        ('FirstFT', 'https://www.ft.com/firstft?format=rss'),
        ('Opinion', 'https://www.ft.com/opinion?format=rss'),
        ('World', 'https://www.ft.com/world?format=rss'),
        ('US', 'https://www.ft.com/us?format=rss'),
        ('Companies', 'https://www.ft.com/companies?format=rss'),
        ('Tech', 'https://www.ft.com/technology?format=rss'),
        ('Markets', 'https://www.ft.com/markets?format=rss'),
        ('Climate', 'https://www.ft.com/climate-capital?format=rss'),
        ('Life & Arts', 'https://www.ft.com/life-arts?format=rss'),
        ('How to spend it', 'https://www.ft.com/htsi?format=rss'),
        ('Others', 'https://www.ft.com/rss/home/uk'),
    ]

    def preprocess_raw_html(self, raw, *a):
        # with open('/t/raw.html', 'w') as f:
        #     f.write(raw)
        root = parse(raw)
        if root.xpath('//article[@id="article-body"]'):
            self.log('**has article content')
            return raw
        # No rendered article body; fall back to the JSON-LD NewsArticle
        # object embedded in the page, which carries the full text
        self.log('**no article content')
        m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
        raw = raw[m.start():]
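        # raw now starts at the ld+json <script> tag; drop everything up to
        # and including the tag's closing '>' so the remaining text begins
        # with the JSON object itself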
        raw = raw.split('>', 1)[1]
        # with open('/t/raw.json', 'w') as f:
        #     f.write(raw)
        # raw_decode stops at the end of the JSON object, ignoring the
        # trailing page markup
        data = json.JSONDecoder().raw_decode(raw)[0]
        title = data['headline']
        body = data['articleBody']
        body = body.replace('\n\n', '<p>')
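        # the JSON-LD author field can be a single object or a list of
        # objects; build the byline from either shape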
        author = ''
        if 'author' in data:
            try:
                author = data['author']['name']
            except TypeError:
                author = ' and '.join(x['name'] for x in data['author'])
        image = desc = title_image_url = ''

        def resize_img(img):
            a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
            b = quote(img, safe='')
            c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
            # use width = 200, 300, 400, ... 700
            return a + b + c

        if data.get('image'):
            image_url = data['image']['url']
            # add the lead image only if the body does not already contain it
            if image_url not in body:
                title_image_url = resize_img(image_url)
                image = '<p><img src="{}">'.format(title_image_url)
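        # For illustration (hypothetical URL): resize_img('https://example.com/pic.jpg')
        # returns 'https://www.ft.com/__origami/service/image/v2/images/raw/'
        # 'https%3A%2F%2Fexample.com%2Fpic.jpg?dpr=2&fit=scale-down&quality=medium&source=next&width=400',
        # i.e. the original URL percent-encoded into FT's Origami image service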
        # embedded image links
        def insert_image(m):
            url = m.group()[1:-1]
            if 'studio' not in url:
                url = resize_img(url)
            return '<span><img src="{}"></span></p><p>'.format(url)
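        # embedded images appear in articleBody as bare bracketed URLs of the
        # form [https://...]; replace each with an inline <img> tag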
        body = re.sub(r'\[https://\S+?\]', insert_image, body)
        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'
        html = ('<html><body><div class="body_json"><h1>' + title + '</h1>'
                + desc + '<h3>' + author + '</h3>' + image + '<p>' + body)
        return html

    def preprocess_html(self, soup):
        for table in soup.findAll('table'):
            tbody = table.find('tbody')
            # very long tables are unreadable in an ebook; drop them,
            # leaving a note in their place
            if tbody is not None and len(tbody.findAll('tr')) > 20:
                tbody.decompose()
                table.string = '** a table that was supposed to be here has been removed.'
        for con in soup.findAll(attrs={'class': 'n-content-layout__slot'}):
            # mark figure containers so the #fig rule in extra_css centers them
            if con.find('figure'):
                con['id'] = 'fig'
        return soup
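
# To try out changes to this recipe (assuming it is saved as ft.recipe and
# the calibre command line tools are installed), run it in test mode, which
# downloads only a couple of articles per feed:
#
#   ebook-convert ft.recipe out.epub --test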