calibre/recipes/financial_times.recipe
2025-06-27 13:24:29 +05:30

197 lines
7.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import re
from urllib.parse import quote
from html5_parser import parse
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe, classes
class ft(BasicNewsRecipe):
    """Calibre news recipe for the Financial Times.

    Articles are discovered through the ft.com RSS feeds listed in ``feeds``.
    When a downloaded page has no ``<article id="article-body">`` element,
    the article is rebuilt from the ``NewsArticle`` JSON-LD metadata embedded
    in the page (see ``preprocess_raw_html``).
    """

    title = 'Financial Times'
    language = 'en_GB'
    __author__ = 'Kovid Goyal, unkn0wn'
    description = 'The Financial Times is one of the worlds leading news organisations, recognised internationally for its authority, integrity and accuracy.'
    oldest_article = 1.15
    max_articles_per_feed = 50
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
    simultaneous_downloads = 1

    # NOTE: the CSS lines are intentionally flush-left so that the string
    # literal stays exactly as it was. ``#fig`` is the id that
    # ``preprocess_html`` assigns to layout slots containing a figure.
    extra_css = '''
.article-info__time-byline {font-size:small; font-weight:bold; }
.o-topper__visual, #fig, .main-image, .n-content-image { text-align:center; font-size:small; }
blockquote, i { color:#5c5c5c; }
.o-topper__standfirst { font-style:italic; color:#202020; }
.o-topper__topic, .article-info__time-byline-content { font-size:small; color:#5c5c5c; }
'''

    # User-tunable option; its default mirrors ``oldest_article`` above.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the recipe and apply the ``days`` option when it is set.

        The class-level ``recipe_specific_options['days']`` is a dict, so the
        override below only fires when the value has been replaced by a
        string (presumably by calibre with the user's setting -- TODO confirm).
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    # Only these elements survive: the topper pieces, the lead image, the
    # real article body, and the ``body_json`` wrapper that
    # ``preprocess_raw_html`` generates for JSON-LD based articles.
    keep_only_tags = [
        classes(
            'body_json o-topper__topic o-topper__headline o-topper__standfirst '
            'article-info__time-byline-content o-topper__visual main-image'
        ),
        dict(name='article', attrs={'id': 'article-body'}),
    ]

    remove_tags = [
        dict(name=['source', 'svg', 'button', 'aside']),
        dict(name='aside', attrs={'class': 'n-content-recommended--single-story'}),
        dict(attrs={'data-layout-name': 'card'}),
        classes(
            'in-article-advert flourish-disclaimer n-myft-ui__preferences-modal n-myft-ui n-myft-ui--follow'
        ),
    ]

    def get_cover_url(self):
        """Return the URL of today's front page image from frontpages.com.

        Returns ``None`` (after logging) when the expected ``<img>`` element
        or its ``src`` attribute is missing, instead of failing with a
        ``TypeError`` on a ``None`` lookup result.
        """
        soup = self.index_to_soup('https://www.frontpages.com/financial-times/')
        img = soup.find('img', attrs={'id': 'giornale-img'})
        if img is None or not img.get('src'):
            self.log('**cover image not found on frontpages.com')
            return None
        return 'https://www.frontpages.com' + img['src']

    # needs_subscription = 'optional'
    #
    # def get_browser(self, *args, **kw):
    #     br = super().get_browser(*args, **kw)
    #     if self.username and self.password:
    #         # ft.com uses a CAPTCHA on its login page so this sadly doesn't work
    #         br.open('https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com')
    #         br.select_form(id='email-form')
    #         br['email'] = self.username
    #         br.submit()
    #         br.select_form(id='login-form')
    #         br['password'] = self.password
    #         br.submit()
    #     return br

    def get_browser(self, *args, **kwargs):
        """Return the recipe itself; it stands in for the browser object.

        The ``open`` / ``open_novisit`` methods below perform the requests.
        """
        return self

    def clone_browser(self, *args, **kwargs):
        """Return the same stand-in browser as ``get_browser``."""
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        """Fetch a URL with a fresh calibre browser using a fixed user agent.

        NOTE(review): the 'outbrain' user agent is presumably what gets
        ft.com to serve article content without a login -- confirm.
        """
        br = browser(user_agent='Mozilla/5.0 (Java) outbrain')
        return br.open_novisit(*args, **kwargs)

    # ``open`` behaves exactly like ``open_novisit``.
    open = open_novisit

    feeds = [
        ('FirstFT', 'https://www.ft.com/firstft?format=rss'),
        ('Opinion', 'https://www.ft.com/opinion?format=rss'),
        ('World', 'https://www.ft.com/world?format=rss'),
        ('US', 'https://www.ft.com/us?format=rss'),
        ('Companies', 'https://www.ft.com/companies?format=rss'),
        ('Tech', 'https://www.ft.com/technology?format=rss'),
        ('Markets', 'https://www.ft.com/markets?format=rss'),
        ('Climate', 'https://www.ft.com/climate-capital?format=rss'),
        ('Life & Arts', 'https://www.ft.com/life-arts?format=rss'),
        ('How to spend it', 'https://www.ft.com/htsi?format=rss'),
        ('Others', 'https://www.ft.com/rss/home/uk'),
    ]

    def preprocess_raw_html(self, raw, *a):
        """Return HTML for the article, rebuilding it from JSON-LD if needed.

        If the page already contains ``<article id="article-body">`` the raw
        HTML is returned untouched. Otherwise the ``NewsArticle`` JSON-LD
        block is decoded and a minimal page is assembled from its headline,
        description, author(s), lead image and article body, wrapped in a
        ``body_json`` div (which ``keep_only_tags`` retains).

        The article is aborted via ``abort_article`` when neither the article
        element nor the JSON-LD block is present.
        """
        # with open('/t/raw.html', 'w') as f:
        #     f.write(raw)
        root = parse(raw)
        if root.xpath('//article[@id="article-body"]'):
            self.log('**has article content')
            return raw

        self.log('**no article content')
        m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
        if m is None:
            # Nothing usable on this page: skip the article with a clear
            # message instead of failing on ``None.start()``.
            # ``abort_article`` raises, so execution stops here.
            self.abort_article('No article body or NewsArticle JSON-LD found')
        # Cut away everything up to and including the '>' that closes the
        # opening <script ...> tag, leaving the JSON text first.
        raw = raw[m.start():]
        raw = raw.split('>', 1)[1]
        # with open('/t/raw.json', 'w') as f:
        #     f.write(raw)
        # raw_decode parses the leading JSON value and ignores what follows.
        data = json.JSONDecoder().raw_decode(raw)[0]

        title = data['headline']
        body = data['articleBody'].replace('\n\n', '<p>')

        author = ''
        if 'author' in data:
            try:
                author = data['author']['name']
            except TypeError:
                # Indexing by 'name' failed, so treat the value as a
                # sequence of author entries.
                author = ' and '.join(x['name'] for x in data['author'])

        def resize_img(img):
            """Wrap an image URL in FT's image service URL (width 400)."""
            a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
            b = quote(img, safe='')
            c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
            # use width = 200, 300, 400,.. 700...
            return a + b + c

        image = desc = ''
        if data.get('image'):
            image_url = data['image']['url']
            # Skip the lead image when the body already references it.
            if image_url not in body:
                image = '<p><img src="{}">'.format(resize_img(image_url))

        # embedded image links
        def insert_image(m):
            """Turn a bracketed ``[https://...]`` link into an <img> tag."""
            url = m.group()[1:-1]  # strip the surrounding [ and ]
            if 'studio' not in url:
                url = resize_img(url)
            return '<span><img src="{}"></span></p><p>'.format(url)

        body = re.sub(r'\[https://\S+?\]', insert_image, body)

        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'

        return ''.join((
            '<html><body><div class="body_json"><h1>',
            title,
            '</h1>',
            desc,
            '<h3>',
            author,
            '</h3>',
            image,
            '<p>',
            body,
        ))

    def preprocess_html(self, soup):
        """Tidy the parsed article markup and return the soup.

        * the standfirst element becomes a ``<p>``;
        * ``<p>`` elements inside the time/byline block become ``<div>``;
        * a space is prepended to the text of ``<time>`` elements;
        * tables whose ``<tbody>`` holds more than 20 rows are replaced by a
          placeholder sentence;
        * layout slots containing a ``<figure>`` get ``id="fig"`` so the
          ``#fig`` rule in ``extra_css`` applies to them.
        """
        standfirst = soup.find(**classes('o-topper__standfirst'))
        if standfirst:
            standfirst.name = 'p'

        for byline in soup.findAll(**classes('article-info__time-byline-content')):
            for para in byline.findAll('p'):
                para.name = 'div'

        for tim in soup.findAll('time'):
            if tim.string:
                tim.string = ' ' + tim.string

        for table in soup.findAll('table'):
            # A table may have no <tbody> at all, so look it up once and
            # guard against None before counting rows.
            tbody = table.find('tbody')
            if tbody is not None and len(tbody.findAll('tr')) > 20:
                tbody.decompose()
                table.string = '** a table that was supposed to be here has been removed.'

        for con in soup.findAll(attrs={'class': 'n-content-layout__slot'}):
            if con.find('figure'):
                con['id'] = 'fig'

        return soup