calibre/recipes/financial_times.recipe
2022-08-31 20:45:21 +05:30

106 lines
4.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
from urllib.parse import quote
from calibre.web.feeds.news import BasicNewsRecipe
class ft(BasicNewsRecipe):
    """Calibre news recipe for the Financial Times RSS feeds.

    Article pages are not used as delivered: ``preprocess_raw_html`` extracts
    the ``NewsArticle`` JSON-LD block embedded in each page and rebuilds a
    minimal HTML document from its headline, description, author(s), lead
    image and body text. ``preprocess_html`` then tags the paragraphs that
    hold embedded images so that ``extra_css`` can style them.
    """

    # Recipe metadata and fetch options; see the BasicNewsRecipe API
    # documentation for the exact semantics of each option.
    title = 'Financial Times'
    language = 'en'
    __author__ = "Kovid Goyal"
    description = 'The Financial Times is one of the worlds leading news organisations, recognised internationally for its authority, integrity and accuracy.'
    oldest_article = 1.15
    max_articles_per_feed = 50
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
    # '#fig-cap' is the id that preprocess_html() assigns to paragraphs
    # containing a <span> (the wrapper insert_image() puts around images).
    extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'

    def get_cover_url(self):
        """Return a URL for today's FT front page image, or None.

        The dated kiosko.net image URL is tried first. If opening it fails,
        the kiosko.net FT index page is scanned for the first ``<img>`` whose
        ``src`` ends in ``750.jpg``. When that also finds nothing, a message
        is logged and ``None`` is returned.
        """
        from datetime import date

        # Read the clock once so the path cannot mix two different dates
        # when the call happens to straddle midnight.
        today = date.today()
        cover = 'http://img.kiosko.net/{}/{}/{}/uk/ft_uk.750.jpg'.format(
            today.year, today.strftime('%m'), today.strftime('%d')
        )
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            # Best-effort fallback: any failure to open the dated image
            # (but not KeyboardInterrupt/SystemExit) leads to scraping the
            # index page instead.
            index = 'https://en.kiosko.net/uk/np/ft_uk.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover

    # (section title, RSS feed URL) pairs.
    feeds = [
        ('World', 'https://www.ft.com/world?format=rss'),
        ('US', 'https://www.ft.com/us?format=rss'),
        ('Companies', 'https://www.ft.com/companies?format=rss'),
        ('Tech', 'https://www.ft.com/technology?format=rss'),
        ('Markets', 'https://www.ft.com/markets?format=rss'),
        ('Climate', 'https://www.ft.com/climate-capital?format=rss'),
        ('Opinion', 'https://www.ft.com/opinion?format=rss'),
        ('Life & Arts', 'https://www.ft.com/life-arts?format=rss'),
        ('How to spend it', 'https://www.ft.com/htsi?format=rss'),
    ]

    def preprocess_raw_html(self, raw, *a):
        """Rebuild an article page from its embedded NewsArticle JSON-LD.

        ``raw`` is the page source as a string; any further positional
        arguments are accepted and ignored. Returns an HTML string made of
        the headline (h1), optional description (h2), author(s) (h3), an
        optional lead image and the article body.

        NOTE(review): when the page contains no NewsArticle JSON-LD block
        the regex search returns None and ``m.start()`` raises
        AttributeError; presumably calibre then drops that article --
        confirm before changing this.
        """
        # with open('/t/raw.html', 'w') as f:
        #     f.write(raw)
        m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
        # Cut the page down so that it starts at the JSON text: drop
        # everything before the match, then everything up to the '>' that
        # closes the <script ...> tag.
        raw = raw[m.start():]
        raw = raw.split('>', 1)[1]
        # with open('/t/raw.json', 'w') as f:
        #     f.write(raw)
        # raw_decode() parses the JSON document at the start of the string
        # and tolerates the trailing remainder of the page.
        data = json.JSONDecoder().raw_decode(raw)[0]

        title = data['headline']
        body = data['articleBody'].replace('\n\n', '<p>')

        author = ''
        if 'author' in data:
            try:
                # Single author: a mapping with a 'name' key.
                author = data['author']['name']
            except TypeError:
                # Otherwise treat it as a sequence of such mappings.
                author = ' and '.join(x['name'] for x in data['author'])

        def resize_img(img):
            """Wrap an image URL in the FT image-service URL used here."""
            prefix = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
            # use width = 200, 300, 400,.. 700...
            suffix = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
            return prefix + quote(img, safe='') + suffix

        image = ''
        if data.get('image'):
            image_url = data['image']['url']
            # Skip the lead image when the body already references it, so
            # it is not shown twice.
            if image_url not in body:
                image = '<p><img src="{}">'.format(resize_img(image_url))

        # embedded image links: the body marks them as [https://...]
        def insert_image(match):
            url = match.group()[1:-1]  # strip the surrounding brackets
            if 'studio' not in url:
                url = resize_img(url)
            return '<span><img src="{}"></span></p><p>'.format(url)

        body = re.sub(r'\[https://\S+?\]', insert_image, body)

        desc = ''
        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'

        return (
            '<html><body><h1>' + title + '</h1>' + desc
            + '<h3>' + author + '</h3>' + image + '<p>' + body
        )

    def preprocess_html(self, soup):
        """Give every <p> that encloses a <span> the id 'fig-cap'.

        The spans are the image wrappers produced by preprocess_raw_html();
        the id is what ``extra_css`` targets. Returns the same soup object.
        """
        for span in soup.findAll('span'):
            p = span.findParent('p')
            if p:
                p['id'] = 'fig-cap'
        return soup