calibre/recipes/financial_times.recipe
unkn0w7n 293a538423 ...
2024-05-02 12:54:10 +05:30

160 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
from urllib.parse import quote
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe, classes
from html5_parser import parse
class ft(BasicNewsRecipe):
title = 'Financial Times'
language = 'en'
__author__ = "Kovid Goyal"
description = 'The Financial Times is one of the worlds leading news organisations, recognised internationally for its authority, integrity and accuracy.'
oldest_article = 1.15
max_articles_per_feed = 50
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
ignore_duplicate_articles = {'url'}
resolve_internal_links = True
remove_attributes = ['style', 'width', 'height']
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
simultaneous_downloads = 1
extra_css = '''
.article-info__time-byline {font-size:small; font-weight:bold; }
.o-topper__visual, #fig, .main-image, .n-content-image { text-align:center; font-size:small; }
blockquote, i { color:#5c5c5c; }
.o-topper__standfirst { font-style:italic; color:#202020; }
.o-topper__topic { font-size:small; color:#5c5c5c; }
'''
keep_only_tags = [
classes(
'body_json o-topper__topic o-topper__headline o-topper__standfirst o-topper__visual article-info__time-byline main-image'
),
dict(name='article', attrs={'id':'article-body'})
]
remove_tags = [
dict(name=['source', 'svg', 'button', 'aside']),
dict(name='aside', attrs={'class':'n-content-recommended--single-story'}),
dict(attrs={'data-layout-name':'card'}),
classes('in-article-advert flourish-disclaimer')
]
def get_cover_url(self):
soup = self.index_to_soup('https://www.frontpages.com/financial-times/')
return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
# needs_subscription = 'optional'
#
# def get_browser(self, *args, **kw):
# br = super().get_browser(*args, **kw)
# if self.username and self.password:
# # ft.com uses a CAPTCHA on its login page so this sadly doesnt work
# br.open('https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com')
# br.select_form(id='email-form')
# br['email'] = self.username
# br.submit()
# br.select_form(id='login-form')
# br['password'] = self.password
# br.submit()
# return br
def get_browser(self, *args, **kwargs):
return self
def clone_browser(self, *args, **kwargs):
return self.get_browser()
def open_novisit(self, *args, **kwargs):
br = browser(user_agent='Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
br.addheaders += [
('Referer', 'https://www.google.com/'),
('X-Forwarded-For', '66.249.66.1')
]
return br.open_novisit(*args, **kwargs)
open = open_novisit
feeds = [
('FirstFT', 'https://www.ft.com/firstft?format=rss'),
('Opinion', 'https://www.ft.com/opinion?format=rss'),
('World', 'https://www.ft.com/world?format=rss'),
('US', 'https://www.ft.com/us?format=rss'),
('Companies', 'https://www.ft.com/companies?format=rss'),
('Tech', 'https://www.ft.com/technology?format=rss'),
('Markets', 'https://www.ft.com/markets?format=rss'),
('Climate', 'https://www.ft.com/climate-capital?format=rss'),
('Life & Arts', 'https://www.ft.com/life-arts?format=rss'),
('How to spend it', 'https://www.ft.com/htsi?format=rss'),
('Others', 'https://www.ft.com/rss/home/uk')
]
def preprocess_raw_html(self, raw, *a):
# with open('/t/raw.html', 'w') as f:
# f.write(raw)
root = parse(raw)
if root.xpath('//article[@id="article-body"]'):
self.log('**has article content')
return raw
self.log('**no article content')
m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
raw = raw[m.start():]
raw = raw.split('>', 1)[1]
# with open('/t/raw.json', 'w') as f:
# f.write(raw)
data = json.JSONDecoder().raw_decode(raw)[0]
title = data['headline']
body = data['articleBody']
body = body.replace('\n\n', '<p>')
author = ''
if 'author' in data:
try:
author = data['author']['name']
except TypeError:
author = ' and '.join(x['name'] for x in data['author'])
image = desc = title_image_url = ''
def resize_img(img):
a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
b = quote(img, safe='')
c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
# use width = 200, 300, 400,.. 700...
return a + b + c
if data.get('image'):
image_url = data['image']['url']
if body.__contains__(image_url) is False:
title_image_url = resize_img(image_url)
image = '<p><img src="{}">'.format(title_image_url)
# embedded image links
def insert_image(m):
url = m.group()[1:-1]
if url.__contains__('studio') is False:
url = resize_img(url)
return '<span><img src="{}"></span></p><p>'.format(url)
body = re.sub(r'\[https://\S+?\]', insert_image, body)
if data.get('description'):
desc = '<h2>' + data['description'] + '</h2>'
html = '<html><body><div class="body_json"><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
return html
def preprocess_html(self, soup):
p = soup.find(**classes('o-topper__standfirst'))
if p:
p.name = 'p'
for table in soup.findAll('table'):
if len(table.find('tbody').findAll('tr')) > 20:
table.find('tbody').decompose()
table.string = '** a table that was supposed to be here has been removed.'
for con in soup.findAll(attrs={'class':'n-content-layout__slot'}):
if con.find('figure'):
con['id'] = 'fig'
return soup