This commit is contained in:
Kovid Goyal 2025-06-27 11:00:27 +05:30
commit 32c2ac7fac
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -2,7 +2,6 @@
# vim:fileencoding=utf-8 # vim:fileencoding=utf-8
import json import json
import re import re
from datetime import date
from urllib.parse import quote from urllib.parse import quote
from html5_parser import parse from html5_parser import parse
@ -32,14 +31,14 @@ class ft(BasicNewsRecipe):
.o-topper__visual, #fig, .main-image, .n-content-image { text-align:center; font-size:small; } .o-topper__visual, #fig, .main-image, .n-content-image { text-align:center; font-size:small; }
blockquote, i { color:#5c5c5c; } blockquote, i { color:#5c5c5c; }
.o-topper__standfirst { font-style:italic; color:#202020; } .o-topper__standfirst { font-style:italic; color:#202020; }
.o-topper__topic { font-size:small; color:#5c5c5c; } .o-topper__topic, .article-info__time-byline-content { font-size:small; color:#5c5c5c; }
''' '''
recipe_specific_options = { recipe_specific_options = {
'days': { 'days': {
'short': 'Oldest article to download from this news source. In days ', 'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours', 'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article) 'default': str(oldest_article),
} }
} }
@ -51,16 +50,19 @@ class ft(BasicNewsRecipe):
keep_only_tags = [ keep_only_tags = [
classes( classes(
'body_json o-topper__topic o-topper__headline o-topper__standfirst o-topper__visual article-info__time-byline main-image' 'body_json o-topper__topic o-topper__headline o-topper__standfirst '
'article-info__time-byline-content o-topper__visual main-image'
), ),
dict(name='article', attrs={'id':'article-body'}) dict(name='article', attrs={'id': 'article-body'}),
] ]
remove_tags = [ remove_tags = [
dict(name=['source', 'svg', 'button', 'aside']), dict(name=['source', 'svg', 'button', 'aside']),
dict(name='aside', attrs={'class':'n-content-recommended--single-story'}), dict(name='aside', attrs={'class': 'n-content-recommended--single-story'}),
dict(attrs={'data-layout-name':'card'}), dict(attrs={'data-layout-name': 'card'}),
classes('in-article-advert flourish-disclaimer') classes(
'in-article-advert flourish-disclaimer n-myft-ui__preferences-modal n-myft-ui n-myft-ui--follow'
),
] ]
def get_cover_url(self): def get_cover_url(self):
@ -108,7 +110,7 @@ class ft(BasicNewsRecipe):
('Climate', 'https://www.ft.com/climate-capital?format=rss'), ('Climate', 'https://www.ft.com/climate-capital?format=rss'),
('Life & Arts', 'https://www.ft.com/life-arts?format=rss'), ('Life & Arts', 'https://www.ft.com/life-arts?format=rss'),
('How to spend it', 'https://www.ft.com/htsi?format=rss'), ('How to spend it', 'https://www.ft.com/htsi?format=rss'),
('Others', 'https://www.ft.com/rss/home/uk') ('Others', 'https://www.ft.com/rss/home/uk'),
] ]
def preprocess_raw_html(self, raw, *a): def preprocess_raw_html(self, raw, *a):
@ -120,7 +122,7 @@ class ft(BasicNewsRecipe):
return raw return raw
self.log('**no article content') self.log('**no article content')
m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw) m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
raw = raw[m.start():] raw = raw[m.start() :]
raw = raw.split('>', 1)[1] raw = raw.split('>', 1)[1]
# with open('/t/raw.json', 'w') as f: # with open('/t/raw.json', 'w') as f:
# f.write(raw) # f.write(raw)
@ -159,18 +161,33 @@ class ft(BasicNewsRecipe):
body = re.sub(r'\[https://\S+?\]', insert_image, body) body = re.sub(r'\[https://\S+?\]', insert_image, body)
if data.get('description'): if data.get('description'):
desc = '<h2>' + data['description'] + '</h2>' desc = '<h2>' + data['description'] + '</h2>'
html = '<html><body><div class="body_json"><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body html = (
'<html><body><div class="body_json"><h1>'
+ title
+ '</h1>'
+ desc
+ '<h3>'
+ author
+ '</h3>'
+ image
+ '<p>'
+ body
)
return html return html
def preprocess_html(self, soup):
    """Tidy the parsed article markup and return the same soup object.

    Steps, in order:

    * the first element selected by ``classes('o-topper__standfirst')``
      is renamed to ``<p>``;
    * inside every element selected by
      ``classes('article-info__time-byline-content')`` the first ``<p>``
      is renamed to ``<div>``;
    * any table whose ``<tbody>`` holds more than 20 rows has that body
      removed and a placeholder notice put in its place;
    * every ``n-content-layout__slot`` element that contains a
      ``<figure>`` gets ``id="fig"``.

    :param soup: parsed article document (modified in place).
    :return: the same ``soup`` object.
    """
    standfirst = soup.find(**classes('o-topper__standfirst'))
    if standfirst:
        standfirst.name = 'p'

    for byline in soup.findAll(**classes('article-info__time-byline-content')):
        if para := byline.find('p'):
            para.name = 'div'

    for table in soup.findAll('table'):
        tbody = table.find('tbody')
        # A table without a <tbody> makes find() return None; skip it
        # rather than failing on the attribute access below.
        if tbody is None:
            continue
        if len(tbody.findAll('tr')) > 20:
            tbody.decompose()
            table.string = '** a table that was supposed to be here has been removed.'

    for slot in soup.findAll(attrs={'class': 'n-content-layout__slot'}):
        if slot.find('figure'):
            slot['id'] = 'fig'

    return soup