delete FT Print Edition

This commit is contained in:
unkn0w7n 2024-01-09 19:23:56 +05:30
parent 3a4d42304f
commit eb4d2a590a
2 changed files with 0 additions and 169 deletions

View File

@ -1,169 +0,0 @@
import json
import re
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe, classes
from urllib.parse import quote
class ft(BasicNewsRecipe):
    '''
    Calibre recipe for the UK print edition of the Financial Times.

    Sections and article URLs are scraped from the "today's newspaper"
    index page; article bodies are extracted from the embedded
    application/ld+json NewsArticle payload rather than the rendered HTML.
    '''

    title = 'Financial Times - Print Edition'
    language = 'en'
    __author__ = "Kovid Goyal"
    description = 'The Financial Times is one of the worlds leading news organisations, recognised internationally for its authority, integrity and accuracy.'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
    extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'

    # Login support is deliberately disabled: ft.com's login form is behind
    # a CAPTCHA, so the scripted flow below cannot complete.
    # needs_subscription = 'optional'
    #
    # def get_browser(self, *args, **kw):
    #     br = super().get_browser(*args, **kw)
    #     if self.username and self.password:
    #         # ft.com uses a CAPTCHA on its login page so this sadly doesnt work
    #         br.open('https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com')
    #         br.select_form(id='email-form')
    #         br['email'] = self.username
    #         br.submit()
    #         br.select_form(id='login-form')
    #         br['password'] = self.password
    #         br.submit()
    #     return br

    def get_browser(self, *args, **kw):
        # Present a Google search referrer: ft.com serves full articles to
        # visitors arriving from search engines.
        br = super().get_browser(*args, **kw)
        br.set_current_header('Referer', 'https://www.google.com/')
        return br

    def get_cover_url(self):
        '''
        Return the URL of today's UK front page image from kiosko.net,
        falling back to scraping the kiosko index page; None if unavailable.
        '''
        today = date.today()
        cover = 'http://img.kiosko.net/{}/{:%m}/{:%d}/uk/ft_uk.750.jpg'.format(
            today.year, today, today
        )
        # kiosko.net's certificate chain is not always valid, hence
        # verify_ssl_certificates=False.
        br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
        try:
            br.open(cover)
        except Exception:
            # Direct URL failed (cover not yet published, or layout change);
            # scrape the index page for any 750px cover image instead.
            index = 'https://en.kiosko.net/uk/np/ft_uk.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover

    def parse_index(self):
        soup = self.index_to_soup('https://www.ft.com/todaysnewspaper/uk')
        # International edition: https://www.ft.com/todaysnewspaper/international
        ans = self.ft_parse_index(soup)
        if not ans:
            # An empty index almost always means no paper today; give the
            # user a precise reason rather than a generic failure.
            is_sunday = date.today().weekday() == 6
            if is_sunday:
                raise ValueError(
                    'The Financial Times Newspaper is not published on Sundays.'
                )
            else:
                raise ValueError(
                    'The Financial Times Newspaper is not published today.'
                )
        return ans

    def ft_parse_index(self, soup):
        '''
        Extract (section name, articles) pairs from the today's-newspaper
        index soup. Each article dict has title, url and description keys.
        '''
        feeds = []
        for section in soup.findAll(**classes('o-teaser-collection')):
            h2 = section.find('h2')
            secname = self.tag_to_string(h2)
            self.log(secname)
            articles = []
            for a in section.findAll(
                'a', href=True, **classes('js-teaser-heading-link')
            ):
                url = a['href']
                url = 'https://www.ft.com' + url
                title = self.tag_to_string(a)
                desc = ''
                # The teaser tag (and optional prefix) live in a sibling
                # meta <div> preceding the heading link's parent.
                desc_parent = a.findParent('div')
                div = desc_parent.find_previous_sibling(
                    'div', **classes('o-teaser__meta')
                )
                if div is not None:
                    desc = div.find('a', **classes('o-teaser__tag'))
                    desc = self.tag_to_string(desc)
                    prefix = div.find('span', **classes('o-teaser__tag-prefix'))
                    if prefix is not None:
                        prefix = self.tag_to_string(prefix)
                        desc = prefix + ' ' + desc
                articles.append({
                    'title': title,
                    'url': url,
                    'description': desc
                })
                self.log('\t', desc)
                self.log('\t', title)
                self.log('\t\t', url)
            if articles:
                feeds.append((secname, articles))
        return feeds

    def preprocess_raw_html(self, raw, *a):
        '''
        Build the article HTML from the embedded ld+json NewsArticle object
        instead of the (paywalled/cluttered) rendered page.
        '''
        m = re.search(
            r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw
        )
        raw = raw[m.start():]
        raw = raw.split('>', 1)[1]
        # raw_decode tolerates trailing HTML after the JSON object.
        data = json.JSONDecoder().raw_decode(raw)[0]
        title = data['headline']
        body = data['articleBody']
        body = body.replace('\n\n', '<p>')
        author = ''
        if 'author' in data:
            try:
                # Single author: a dict with a 'name' key.
                author = data['author']['name']
            except TypeError:
                # Multiple authors: a list of such dicts.
                author = ' and '.join(x['name'] for x in data['author'])
        image = desc = title_image_url = ''

        def resize_img(img):
            # Route the image through FT's origami resizing service.
            a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
            b = quote(img, safe='')
            c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
            # use width = 200, 300, 400,.. 700...
            return a + b + c

        if data.get('image'):
            image_url = data['image']['url']
            # Only add the lead image if the body doesn't already embed it.
            if image_url not in body:
                title_image_url = resize_img(image_url)
                image = '<p><img src="{}">'.format(title_image_url)

        # embedded image links
        def insert_image(m):
            url = m.group()[1:-1]
            # 'studio' URLs are interactive embeds; leave them unresized.
            if 'studio' not in url:
                url = resize_img(url)
            return '<span><img src="{}"></span></p><p>'.format(url)

        body = re.sub(r'\[https://\S+?\]', insert_image, body)

        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'
        html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
        return html

    def preprocess_html(self, soup):
        # Tag caption-bearing paragraphs so the extra_css caption style applies.
        for span in soup.findAll('span'):
            p = span.findParent('p')
            if p:
                p['id'] = 'fig-cap'
        return soup

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.9 KiB