Update Financial Times and MIT Technology Review

This commit is contained in:
Kovid Goyal 2022-08-28 09:10:43 +05:30
parent 6714efa4d6
commit 902e80ec17
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 43 additions and 22 deletions

View File

@@ -1,5 +1,6 @@
import json import json
import re import re
from urllib.parse import quote
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@@ -9,7 +10,7 @@ class ft(BasicNewsRecipe):
language = 'en' language = 'en'
__author__ = "Kovid Goyal" __author__ = "Kovid Goyal"
description = 'The Financial Times is one of the worlds leading news organisations, recognised internationally for its authority, integrity and accuracy.' description = 'The Financial Times is one of the worlds leading news organisations, recognised internationally for its authority, integrity and accuracy.'
oldest_article = 1.5 oldest_article = 1.15
max_articles_per_feed = 50 max_articles_per_feed = 50
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
@@ -17,6 +18,7 @@ class ft(BasicNewsRecipe):
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'width', 'height'] remove_attributes = ['style', 'width', 'height']
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg' masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'
def get_cover_url(self): def get_cover_url(self):
soup = self.index_to_soup( soup = self.index_to_soup(
@@ -58,19 +60,36 @@ class ft(BasicNewsRecipe):
except TypeError: except TypeError:
author = ' and '.join(x['name'] for x in data['author']) author = ' and '.join(x['name'] for x in data['author'])
image = desc = title_image_url = '' image = desc = title_image_url = ''
if data.get('image'):
title_image_url = data['image']['url']
image = '<p><img src="{}">'.format(title_image_url)
def resize_img(img):
a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
b = quote(img, safe='')
c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
# use width = 200, 300, 400,.. 700...
return a + b + c
if data.get('image'):
image_url = data['image']['url']
if body.__contains__(image_url) is False:
title_image_url = resize_img(image_url)
image = '<p><img src="{}">'.format(title_image_url)
# embedded image links # embedded image links
def insert_image(m): def insert_image(m):
url = m.group()[1:-1] url = m.group()[1:-1]
if url == title_image_url: if url.__contains__('studio') is False:
return '' url = resize_img(url)
return '<p><img src="{}">'.format(url) return '<span><img src="{}"></span></p><p>'.format(url)
body = re.sub(r'\[https://\S+?\]', insert_image, body) body = re.sub(r'\[https://\S+?\]', insert_image, body)
if data.get('description'): if data.get('description'):
desc = '<h2>' + data['description'] + '</h2>' desc = '<h2>' + data['description'] + '</h2>'
html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
return html return html
def preprocess_html(self, soup):
for span in soup.findAll('span'):
p = span.findParent('p')
if p:
p['id'] = 'fig-cap'
return soup

View File

@@ -16,6 +16,7 @@ class ft(BasicNewsRecipe):
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'width', 'height'] remove_attributes = ['style', 'width', 'height']
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg' masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'
def get_cover_url(self): def get_cover_url(self):
soup = self.index_to_soup( soup = self.index_to_soup(
@@ -106,7 +107,7 @@ class ft(BasicNewsRecipe):
def resize_img(img): def resize_img(img):
a = 'https://www.ft.com/__origami/service/image/v2/images/raw/' a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
b = quote(img, safe='') b = quote(img, safe='')
c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=300' c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
# use width = 200, 300, 400,.. 700... # use width = 200, 300, 400,.. 700...
return a + b + c return a + b + c
@@ -121,7 +122,7 @@ class ft(BasicNewsRecipe):
url = m.group()[1:-1] url = m.group()[1:-1]
if url.__contains__('studio') is False: if url.__contains__('studio') is False:
url = resize_img(url) url = resize_img(url)
return '<p> <img src="{}">'.format(url) return '<span><img src="{}"></span></p><p>'.format(url)
body = re.sub(r'\[https://\S+?\]', insert_image, body) body = re.sub(r'\[https://\S+?\]', insert_image, body)
@@ -129,3 +130,10 @@ class ft(BasicNewsRecipe):
desc = '<h2>' + data['description'] + '</h2>' desc = '<h2>' + data['description'] + '</h2>'
html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
return html return html
def preprocess_html(self, soup):
for span in soup.findAll('span'):
p = span.findParent('p')
if p:
p['id'] = 'fig-cap'
return soup

View File

@@ -61,6 +61,12 @@ class MitTechnologyReview(BasicNewsRecipe):
), ),
] ]
def get_cover_url(self):
soup = self.index_to_soup('https://www.technologyreview.com/')
div = soup.find('div', attrs={'class':lambda s: s and s.startswith('magazineSidebar__imageWrap')})
img = div.find('img', src=True)
return img['src']
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
self.timefmt = ' [{}]'.format( self.timefmt = ' [{}]'.format(
@@ -72,19 +78,7 @@ class MitTechnologyReview(BasicNewsRecipe):
) )
) )
) )
# find cover
self.cover_url = soup.find(
"div",
attrs={
"class":
lambda name: name.startswith("magazineHero__image")
if name else False
}
).find(
"img",
srcset=True,
attrs={"class": lambda x: x.startswith('image__img') if x else False}
)['srcset'].split()[0]
# parse articles # parse articles
feeds = OrderedDict() feeds = OrderedDict()