From 902e80ec173bc40037efb164031043994044ec6c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 28 Aug 2022 09:10:43 +0530 Subject: [PATCH] Update Financial Times and MIT Technology Review --- recipes/financial_times.recipe | 33 +++++++++++++++----- recipes/financial_times_print_edition.recipe | 12 +++++-- recipes/mit_technology_review.recipe | 20 +++++------- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/recipes/financial_times.recipe b/recipes/financial_times.recipe index a67fbedd5a..6d0df16df3 100644 --- a/recipes/financial_times.recipe +++ b/recipes/financial_times.recipe @@ -1,5 +1,6 @@ import json import re +from urllib.parse import quote from calibre.web.feeds.news import BasicNewsRecipe @@ -9,7 +10,7 @@ class ft(BasicNewsRecipe): language = 'en' __author__ = "Kovid Goyal" description = 'The Financial Times is one of the world’s leading news organisations, recognised internationally for its authority, integrity and accuracy.' - oldest_article = 1.5 + oldest_article = 1.15 max_articles_per_feed = 50 no_stylesheets = True remove_javascript = True @@ -17,6 +18,7 @@ class ft(BasicNewsRecipe): ignore_duplicate_articles = {'url'} remove_attributes = ['style', 'width', 'height'] masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg' + extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}' def get_cover_url(self): soup = self.index_to_soup( @@ -58,19 +60,36 @@ class ft(BasicNewsRecipe): except TypeError: author = ' and '.join(x['name'] for x in data['author']) image = desc = title_image_url = '' - if data.get('image'): - title_image_url = data['image']['url'] - image = '

'.format(title_image_url) + def resize_img(img): + a = 'https://www.ft.com/__origami/service/image/v2/images/raw/' + b = quote(img, safe='') + c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400' + # use width = 200, 300, 400,.. 700... + return a + b + c + + if data.get('image'): + image_url = data['image']['url'] + if body.__contains__(image_url) is False: + title_image_url = resize_img(image_url) + image = '

'.format(title_image_url) # embedded image links + def insert_image(m): url = m.group()[1:-1] - if url == title_image_url: - return '' - return '

'.format(url) + if url.__contains__('studio') is False: + url = resize_img(url) + return '

'.format(url) body = re.sub(r'\[https://\S+?\]', insert_image, body) if data.get('description'): desc = '

' + data['description'] + '

' html = '

' + title + '

' + desc + '

' + author + '

' + image + '

' + body return html + + def preprocess_html(self, soup): + for span in soup.findAll('span'): + p = span.findParent('p') + if p: + p['id'] = 'fig-cap' + return soup diff --git a/recipes/financial_times_print_edition.recipe b/recipes/financial_times_print_edition.recipe index 82d83db55b..a26d1e4889 100644 --- a/recipes/financial_times_print_edition.recipe +++ b/recipes/financial_times_print_edition.recipe @@ -16,6 +16,7 @@ class ft(BasicNewsRecipe): ignore_duplicate_articles = {'url'} remove_attributes = ['style', 'width', 'height'] masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg' + extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}' def get_cover_url(self): soup = self.index_to_soup( @@ -106,7 +107,7 @@ class ft(BasicNewsRecipe): def resize_img(img): a = 'https://www.ft.com/__origami/service/image/v2/images/raw/' b = quote(img, safe='') - c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=300' + c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400' # use width = 200, 300, 400,.. 700... return a + b + c @@ -121,7 +122,7 @@ class ft(BasicNewsRecipe): url = m.group()[1:-1] if url.__contains__('studio') is False: url = resize_img(url) - return '

'.format(url) + return '

'.format(url) body = re.sub(r'\[https://\S+?\]', insert_image, body) @@ -129,3 +130,10 @@ class ft(BasicNewsRecipe): desc = '

' + data['description'] + '

' html = '

' + title + '

' + desc + '

' + author + '

' + image + '

' + body return html + + def preprocess_html(self, soup): + for span in soup.findAll('span'): + p = span.findParent('p') + if p: + p['id'] = 'fig-cap' + return soup diff --git a/recipes/mit_technology_review.recipe b/recipes/mit_technology_review.recipe index 3d87b5bffb..7501ea8955 100644 --- a/recipes/mit_technology_review.recipe +++ b/recipes/mit_technology_review.recipe @@ -61,6 +61,12 @@ class MitTechnologyReview(BasicNewsRecipe): ), ] + def get_cover_url(self): + soup = self.index_to_soup('https://www.technologyreview.com/') + div = soup.find('div', attrs={'class':lambda s: s and s.startswith('magazineSidebar__imageWrap')}) + img = div.find('img', src=True) + return img['src'] + def parse_index(self): soup = self.index_to_soup(self.INDEX) self.timefmt = ' [{}]'.format( @@ -72,19 +78,7 @@ class MitTechnologyReview(BasicNewsRecipe): ) ) ) - # find cover - self.cover_url = soup.find( - "div", - attrs={ - "class": - lambda name: name.startswith("magazineHero__image") - if name else False - } - ).find( - "img", - srcset=True, - attrs={"class": lambda x: x.startswith('image__img') if x else False} - )['srcset'].split()[0] + # parse articles feeds = OrderedDict()