mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Financial Times and MIT Technology Review
This commit is contained in:
parent
6714efa4d6
commit
902e80ec17
@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
@ -9,7 +10,7 @@ class ft(BasicNewsRecipe):
|
|||||||
language = 'en'
|
language = 'en'
|
||||||
__author__ = "Kovid Goyal"
|
__author__ = "Kovid Goyal"
|
||||||
description = 'The Financial Times is one of the world’s leading news organisations, recognised internationally for its authority, integrity and accuracy.'
|
description = 'The Financial Times is one of the world’s leading news organisations, recognised internationally for its authority, integrity and accuracy.'
|
||||||
oldest_article = 1.5
|
oldest_article = 1.15
|
||||||
max_articles_per_feed = 50
|
max_articles_per_feed = 50
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
@ -17,6 +18,7 @@ class ft(BasicNewsRecipe):
|
|||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
remove_attributes = ['style', 'width', 'height']
|
remove_attributes = ['style', 'width', 'height']
|
||||||
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
|
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
|
||||||
|
extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
soup = self.index_to_soup(
|
soup = self.index_to_soup(
|
||||||
@ -58,19 +60,36 @@ class ft(BasicNewsRecipe):
|
|||||||
except TypeError:
|
except TypeError:
|
||||||
author = ' and '.join(x['name'] for x in data['author'])
|
author = ' and '.join(x['name'] for x in data['author'])
|
||||||
image = desc = title_image_url = ''
|
image = desc = title_image_url = ''
|
||||||
if data.get('image'):
|
|
||||||
title_image_url = data['image']['url']
|
|
||||||
image = '<p><img src="{}">'.format(title_image_url)
|
|
||||||
|
|
||||||
|
def resize_img(img):
|
||||||
|
a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
|
||||||
|
b = quote(img, safe='')
|
||||||
|
c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
|
||||||
|
# use width = 200, 300, 400,.. 700...
|
||||||
|
return a + b + c
|
||||||
|
|
||||||
|
if data.get('image'):
|
||||||
|
image_url = data['image']['url']
|
||||||
|
if body.__contains__(image_url) is False:
|
||||||
|
title_image_url = resize_img(image_url)
|
||||||
|
image = '<p><img src="{}">'.format(title_image_url)
|
||||||
# embedded image links
|
# embedded image links
|
||||||
|
|
||||||
def insert_image(m):
|
def insert_image(m):
|
||||||
url = m.group()[1:-1]
|
url = m.group()[1:-1]
|
||||||
if url == title_image_url:
|
if url.__contains__('studio') is False:
|
||||||
return ''
|
url = resize_img(url)
|
||||||
return '<p><img src="{}">'.format(url)
|
return '<span><img src="{}"></span></p><p>'.format(url)
|
||||||
|
|
||||||
body = re.sub(r'\[https://\S+?\]', insert_image, body)
|
body = re.sub(r'\[https://\S+?\]', insert_image, body)
|
||||||
if data.get('description'):
|
if data.get('description'):
|
||||||
desc = '<h2>' + data['description'] + '</h2>'
|
desc = '<h2>' + data['description'] + '</h2>'
|
||||||
html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
|
html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for span in soup.findAll('span'):
|
||||||
|
p = span.findParent('p')
|
||||||
|
if p:
|
||||||
|
p['id'] = 'fig-cap'
|
||||||
|
return soup
|
||||||
|
@ -16,6 +16,7 @@ class ft(BasicNewsRecipe):
|
|||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
remove_attributes = ['style', 'width', 'height']
|
remove_attributes = ['style', 'width', 'height']
|
||||||
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
|
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
|
||||||
|
extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
soup = self.index_to_soup(
|
soup = self.index_to_soup(
|
||||||
@ -106,7 +107,7 @@ class ft(BasicNewsRecipe):
|
|||||||
def resize_img(img):
|
def resize_img(img):
|
||||||
a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
|
a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
|
||||||
b = quote(img, safe='')
|
b = quote(img, safe='')
|
||||||
c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=300'
|
c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
|
||||||
# use width = 200, 300, 400,.. 700...
|
# use width = 200, 300, 400,.. 700...
|
||||||
return a + b + c
|
return a + b + c
|
||||||
|
|
||||||
@ -121,7 +122,7 @@ class ft(BasicNewsRecipe):
|
|||||||
url = m.group()[1:-1]
|
url = m.group()[1:-1]
|
||||||
if url.__contains__('studio') is False:
|
if url.__contains__('studio') is False:
|
||||||
url = resize_img(url)
|
url = resize_img(url)
|
||||||
return '<p> <img src="{}">'.format(url)
|
return '<span><img src="{}"></span></p><p>'.format(url)
|
||||||
|
|
||||||
body = re.sub(r'\[https://\S+?\]', insert_image, body)
|
body = re.sub(r'\[https://\S+?\]', insert_image, body)
|
||||||
|
|
||||||
@ -129,3 +130,10 @@ class ft(BasicNewsRecipe):
|
|||||||
desc = '<h2>' + data['description'] + '</h2>'
|
desc = '<h2>' + data['description'] + '</h2>'
|
||||||
html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
|
html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def preprocess_html(self, soup):
|
||||||
|
for span in soup.findAll('span'):
|
||||||
|
p = span.findParent('p')
|
||||||
|
if p:
|
||||||
|
p['id'] = 'fig-cap'
|
||||||
|
return soup
|
||||||
|
@ -61,6 +61,12 @@ class MitTechnologyReview(BasicNewsRecipe):
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def get_cover_url(self):
|
||||||
|
soup = self.index_to_soup('https://www.technologyreview.com/')
|
||||||
|
div = soup.find('div', attrs={'class':lambda s: s and s.startswith('magazineSidebar__imageWrap')})
|
||||||
|
img = div.find('img', src=True)
|
||||||
|
return img['src']
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup(self.INDEX)
|
soup = self.index_to_soup(self.INDEX)
|
||||||
self.timefmt = ' [{}]'.format(
|
self.timefmt = ' [{}]'.format(
|
||||||
@ -72,19 +78,7 @@ class MitTechnologyReview(BasicNewsRecipe):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
# find cover
|
|
||||||
self.cover_url = soup.find(
|
|
||||||
"div",
|
|
||||||
attrs={
|
|
||||||
"class":
|
|
||||||
lambda name: name.startswith("magazineHero__image")
|
|
||||||
if name else False
|
|
||||||
}
|
|
||||||
).find(
|
|
||||||
"img",
|
|
||||||
srcset=True,
|
|
||||||
attrs={"class": lambda x: x.startswith('image__img') if x else False}
|
|
||||||
)['srcset'].split()[0]
|
|
||||||
# parse articles
|
# parse articles
|
||||||
feeds = OrderedDict()
|
feeds = OrderedDict()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user