#!/usr/bin/env python
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
# Written April 2015
# Last edited 08/2022
'''
technologyreview.com
'''
import re
from collections import OrderedDict

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


def absurl(x):
    if x.startswith('//'):
        x = 'http:' + x
    elif not x.startswith('http'):
        x = 'http://www.technologyreview.com' + x
    return x
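
# Examples of what absurl produces, derived from the logic above
# ('example.com' is just a placeholder):
#   absurl('/magazine/')          -> 'http://www.technologyreview.com/magazine/'
#   absurl('//example.com/a.jpg') -> 'http://example.com/a.jpg'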


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )
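
# Illustrative use of the classes() helper (not exercised by this recipe,
# which matches class-name prefixes via prefixed_classes instead):
#   soup.findAll(**classes('image__credit image-credit'))
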
class MitTechnologyReview(BasicNewsRecipe):

    title = 'MIT Technology Review Magazine'
    __author__ = 'Michael Marotta, revised by unkn0wn'
    description = (
        'Bi-monthly magazine version of MIT Technology Review.'
        ' This is different from the recipe named simply "Technology Review",'
        ' which downloads the RSS feed of daily articles from the website.'
    )
    INDEX = 'http://www.technologyreview.com/magazine/'
    language = 'en'
    encoding = 'utf-8'
    tags = 'news, technology, science'
    no_stylesheets = True
    remove_empty_feeds = True
    remove_attributes = ['height', 'width', 'style', 'padding', 'padding-top']
    delay = 1
    masthead_url = 'https://wp-preprod.technologyreview.com/wp-content/uploads/2021/08/Screen-Shot-2021-08-20-at-11.11.12-AM-e1629473232355.png'
    extra_css = '''
        #pub-d{font-size:small;}
        #cre-d{font-size:xx-small; text-align:center; color:gray;}
        #cap-d{font-size:small; text-align:center;}
        blockquote{text-align:center; color:#404040;}
        em {color:#202020;}
    '''
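    # The site's class names appear to carry build-generated suffixes
    # (an assumption inferred from the prefix matching used throughout,
    # e.g. something like 'contentBody--x1y2'), so prefixed_classes matches
    # on stable prefixes rather than exact class names.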
    keep_only_tags = [
        prefixed_classes('contentHeader contentArticleHeader contentBody')
    ]
    remove_tags = [
        dict(name='aside'),
        dict(name='svg'),
        prefixed_classes(
            'image__placeholder sliderAd__wrapper eyebrow__wrap-- screen-reader-text'
        ),
    ]

    def get_cover_url(self):
        soup = self.index_to_soup('https://www.technologyreview.com/')
        if script := soup.find('script', id='preload'):
            # The cover image URL is embedded in the site's preload JSON blob;
            # the query string appended below requests a resized copy.
            link = re.search(r'(https\S+?front_cover\S+?(jpg|png))', self.tag_to_string(script))
            if link:
                return link.group(1) + '?fit=572,786'

    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        issue = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__title')})
        time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')})
        self.title = 'MIT Tech Review ' + self.tag_to_string(issue)
        self.timefmt = ' [' + self.tag_to_string(time) + ']'
        self.log('Downloading issue: ', self.timefmt)

        # parse articles
        feeds = OrderedDict()

        classNamePrefixes = [
            'magazineHero__letter--', 'teaserItem__title', 'teaserItem--aside__title'
        ]
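        # Each match below is either the editor's letter in the issue hero or
        # a teaser item; articles are grouped into sections using the
        # 'Categorized in ...' eyebrow text on the enclosing teaser wrapper.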
        for div in soup.findAll(
            attrs={
                'class':
                lambda x: any(x.startswith(prefix) for prefix in classNamePrefixes)
                if x else False
            }
        ):
            articles = []
            a = div.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a).strip()
            href = absurl(a['href'])
            desc = ''
            section_title = 'Letter from the editor'
            d = div.findParent(
                attrs={
                    'class':
                    lambda z: z and z.startswith(
                        ('teaserItem__wrapper', 'teaserItem--aside__wrapper')
                    )
                }
            )
            if d:
                excerpt = d.find(
                    attrs={
                        'class':
                        lambda x: x and x.startswith(
                            ('teaserItem__excerpt', 'teaserItem--aside__excerpt')
                        )
                    }
                )
                if excerpt:
                    desc = self.tag_to_string(excerpt).strip()

                sec = d.find(
                    attrs={
                        'class': lambda x: x and x.startswith('teaserItem__eyebrowText')
                    }
                )
                if sec:
                    section_title = self.tag_to_string(sec).replace(
                        'Categorized in ', ''
                    ).strip()

            if not href or not title:
                continue

            self.log(section_title)
            self.log('\t', title)
            self.log('\t', desc)
            self.log('\t\t', href)

            articles.append({'title': title, 'url': href, 'description': desc})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles
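
        # BasicNewsRecipe.parse_index must return a list of
        # (section_title, list_of_articles) tuples, one per section.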
        return list(feeds.items())

    def preprocess_html(self, soup):
        for bq in soup.findAll('blockquote'):
            for strong in bq.findAll('strong'):
                strong.name = 'div'
        for date in soup.findAll(
            attrs={
                'class':
                lambda x: x and x.startswith(
                    ('contentArticleHeader__publishDate', 'byline__wrapper')
                )
            }
        ):
            date['id'] = 'pub-d'
            for li in date.findAll(('li', 'ul')):
                li.name = 'span'
        for cap in soup.findAll('figcaption'):
            cap['id'] = 'cap-d'
        for credit in soup.findAll(
            attrs={
                'class':
                lambda x: x and x.startswith(('image__credit', 'image-credit'))
            }
        ):
            credit['id'] = 'cre-d'
        for img in soup.findAll(srcset=True):
            # Collapse responsive srcset images to a single absolute src.
            img['src'] = absurl(img['srcset'].split()[0])
            del img['srcset']
        for img in soup.findAll('img', attrs={'src': True}):
            # Drop any existing query string and request a fixed 800px width.
            img['src'] = img['src'].split('?')[0] + '?w=800'
        return soup
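

# A minimal sketch for testing this recipe locally with calibre's standard
# tooling, assuming it is saved as 'mit_technology_review.recipe' (the file
# and output names here are illustrative):
#   ebook-convert mit_technology_review.recipe out.epub --test -vv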