mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-11-01 11:07:02 -04:00
186 lines
6.3 KiB
Python
186 lines
6.3 KiB
Python
#!/usr/bin/env python
|
|
from __future__ import unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
|
|
# Written April 2015
|
|
# Last edited 07/2024
|
|
'''
|
|
technologyreview.com
|
|
'''
|
|
import json
|
|
import re
|
|
from collections import OrderedDict
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
|
|
|
|
|
|
def absurl(x):
    """Return *x* as an absolute URL on technologyreview.com.

    * Protocol-relative URLs (``//host/path``) get an ``http:`` scheme.
    * Anything already starting with ``http`` is returned unchanged.
    * Everything else is treated as a path on
      ``http://www.technologyreview.com``.

    A relative path that lacks a leading slash (``magazine/x``) gets one
    inserted so it cannot be fused into the host name.
    """
    if x.startswith('//'):
        return 'http:' + x
    if x.startswith('http'):
        return x
    # Query-only ('?a=b') and fragment-only ('#top') references, and the
    # empty string, are appended as-is; only bare paths need a separator.
    if x and x[0] not in '/?#':
        x = '/' + x
    return 'http://www.technologyreview.com' + x
|
|
|
|
|
|
def classes(classes):
    """Build BeautifulSoup ``find`` keyword arguments matching CSS classes.

    *classes* is a space-separated string of class names. The returned dict
    has a single ``attrs`` entry whose ``class`` predicate is truthy when
    the candidate value shares at least one name with *classes*.
    """
    wanted = frozenset(classes.split(' '))

    def shares_a_class(value):
        # Falsy values (None, '') are passed straight back, as a
        # short-circuiting ``and`` would do.
        if not value:
            return value
        return wanted.intersection(value.split())

    return {'attrs': {'class': shares_a_class}}
|
|
|
|
|
|
class MitTechnologyReview(BasicNewsRecipe):
    """Recipe that downloads one issue of the MIT Technology Review magazine.

    ``parse_index`` reads the issue landing page, takes cover/date/title
    metadata from the JSON embedded in its ``preload`` script tag and groups
    the teaser links into per-section feeds. ``preprocess_html`` then tidies
    each downloaded article.
    """

    title = 'MIT Technology Review Magazine'
    __author__ = 'Michael Marotta, revised by unkn0wn'
    description = (
        'Bi-monthly magazine version of MIT Technology Review.'
        ' This is different than the recipe named simply "Technology Review"'
        ' which downloads the rss feed with daily articles from the website.'
    )
    language = 'en'
    encoding = 'utf-8'
    tags = 'news, technology, science'
    no_stylesheets = True
    remove_empty_feeds = True
    remove_attributes = ['height', 'width', 'style', 'padding', 'padding-top']
    delay = 1
    masthead_url = 'https://wp-preprod.technologyreview.com/wp-content/uploads/2021/08/Screen-Shot-2021-08-20-at-11.11.12-AM-e1629473232355.png'
    # The ids below are assigned to elements in preprocess_html().
    extra_css = '''
        #pub-d{font-size:small;}
        #cre-d{font-size:xx-small; text-align:center; color:gray;}
        #cap-d{font-size:small; text-align:center;}
        blockquote{text-align:center; color:#404040;}
        em { color:#202020;}
    '''
    keep_only_tags = [
        prefixed_classes('contentHeader contentArticleHeader contentBody')
    ]
    remove_tags = [
        dict(name='aside'),
        dict(name='svg'),
        prefixed_classes(
            'image__placeholder sliderAd__wrapper eyebrow__wrap-- screen-reader-text'
        ),
    ]

    recipe_specific_options = {
        'issue_url': {
            'short': 'The issue URL ',
            'long': 'For example, https://www.technologyreview.com/magazines/the-education-issue/',
            'default': 'http://www.technologyreview.com/magazine/'
        }
    }

    def parse_index(self):
        """Build the feeds for the selected issue.

        Returns a list of ``(section_title, articles)`` tuples; every
        article is a dict with ``title``, ``url`` and ``description`` keys.
        When the issue metadata can be located, ``cover_url``, ``timefmt``,
        ``description`` and ``title`` are set on the recipe as a side effect.
        """
        # for past editions, change the issue link below
        issue = 'http://www.technologyreview.com/magazine/'
        d = self.recipe_specific_options.get('issue_url')
        # The class-level value is a dict, so only a plain string
        # (presumably a value supplied at run time) replaces the default.
        if d and isinstance(d, str):
            issue = d
        soup = self.index_to_soup(issue)

        # Issue metadata is best-effort: a missing or differently shaped
        # preload blob must not prevent the article list from being built.
        if (script := soup.find('script', id='preload')) and script.contents:
            raw = script.contents[0]
            m = re.search(r'\"children\":\[{\"name\":\"magazine-hero\"', raw)
            if m:
                # Cut the text at the first '{' after the marker so that
                # raw_decode() starts on the magazine-hero JSON object and
                # ignores whatever trails it.
                spl = re.split(r'(?=\{)', raw[m.start():], maxsplit=1)[1]
                data = json.JSONDecoder().raw_decode(spl)[0]
                self.cover_url = data['children'][0]['config']['src'] + '?fit=572,786'
                self.timefmt = ' [' + data['config']['issueDate'] + ']'
                self.description = data['config']['description']
                self.title = 'MIT TR: ' + data['config']['title']
            else:
                self.log('magazine-hero metadata not found, keeping default title and cover')

        # parse articles
        feeds = OrderedDict()

        classNamePrefixes = (
            'magazineHero__letter--', 'teaserItem__title', 'teaserItem--aside__title'
        )
        for div in soup.findAll(
            attrs={
                'class':
                lambda x: bool(x) and x.startswith(classNamePrefixes)
            }
        ):
            a = div.find('a', href=True)
            if a is None:
                # A matching element without a link cannot become an article.
                continue
            title = self.tag_to_string(a).strip()
            href = absurl(a['href'])
            if not href or not title:
                continue

            desc = ''
            # Used when no teaser wrapper / eyebrow text is found below.
            section_title = 'Letter from the editor'
            d = div.findParent(
                attrs={
                    'class':
                    lambda z: z and z.
                    startswith(('teaserItem__wrapper', 'teaserItem--aside__wrapper'))
                }
            )
            if d:
                excerpt = d.find(
                    attrs={
                        'class':
                        lambda x: x and x.startswith(
                            ('teaserItem__excerpt', 'teaserItem--aside__excerpt')
                        )
                    }
                )
                if excerpt:
                    desc = self.tag_to_string(excerpt).strip()

                sec = d.find(
                    attrs={
                        'class': lambda x: x and x.startswith('teaserItem__eyebrowText')
                    }
                )
                if sec:
                    section_title = self.tag_to_string(sec).replace('Categorized in ',
                                                                    '').strip()

            self.log(section_title)
            self.log('\t', title)
            self.log('\t', desc)
            self.log('\t\t', href)

            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': href, 'description': desc}
            )
        ans = list(feeds.items())
        return ans

    def preprocess_html(self, soup):
        """Normalise article markup before conversion and return *soup*.

        Tags dates, captions and image credits with the ids styled by
        ``extra_css``, flattens list markup inside the byline and rewrites
        image sources.
        """
        # <strong> inside a blockquote becomes a block-level <div>.
        for bq in soup.findAll('blockquote'):
            for strong in bq.findAll('strong'):
                strong.name = 'div'
        for date in soup.findAll(
            attrs={
                'class':
                lambda x: x and x.
                startswith(('contentArticleHeader__publishDate', 'byline__wrapper'))
            }
        ):
            date['id'] = 'pub-d'
            # Turn list markup into inline spans.
            for li in date.findAll(('li', 'ul')):
                li.name = 'span'
        for cap in soup.findAll('figcaption'):
            cap['id'] = 'cap-d'
        for credit in soup.findAll(
            attrs={
                'class':
                lambda x: x and x.startswith(('image__credit', 'image-credit'))
            }
        ):
            credit['id'] = 'cre-d'
        for img in soup.findAll(srcset=True):
            # Use the first srcset candidate as the plain src; an empty
            # srcset attribute simply leaves src untouched.
            candidates = img['srcset'].split()
            if candidates:
                img['src'] = absurl(candidates[0])
            del img['srcset']
        for img in soup.findAll('img', attrs={'src': True}):
            # Replace any existing query string with '?w=800'
            # (presumably requests an 800px-wide rendition; confirm with the site).
            img['src'] = img['src'].split('?')[0] + '?w=800'
        return soup
|