Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Update MIT Technology Review
This commit is contained in:
parent f270dae588
commit 401c92737f
@@ -1,13 +1,15 @@
 #!/usr/bin/env python
 from __future__ import unicode_literals

 __license__ = 'GPL v3'
 __copyright__ = '2015 Michael Marotta <mikefm at gmail.net>'
 # Written April 2015
-# Last edited 4/18/15
+# Last edited 08/2022
 '''
 technologyreview.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
+from collections import OrderedDict
+


@@ -20,55 +22,156 @@ def absurl(x):

 def classes(classes):
     q = frozenset(classes.split(' '))
-    return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
+    return dict(
+        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
+    )


 class MitTechnologyReview(BasicNewsRecipe):

     title = 'MIT Technology Review Magazine'
-    __author__ = 'Michael Marotta'
-    description = ('Bi-monthly magazine version of MIT Technology Review.'
-                   ' This is different than the recipe named simply "Technology Review"'
-                   ' which downloads the rss feed with daily articles from the website.')
+    __author__ = 'Michael Marotta, revised by unkn0wn'
+    description = (
+        'Bi-monthly magazine version of MIT Technology Review.'
+        ' This is different than the recipe named simply "Technology Review"'
+        ' which downloads the rss feed with daily articles from the website.'
+    )
     INDEX = 'http://www.technologyreview.com/magazine/'
     language = 'en'
     encoding = 'utf-8'
-    simultaneous_downloads = 20
     tags = 'news, technology, science'
     no_stylesheets = True
+    remove_empty_feeds = True
+    remove_attributes = ['height', 'width', 'style', 'padding', 'padding-top']
+    masthead_url = 'https://wp-preprod.technologyreview.com/wp-content/uploads/2021/08/Screen-Shot-2021-08-20-at-11.11.12-AM-e1629473232355.png'
+    extra_css = '''
+        #pub-d{font-size:small;}
+        #cre-d{font-size:xx-small; text-align:center; color:gray;}
+        #cap-d{font-size:small; text-align:center;}
+        blockquote{text-align:center; color:#404040;}
+    '''
     keep_only_tags = [
         prefixed_classes('contentHeader contentArticleHeader contentBody')
     ]
     remove_tags = [
         dict(name="aside"),
         dict(name="svg"),
-        dict(name="blockquote"),
-        prefixed_classes('image__placeholder sliderAd__wrapper'),
+        prefixed_classes(
+            'image__placeholder sliderAd__wrapper eyebrow__wrap-- screen-reader-text'
+        ),
     ]

     def parse_index(self):
         soup = self.index_to_soup(self.INDEX)
+        self.timefmt = ' [{}]'.format(
+            self.tag_to_string(
+                soup.find(
+                    attrs={
+                        'class': lambda x: x and x.startswith('magazineHero__date')
+                    }
+                )
+            )
+        )
         # find cover
         self.cover_url = soup.find(
-            "div", attrs={"class":lambda name: name.startswith("magazineHero__image") if name else False}).find(
-            "img",
-            src=True, attrs={"class":lambda x: x.startswith('image__img') if x else False}
-        )['src']
+            "div",
+            attrs={
+                "class":
+                lambda name: name.startswith("magazineHero__image")
+                if name else False
+            }
+        ).find(
+            "img",
+            srcset=True,
+            attrs={"class": lambda x: x.startswith('image__img') if x else False}
+        )['srcset'].split()[0]
         # parse articles
-        current_articles = []
-        classNamePrefixes = ["magazineHero__letter--", "teaserItem__title", "teaserItem--aside__title"]
-        for div in soup.findAll(attrs={'class': lambda x: any(x.startswith(prefix) for prefix in classNamePrefixes) if x else False}):
+        feeds = OrderedDict()
+        classNamePrefixes = [
+            "magazineHero__letter--", "teaserItem__title", "teaserItem--aside__title"
+        ]
+        for div in soup.findAll(
+            attrs={
+                'class':
+                lambda x: any(x.startswith(prefix) for prefix in classNamePrefixes)
+                if x else False
+            }
+        ):
+            articles = []
             a = div.find('a', href=True)
             title = self.tag_to_string(a).strip()
             href = absurl(a['href'])
-            if href and title:
-                current_articles.append({'title': title, 'url': href})
-                self.log(title, '[%s]' % href)
-        return [('Articles', current_articles)]
+            d = div.findParent(
+                attrs={
+                    'class':
+                    lambda z: z and z.
+                    startswith(('teaserItem__wrapper', 'teaserItem--aside__wrapper'))
+                }
+            )
+            desc = self.tag_to_string(
+                d.find(
+                    attrs={
+                        'class':
+                        lambda x: x and x.startswith(
+                            ('teaserItem__excerpt', 'teaserItem--aside__excerpt')
+                        )
+                    }
+                )
+            ).strip()
+
+            sec = d.find(
+                attrs={
+                    'class': lambda x: x and x.startswith('teaserItem__eyebrowText')
+                }
+            )
+
+            section_title = self.tag_to_string(sec).replace('Categorized in ',
+                                                            '').strip()
+
+            if not href or not title:
+                continue
+
+            self.log(section_title)
+            self.log('\t', title)
+            self.log('\t', desc)
+            self.log('\t\t', href)
+
+            articles.append({'title': title, 'url': href, 'description': desc})
+            if articles:
+                if section_title not in feeds:
+                    feeds[section_title] = []
+                feeds[section_title] += articles
+        ans = [(key, val) for key, val in feeds.items()]
+        return ans

     def preprocess_html(self, soup):
+        for bq in soup.findAll('blockquote'):
+            for strong in bq.findAll('strong'):
+                strong.name = 'div'
+        for date in soup.findAll(
+            attrs={
+                'class':
+                lambda x: x and x.
+                startswith(('contentArticleHeader__publishDate', 'byline__wrapper'))
+            }
+        ):
+            date['id'] = 'pub-d'
+            for li in date.findAll(('li', 'ul')):
+                li.name = 'span'
+        for cap in soup.findAll('figcaption'):
+            cap['id'] = 'cap-d'
+        for credit in soup.findAll(
+            attrs={
+                'class':
+                lambda x: x and x.startswith(('image__credit', 'image-credit'))
+            }
+        ):
+            credit['id'] = 'cre-d'
         for img in soup.findAll(srcset=True):
             img['src'] = absurl(img['srcset'].split()[0])
             del img['srcset']
+        for img in soup.findAll('img', attrs={'src': True}):
+            img['src'] = img['src'].split('?')[0] + '?w=800'
         return soup
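For context on the parse_index rewrite above: articles are no longer collected into a single flat 'Articles' feed; they are grouped into per-section feeds keyed by each teaser's eyebrow text, with OrderedDict preserving the order in which sections are first encountered. A minimal standalone sketch of that accumulation pattern, using made-up section names and article ids:

    from collections import OrderedDict

    # Hypothetical (section, article) pairs standing in for the scraped teasers.
    scraped = [('Features', 'a1'), ('Reviews', 'b1'), ('Features', 'a2')]

    feeds = OrderedDict()
    for section_title, article in scraped:
        # Same accumulation the recipe uses: create the list on first sight,
        # then extend it; first-seen section order is preserved.
        if section_title not in feeds:
            feeds[section_title] = []
        feeds[section_title] += [article]

    print(list(feeds.items()))
    # [('Features', ['a1', 'a2']), ('Reviews', ['b1'])]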
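The tag selection throughout the recipe leans on class-prefix matching (prefixed_classes and the inline lambdas) because the site's generated CSS class names carry changing suffixes. A small sketch of the filter logic from parse_index, using plain strings as stand-ins for class attribute values:

    classNamePrefixes = [
        'magazineHero__letter--', 'teaserItem__title', 'teaserItem--aside__title'
    ]

    def matches(class_value):
        # Mirrors the findAll filter above: accept a class value that starts
        # with any listed prefix; a missing class (None) never matches.
        if not class_value:
            return False
        return any(class_value.startswith(prefix) for prefix in classNamePrefixes)

    print(matches('teaserItem__title--3fc2'))  # True: prefix match survives suffix churn
    print(matches('footer__title'))            # False
    print(matches(None))                       # False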
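The image handling added to preprocess_html makes two passes: promote the first srcset candidate to src, then rewrite every img URL's query string to request an 800px-wide rendition. A standalone illustration with a hypothetical srcset value:

    # Hypothetical srcset in the style of the site's WordPress image URLs.
    srcset = 'https://example.technologyreview.com/pic.jpg?w=300 300w, https://example.technologyreview.com/pic.jpg?w=1200 1200w'

    src = srcset.split()[0]             # first candidate URL; width descriptor dropped
    src = src.split('?')[0] + '?w=800'  # strip existing params, ask for an 800px width
    print(src)                          # https://example.technologyreview.com/pic.jpg?w=800

To try the updated recipe locally, the usual calibre workflow applies: save it as a .recipe file and run ebook-convert myrecipe.recipe out.epub --test -vv, which fetches a reduced number of articles with verbose logging.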