MIT Tech Review update

This commit is contained in:
unkn0w7n 2023-11-04 22:18:22 +05:30
parent 018ce6dd93
commit b89cd3566e
3 changed files with 16 additions and 6 deletions

View File

@@ -24,7 +24,7 @@ class EpochTimes(BasicNewsRecipe):
]
remove_tags = [
classes('print:hidden h-header shortcode aspect-square'),
dict(name='button'),
dict(name=['button', 'svg']),
dict(name='img', attrs={'src':lambda x: x and x.endswith('svg')})
]

View File

@@ -50,6 +50,7 @@ class MitTechnologyReview(BasicNewsRecipe):
#cre-d{font-size:xx-small; text-align:center; color:gray;}
#cap-d{font-size:small; text-align:center;}
blockquote{text-align:center; color:#404040;}
em { color:#202020;}
'''
keep_only_tags = [
prefixed_classes('contentHeader contentArticleHeader contentBody')
@@ -65,14 +66,15 @@ class MitTechnologyReview(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('https://www.technologyreview.com/')
if script := soup.find('script', id='preload'):
link = re.findall('https\S+?front_cover\S+?.png', self.tag_to_string(script))
return link[-1] + '?fit=572,786'
link = re.search('(https\S+?front_cover\S+?(jpg|png))', self.tag_to_string(script))
return link.group(1) + '?fit=572,786'
def parse_index(self):
soup = self.index_to_soup(self.INDEX)
issue = soup.find('h1', attrs={'class':lambda x: x and x.startswith('magazineHero__title')})
issue = soup.find(attrs={'class':lambda x: x and x.startswith('magazineHero__title')})
time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')})
self.timefmt = ' (' + self.tag_to_string(issue) + ') [' + self.tag_to_string(time) + ']'
self.title = 'MIT Tech Review ' + self.tag_to_string(issue)
self.timefmt = ' [' + self.tag_to_string(time) + ']'
self.log('Downloading issue: ', self.timefmt)
# parse articles

View File

@@ -21,16 +21,24 @@ class PhilosophyNow(BasicNewsRecipe):
remove_attributes = ['height', 'width', 'style']
encoding = 'utf-8'
ignore_duplicate_articles = {'url'}
masthead_url = 'https://philosophynow.org/media/images/regulars/logoStructuredData.png'
keep_only_tags = [classes('article_page')]
remove_tags = [dict(name='div', attrs={'id':'welcome_box'})]
extra_css = '''
img {display:block; margin:0 auto;}
.articleImage { font-size:small; text-align:center; }
em, blockquote { color:#202020; }
'''
def parse_index(self):
soup = self.index_to_soup('https://philosophynow.org/')
div = soup.find('div', attrs={'id': 'aside_issue_cover'})
url = div.find('a', href=True)['href']
for issue in div.findAll('div', attrs={'id':'aside_issue_text'}):
if issue := div.find('div', attrs={'id':'aside_issue_text'}):
self.log('Downloading issue:', self.tag_to_string(issue).strip())
self.timefmt = ' [' + self.tag_to_string(issue.find(attrs={'id':'aside_issue_date'})) + ']'
self.title = 'Philosophy Now ' + self.tag_to_string(issue.find(attrs={'id':'aside_issue_number'}))
cov_url = div.find('img', src=True)['src']
self.cover_url = 'https://philosophynow.org' + cov_url
soup = self.index_to_soup('https://philosophynow.org' + url)