MIT Tech Review update

unkn0w7n 2023-11-04 22:18:22 +05:30
parent 018ce6dd93
commit b89cd3566e
3 changed files with 16 additions and 6 deletions


@@ -24,7 +24,7 @@ class EpochTimes(BasicNewsRecipe):
     ]
     remove_tags = [
         classes('print:hidden h-header shortcode aspect-square'),
-        dict(name='button'),
+        dict(name=['button', 'svg']),
         dict(name='img', attrs={'src':lambda x: x and x.endswith('svg')})
     ]
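
Note on the remove_tags change: calibre applies each remove_tags dict to BeautifulSoup's findAll, and the name argument accepts a list, so a single rule can strip both button and svg elements. A minimal standalone sketch of that behaviour (the HTML snippet is made up for illustration):

from bs4 import BeautifulSoup

html = '<div><button>Share</button><svg></svg><p>Article text</p></div>'
soup = BeautifulSoup(html, 'html.parser')
# a list for the tag name matches either element in one pass
for tag in soup.find_all(['button', 'svg']):
    tag.decompose()
print(soup)  # -> <div><p>Article text</p></div>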


@@ -50,6 +50,7 @@ class MitTechnologyReview(BasicNewsRecipe):
         #cre-d{font-size:xx-small; text-align:center; color:gray;}
         #cap-d{font-size:small; text-align:center;}
         blockquote{text-align:center; color:#404040;}
+        em { color:#202020;}
     '''
     keep_only_tags = [
         prefixed_classes('contentHeader contentArticleHeader contentBody')
@@ -65,14 +66,15 @@ class MitTechnologyReview(BasicNewsRecipe):
     def get_cover_url(self):
         soup = self.index_to_soup('https://www.technologyreview.com/')
         if script := soup.find('script', id='preload'):
-            link = re.findall('https\S+?front_cover\S+?.png', self.tag_to_string(script))
-            return link[-1] + '?fit=572,786'
+            link = re.search('(https\S+?front_cover\S+?(jpg|png))', self.tag_to_string(script))
+            return link.group(1) + '?fit=572,786'
 
     def parse_index(self):
         soup = self.index_to_soup(self.INDEX)
-        issue = soup.find('h1', attrs={'class':lambda x: x and x.startswith('magazineHero__title')})
+        issue = soup.find(attrs={'class':lambda x: x and x.startswith('magazineHero__title')})
         time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')})
-        self.timefmt = ' (' + self.tag_to_string(issue) + ') [' + self.tag_to_string(time) + ']'
+        self.title = 'MIT Tech Review ' + self.tag_to_string(issue)
+        self.timefmt = ' [' + self.tag_to_string(time) + ']'
         self.log('Downloading issue: ', self.timefmt)
         # parse articles
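
The get_cover_url change swaps re.findall for re.search with a capturing group, so the recipe now accepts either a jpg or a png cover URL from the preload script; parse_index additionally splits the old combined timefmt into a per-issue title plus a date-only timefmt. A minimal sketch of the new regex against a made-up preload excerpt (the real script tag is much larger):

import re

payload = '{"cover":"https://wp.technologyreview.com/2023-11-front_cover.jpg"}'  # hypothetical excerpt
link = re.search(r'(https\S+?front_cover\S+?(jpg|png))', payload)
if link:
    print(link.group(1) + '?fit=572,786')
    # -> https://wp.technologyreview.com/2023-11-front_cover.jpg?fit=572,786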


@@ -21,16 +21,24 @@ class PhilosophyNow(BasicNewsRecipe):
     remove_attributes = ['height', 'width', 'style']
     encoding = 'utf-8'
     ignore_duplicate_articles = {'url'}
+    masthead_url = 'https://philosophynow.org/media/images/regulars/logoStructuredData.png'
     keep_only_tags = [classes('article_page')]
     remove_tags = [dict(name='div', attrs={'id':'welcome_box'})]
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        .articleImage { font-size:small; text-align:center; }
+        em, blockquote { color:#202020; }
+    '''
 
     def parse_index(self):
         soup = self.index_to_soup('https://philosophynow.org/')
         div = soup.find('div', attrs={'id': 'aside_issue_cover'})
         url = div.find('a', href=True)['href']
-        for issue in div.findAll('div', attrs={'id':'aside_issue_text'}):
+        if issue := div.find('div', attrs={'id':'aside_issue_text'}):
             self.log('Downloading issue:', self.tag_to_string(issue).strip())
+            self.timefmt = ' [' + self.tag_to_string(issue.find(attrs={'id':'aside_issue_date'})) + ']'
+            self.title = 'Philosophy Now ' + self.tag_to_string(issue.find(attrs={'id':'aside_issue_number'}))
         cov_url = div.find('img', src=True)['src']
         self.cover_url = 'https://philosophynow.org' + cov_url
         soup = self.index_to_soup('https://philosophynow.org' + url)
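
The Philosophy Now parse_index change replaces the findAll loop with a walrus-operator guard: find returns the single issue box (or None), and the new title/timefmt lines only run when it exists. A minimal sketch of the pattern, using made-up markup that reuses the same ids:

from bs4 import BeautifulSoup

html = ('<div id="aside_issue_text">Issue <span id="aside_issue_number">158</span>'
        ' <span id="aside_issue_date">Oct/Nov 2023</span></div>')
div = BeautifulSoup(html, 'html.parser')
# := binds the result, and the block is skipped entirely if find() returns None
if issue := div.find('div', attrs={'id': 'aside_issue_text'}):
    print('Downloading issue:', issue.get_text(' ', strip=True))
    # -> Downloading issue: Issue 158 Oct/Nov 2023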