From 8a7805d4af1d54a0b2908c3ff9678752b6f8b2df Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:41:58 +0530 Subject: [PATCH] update New Scientist --- recipes/icons/new_scientist_mag.png | Bin 0 -> 278 bytes recipes/new_scientist.recipe | 70 +++++++++--------- recipes/new_scientist_mag.recipe | 110 ++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 37 deletions(-) create mode 100644 recipes/icons/new_scientist_mag.png create mode 100644 recipes/new_scientist_mag.recipe diff --git a/recipes/icons/new_scientist_mag.png b/recipes/icons/new_scientist_mag.png new file mode 100644 index 0000000000000000000000000000000000000000..16523e4981c00ad151a38d7b7e0557f1f57d3a66 GIT binary patch literal 278 zcmV+x0qOpUP)pFfJsC_R2Y?Q&)W{dFbD=<{?*dM zMyp$$&i8*%3yT+Px3D8mjr9(x^kK#{= z1`yUA3RwHd2Beh@8WJ4kXD4T{C#z_F0j$994tvckKi~t->nb%707*qoM6N<$g5RiheEYour account<' not in res: + raise ValueError('Failed to log in to New Scientist, check your username and password') + return br + + # def print_version(self, url): + # return 'https://webcache.googleusercontent.com/search?q=cache:' + url.split('?')[0] + + extra_css = ''' + img {display:block; margin:0 auto;} + .ArticleHeader__Category { font-size:small; color:#404040; } + .ArticleHeader__Author, .ArticleHeader__DateTimeWrapper { font-size:small; } + .ArticleHeader__Copy { font-style:italic; color:#202020; } + .ArticleImage { font-size:small; text-align:center; } + .ArticleImageCaption__Credit { font-size:smaller; } + ''' + + keep_only_tags = [ + classes('ArticleHeader ArticleContent') + ] + + remove_tags = [ + dict(name=['svg', 'button']), + classes('ArticleHeader__SocialWrapper AdvertWrapper ReadMoreWithImage ArticleTopics') + ] + + def parse_index(self): + soup = self.index_to_soup('https://www.newscientist.com/issues/current/') + div = soup.find('div', attrs={'class':'ThisWeeksMagazineHero__CoverInfo'}) + tme = div.find(**classes('ThisWeeksMagazineHero__MagInfoHeading')) + self.log('Downloading issue:', self.tag_to_string(tme)) + self.timefmt = ' [' + self.tag_to_string(tme) + ']' + self.cover_url = div.find(**classes('ThisWeeksMagazineHero__ImageLink')).img['src'] + + feeds = [] + for cont in soup.findAll(attrs={'class':'TableOfContents__Section'}): + sec = self.tag_to_string(cont.find('h3')) + self.log(sec) + articles = [] + for a in cont.findAll('a', attrs={'class':'CardLink'}): + url = a['href'] + if url.startswith('http') is False: + url = 'https://www.newscientist.com' + a['href'] + title = self.tag_to_string(a.find(**classes('Card__Title'))) + desc = '' + desc += self.tag_to_string(a.find(**classes('Card__Category'))) + teaser = a.find(**classes('Card__TeaserCopy')) + if teaser: + desc += ' | ' + self.tag_to_string(teaser) + self.log('\t', title, '\n\t', desc, '\n\t\t', url) + articles.append({'title': title, 'description': desc, 'url': url}) + if articles: + feeds.append((sec, articles)) + return feeds + + def preprocess_html(self, soup): + time = soup.find(**classes('ArticleHeader__DateTimeWrapper')) + if time: + time.name = 'div' + for img in soup.findAll('img', attrs={'data-src':True}): + img['src'] = img['data-src'].replace('?width=1200', '?width=700') + for figc in soup.findAll('figcaption'): + for p in figc.findAll('p'): + p.name = 'div' + return soup