diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe index 2dd2c0d3a9..f21172927a 100644 --- a/recipes/natgeo.recipe +++ b/recipes/natgeo.recipe @@ -167,6 +167,8 @@ class NatGeo(BasicNewsRecipe): for article in soup.findAll('article'): a = article.find('a') url = a['href'] + if url.startswith('/'): + url = 'https://www.nationalgeographic.com' + url section = self.tag_to_string(article.find(**classes('SectionLabel'))) if section.startswith('Paid Content'): continue diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe index 683a9e73b7..d40f5c727d 100644 --- a/recipes/natgeohis.recipe +++ b/recipes/natgeohis.recipe @@ -146,6 +146,8 @@ class NatGeo(BasicNewsRecipe): for article in soup.findAll('article'): a = article.find('a') url = a['href'] + if url.startswith('/'): + url = 'https://www.nationalgeographic.com' + url title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated'))) ans.append({'title': title, 'url': url}) self.log(title, ' ', url) diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe index fa866a2ec9..5f3aaee5eb 100644 --- a/recipes/natgeomag.recipe +++ b/recipes/natgeomag.recipe @@ -156,12 +156,14 @@ class NatGeo(BasicNewsRecipe): title = self.tag_to_string(photoart) url = photoart['href'] if url.startswith('/'): - url = 'https://www.nationalgeographic.com' + photoart['href'] + url = 'https://www.nationalgeographic.com' + url ans2.append(('Photo Essay', [{'title': title, 'url': url}])) for gird in soup.findAll(attrs={'class':'GridPromoTile'}): for article in soup.findAll('article'): a = article.find('a') url = a['href'] + if url.startswith('/'): + url = 'https://www.nationalgeographic.com' + url if '/graphics/' in url: continue section = self.tag_to_string(article.find(**classes('SectionLabel'))) diff --git a/recipes/science_news.recipe b/recipes/science_news.recipe index fe7a34c68b..6e296b404f 100644 --- a/recipes/science_news.recipe +++ b/recipes/science_news.recipe @@ -5,7 +5,7 @@ __license__ = 'GPL v3' sciencenews.org ''' -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes import datetime import re @@ -16,13 +16,13 @@ class ScienceNewsIssue(BasicNewsRecipe): " in all fields of science. This recipe downloads all the articles from the latest issue.") category = u'Science, Technology, News' publisher = u'Society for Science & the Public' - oldest_article = 14 language = 'en' - max_articles_per_feed = 50 no_stylesheets = True use_embedded_content = False - timefmt = ' [%A, %d %B, %Y]' auto_cleanup = False + remove_attributes = ['height', 'width', 'style'] + ignore_duplicate_articles = {'url'} + resolve_internal_links = True keep_only_tags = [ dict( @@ -37,6 +37,7 @@ class ScienceNewsIssue(BasicNewsRecipe): ) ] remove_tags = [ + dict(name=['svg', 'button']), dict( attrs={'class': lambda x: x and ('newsletter-signup__wrapper___' in x)} ) @@ -44,13 +45,15 @@ class ScienceNewsIssue(BasicNewsRecipe): def parse_index(self): - # Get URL of latest mag page - ld = self._get_mag_date() - url = f"https://www.sciencenews.org/sn-magazine/{ld:%B}-{ld.day}-{ld.year}" - url = url.lower() + index = self.index_to_soup('https://www.sciencenews.org/sn-magazine') + a = index.find(**prefixed_classes('magazine-archive__issue-thumbnail___')) + url = a['href'] + self.timefmt = ' [' + url.split('/')[-1] + ']' + self.cover_url = a.img['src'] # Get articles soup = self.index_to_soup(url) + soup = soup.find('main', attrs={'id':'content'}) re_article = re.compile("https://www.sciencenews.org/article/") stories = [] past_urls = set() @@ -68,6 +71,7 @@ class ScienceNewsIssue(BasicNewsRecipe): continue past_urls.add(article_url) + self.log('\t', article_title, ' ', article_url) article_info = { "url": article_url, "title": article_title, @@ -78,22 +82,3 @@ class ScienceNewsIssue(BasicNewsRecipe): ("Articles", stories), ] return index - - def _get_mag_date(self): - """Return date of latest magazine issue. - It is published every 2 weeks.""" - - d = datetime.date(2022, 6, 18) - t = datetime.date.today() - ld = None - while d <= t: - ld = d - d += datetime.timedelta(days=14) - return ld - - def get_cover_url(self): - ld = self._get_mag_date() - url = ld.strftime( - "https://www.sciencenews.org/wp-content/uploads/%Y/%m/%m%d%y_cover.jpg" - ) - return url