from calibre.web.feeds.news import BasicNewsRecipe, classes

# Maps the category slug used in the site's /category/<slug>/ URLs to the
# section title shown in the generated index.
CATEGORIES = {
    'smart-news': 'Smart News',
    'history': 'History',
    'science-nature': 'Science',
    'innovation': 'Innovation',
    'arts-culture': 'Arts & Culture',
    'travel': 'Travel',
    'smithsonian-institution': 'At the Smithsonian'
}


class Smithsonian(BasicNewsRecipe):
    """Recipe that builds one section per entry in CATEGORIES from
    smithsonianmag.com category listing pages."""

    title = 'Smithsonian Magazine'
    __author__ = 'Kovid Goyal'

    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'  # noqa: E501
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        classes('article-header articleLeft')
    ]
    remove_tags = [
        classes(
            'hidden-phone hidden-tablet hidden-desktop slideshow-nav associated-container'
            ' widget-article-pixel tag-list recommended-videos comments'
        )
    ]
    no_javascript = True
    no_stylesheets = True

    def parse_section(self, url):
        """Yield one article dict per unique listing entry found at *url*.

        Each yielded dict has the keys 'title', 'url' and 'description'.
        Entries are de-duplicated by title, and links whose URL contains
        '/tag/' are dropped. List items that lack the expected text
        container, anchor, or href are skipped instead of raising, so a
        single malformed item does not abort the whole section.
        """
        soup = self.index_to_soup(url)
        seen = set()
        for al in soup.findAll(attrs={'class': 'article-list'}):
            for article in al.findAll(attrs={'class': 'article-list-item'}):
                div = article.find(attrs={'class': 'article-list-text'})
                if div is None:
                    # Item without the text container: nothing to extract.
                    continue
                a = div.find('a')
                if a is None:
                    continue
                href = a.get('href')
                if href is None:
                    # Anchor without a target cannot produce an article URL.
                    continue
                title = self.tag_to_string(a)
                if title in seen:
                    continue
                seen.add(title)
                # Kept in a separate name so the `url` parameter (the
                # section page) is not overwritten while iterating.
                article_url = 'https://www.smithsonianmag.com/' + href.lstrip('/')
                if '/tag/' in article_url:
                    continue
                desc = ''
                # NOTE(review): this searches inside the 'article-list-text'
                # element for a descendant carrying the same class, so it
                # presumably seldom matches and desc stays empty -- confirm
                # the intended selector against the live page markup.
                p = div.find(attrs={'class': 'article-list-text'})
                if p is not None:
                    desc = self.tag_to_string(p)
                self.log('\t' + title)
                yield {'title': title, 'url': article_url, 'description': desc}

    def parse_index(self):
        """Return a list of (section title, article list) tuples.

        One category page is parsed per entry in CATEGORIES; sections that
        yield no articles are omitted. When self.test is truthy, parsing
        stops once self.test[0] sections have been collected.
        """
        ans = []
        for slug, title in CATEGORIES.items():
            url = 'https://www.smithsonianmag.com/category/' + slug + '/'
            self.log('Parsing section:', title, 'at:', url)
            articles = list(self.parse_section(url))
            if articles:
                ans.append((title, articles))
            if self.test and len(ans) >= self.test[0]:
                break
        return ans