#!/usr/bin/env python
"""calibre news recipe that downloads the current Scientific American issue."""

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select

SITE_URL = 'https://www.scientificamerican.com'
# Landing page that links to the current issue and shows its cover.
ISSUE_INDEX_URL = SITE_URL + '/sciammag/'
LOGIN_URL = SITE_URL + '/my-account/login/'


def absurl(url):
    """Return *url* as an absolute URL on the Scientific American site.

    Protocol-relative URLs (``//host/path``) get an ``https:`` scheme,
    site-relative URLs (``/path``) are prefixed with the site root, and
    anything else is returned unchanged.
    """
    # Must be tested before the single-slash case: '//host/path' also starts
    # with '/', and prefixing it with the site root yields a bogus URL.
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return SITE_URL + url
    return url


def _first(iterable):
    """Return the first item of *iterable*, or None when it is empty."""
    return next(iter(iterable), None)


def _article_url(tag):
    """Build the article URL encoded in the ``id`` attribute of *tag*.

    The first ``-`` of the id is turned into ``/`` and the result is used as
    the path below the site root.  Returns None when *tag* has no usable
    ``id`` so that callers can skip the entry instead of crashing.
    """
    ident = tag.get('id')
    if not ident:
        return None
    return 'https://www.scientificamerican.com/{}/'.format(
        ident.replace('-', '/', 1))


def _class_matcher(names):
    """Return a predicate that is true for ``class`` values sharing a name
    with *names*.

    A missing or empty attribute value is returned as-is (falsy).
    """
    def matches(value):
        return value and bool(set(value.split()).intersection(names))
    return matches


keep_classes = {'article-header', 'article-content',
                'article-media', 'article-author', 'article-text'}
remove_classes = {'aside-banner', 'moreToExplore', 'article-footer'}


class ScientificAmerican(BasicNewsRecipe):
    """Recipe for the current monthly issue of Scientific American."""

    title = u'Scientific American'
    description = u'Popular Science. Monthly magazine. Should be downloaded around the middle of each month.'
    category = 'science'
    __author__ = 'Kovid Goyal'
    no_stylesheets = True
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    timefmt = ' [%B %Y]'
    # Credentials are only used when both are supplied (see get_browser).
    needs_subscription = 'optional'

    # Keep only elements carrying one of the article classes ...
    keep_only_tags = [
        dict(attrs={'class': _class_matcher(keep_classes)}),
    ]
    # ... and drop banners, footers and "see also" blocks.
    remove_tags = [
        dict(attrs={'class': _class_matcher(remove_classes)}),
        dict(id=['seeAlsoLinks']),
    ]

    def get_browser(self, *args, **kwargs):
        """Return a browser, logged in when a username and password are set.

        All arguments are forwarded to ``BasicNewsRecipe.get_browser``.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        if self.username and self.password:
            br.open(LOGIN_URL)
            br.select_form(predicate=lambda f: f.attrs.get('id') == 'login')
            br['emailAddress'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Return the feeds of the current issue as ``(title, articles)`` pairs.

        Also sets ``self.cover_url`` when a cover image is found.

        Raises:
            IndexError: if the link to the current issue cannot be found.
        """
        # Get the cover and the issue URL from the magazine landing page.
        select = Select(self.index_to_soup(ISSUE_INDEX_URL, as_tree=True))

        issue_link = _first(select('main .store-listing__img a'))
        if issue_link is None:
            raise IndexError(
                'Could not find the current issue link on %s' % ISSUE_INDEX_URL)
        url = absurl(issue_link.get('href', ''))

        cover = _first(select('main .store-listing__img img'))
        if cover is None:
            # A missing cover is not worth aborting the whole download for.
            self.log.warn('No cover image found on %s' % ISSUE_INDEX_URL)
        else:
            self.cover_url = absurl(cover.get('src', ''))

        # Now parse the actual issue to get the list of articles.
        select = Select(self.index_to_soup(url, as_tree=True))
        feeds = []
        for i, section in enumerate(select('#sa_body .toc-articles')):
            if i == 0:
                # The first table-of-contents block holds the features.
                feeds.append(
                    ('Features',
                     list(self.parse_sciam_features(select, section))))
            else:
                feeds.extend(self.parse_sciam_departments(select, section))
        return feeds

    def parse_sciam_features(self, select, section):
        """Yield one article dict per feature article found in *section*.

        Each dict has the keys ``title``, ``url`` and ``description``.
        Articles without an ``id`` attribute are logged and skipped.
        """
        for article in select('article[data-article-title]', section):
            title = article.get('data-article-title')
            url = _article_url(article)
            if url is None:
                self.log.warn(
                    'Skipping feature article without an id: %s' % title)
                continue

            # Description: first body paragraph, then the first meta line.
            desc = ''
            body = _first(select('p.t_body', article))
            if body is not None:
                desc += self.tag_to_string(body)
            meta = _first(select('.t_meta', article))
            if meta is not None:
                desc += ' ' + self.tag_to_string(meta)

            self.log('Found feature article: %s at %s' % (title, url))
            self.log('\t' + desc)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_sciam_departments(self, select, section):
        """Yield ``(section_title, articles)`` pairs for the departments.

        A list item containing a ``span.department-title`` starts a new
        department; items seen before the first such heading are grouped
        under ``'Unknown'``.  Items without a title heading are skipped, as
        are items without an ``id`` attribute (the latter are logged).
        """
        section_title, articles = 'Unknown', []
        for li in select('li[data-article-title]', section):
            heading = _first(select('span.department-title', li))
            if heading is not None:
                # Flush the previous department before starting a new one.
                if articles:
                    yield section_title, articles
                section_title, articles = self.tag_to_string(heading), []
                self.log('\nFound section: %s' % section_title)

            h2 = _first(select('h2.t_listing-title', li))
            if h2 is None:
                continue
            title = self.tag_to_string(h2)

            url = _article_url(li)
            if url is None:
                self.log.warn('Skipping article without an id: %s' % title)
                continue

            articles.append(
                {'title': title, 'url': url, 'description': ''})
            self.log('\tFound article: %s at %s' % (title, url))
        if articles:
            yield section_title, articles