#!/usr/bin/env python2
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe


def absurl(url):
    # The table of contents uses site-relative links; make them absolute
    if url.startswith('/'):
        url = 'http://www.scientificamerican.com' + url
    return url


class ScientificAmerican(BasicNewsRecipe):
    title = u'Scientific American'
    description = u'Popular Science. Monthly magazine.'
    category = 'science'
    __author__ = 'Kovid Goyal'
    no_stylesheets = True
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    needs_subscription = 'optional'

    keep_only_tags = [
        dict(attrs={'class': [
            'article-title', 'article-dek', 'article-author article-date',
            'article-content', 'article-slatwallPayWall']}),
    ]

    def get_browser(self, *args):
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            # Log in so that subscriber-only content is downloadable
            br.open('https://www.scientificamerican.com/my-account/login/')
            br.select_form(predicate=lambda f: f.attrs.get('id') == 'login')
            br['emailAddress'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        # Get the cover, date and issue URL from the magazine archive page
        root = self.index_to_soup(
            'http://www.scientificamerican.com/sciammag/', as_tree=True)
        for a in root.xpath('''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' archiveIssues ')]/descendant-or-self::*/a[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover ') and (@href)]'''):
            self.cover_url = absurl(a.xpath('descendant-or-self::img[@src]')[0].get('src'))
            root = self.index_to_soup(absurl(a.get('href')), as_tree=True)
            # The sibling link of the cover holds the issue date; show it
            # in the book title via timefmt
            for a in a.xpath('following-sibling::a[@href]'):
                self.timefmt = self.tag_to_string(a).strip()
                break
            break
        else:
            raise ValueError(
                'The Scientific American website has changed, this recipe needs to be updated')

        # Now parse the actual issue to get the list of articles
        feeds = []
        for i, div in enumerate(root.xpath(
            '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' toc-features ')] |
            descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' toc-departments ')]''')):
            if i == 0:
                feeds.append(('Features', list(self.parse_sciam_features(div))))
            else:
                feeds.extend(self.parse_sciam_departments(div))
        return feeds

    def parse_sciam_features(self, div):
        # Feature articles are <h4> headings inside links in the TOC list
        for h4 in div.xpath('descendant-or-self::li/descendant-or-self::*/a[@href]/descendant-or-self::*/h4'):
            title = self.tag_to_string(h4)
            a = h4.getparent()
            url = absurl(a.get('href'))
            desc = ''
            # The article summary, if any, is in the span following the link
            for span in a.xpath('following-sibling::span'):
                desc = self.tag_to_string(span)
                break
            self.log('Found feature article: %s at %s' % (title, url))
            self.log('\t' + desc)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_sciam_departments(self, div):
        # Departments interleave section headers (links inside a deptTitle
        # span) with article headings (<h3> inside links); accumulate
        # articles and yield a (section, articles) pair at each new header
        section_title, articles = 'Unknown', []
        for x in div.xpath(
            '''descendant-or-self::li/descendant-or-self::*/a[@href]/descendant-or-self::*/h3 |
            descendant-or-self::li/descendant-or-self::*/span[@class and contains(concat(' ', normalize-space(@class), ' '), ' deptTitle ')]/descendant-or-self::*/a[@href]'''):
            if x.tag == 'a':
                if articles:
                    yield section_title, list(articles)
                section_title = self.tag_to_string(x)
                del articles[:]
                self.log('\nFound section: %s' % section_title)
            else:
                title = self.tag_to_string(x)
                a = x.getparent()
                url = absurl(a.get('href'))
                articles.append({'title': title, 'url': url, 'description': ''})
                self.log('\tFound article: %s at %s' % (title, url))
        # Emit the final section, which has no following header to trigger
        # the yield above
        if articles:
            yield section_title, articles
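
# A minimal way to try this recipe out (a usage sketch, not part of the
# recipe itself): calibre's ebook-convert can build a recipe file directly.
# The e-mail address and output filename below are placeholders.
#
#   ebook-convert scientific_american.recipe scientific_american.epub \
#       --username you@example.com --password secret
#
# Since needs_subscription is 'optional', the credential flags may be
# omitted, in which case only freely available content is fetched.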