import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class Smithsonian(BasicNewsRecipe):

    title = 'Smithsonian Magazine'
    __author__ = 'Kovid Goyal'
    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'  # noqa
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    # Keep only the main article content
    keep_only_tags = [dict(name='main', attrs={'class': 'main'})]
    remove_tags = [
        # Elements marked as hidden for one or more device classes
        dict(attrs={'class': lambda x: x and set(x.split()).intersection(
            {'hidden-phone', 'hidden-tablet', 'hidden-desktop'})}),
        dict(attrs={'class': ['slideshow-nav', 'associated-container']}),
    ]
    remove_tags_after = dict(
        name='div', attrs={'class': lambda x: x and 'article-body' in x.split()})
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        # Go to the issue archive and pick the latest issue
        soup = self.index_to_soup(
            'http://www.smithsonianmag.com/issue/archive/?no-ist')
        li = soup.find('li', attrs={'class': 'issue'})
        url_prefix = 'http://www.smithsonianmag.com'
        current_issue_url = url_prefix + li.find('a', href=True)['href'] + '?no-ist'
        self.log('Downloading issue:', current_issue_url)
        soup = self.index_to_soup(current_issue_url)

        # Go to the main body
        div = soup.find('div', id='Page-Content')

        # Find the issue date: strip everything up to and including the
        # colon in the <h1> text
        date = re.sub(r'.*:\W*', '', self.tag_to_string(div.find('h1')).strip())
        self.timefmt = u' [%s]' % date

        # Find the cover image
        self.cover_url = div.find(
            'img', alt=lambda x: x and 'Cover' in x, src=True)['src']

        # Collect articles, grouped by section
        feeds = OrderedDict()
        for div in soup.findAll('div', attrs={'class': 'article-list'}):
            section_title = self.tag_to_string(
                div.find('h2', attrs={'class': 'headline'})).capitalize()
            self.log('\n\nFound section:', section_title)
            articles = feeds[section_title] = []
            for sec in div.findAll('section', attrs={'class': lambda x: x and 'article-teaser' in x.split()}):
                head = sec.find(attrs={'class': 'headline'})
                url = head.find('a', href=True)['href'] + '?all&no-ist'
                if url.startswith('/'):
                    url = url_prefix + url
                title = self.tag_to_string(head)
                desc = sec.find(attrs={'class': 'sub-title'})
                desc = '' if desc is None else self.tag_to_string(desc)
                self.log('Found article:', title)
                self.log('\t', url)
                articles.append(
                    {'title': title, 'url': url, 'description': desc})

        return list(feeds.items())