import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class Smithsonian(BasicNewsRecipe):

    title = 'Smithsonian Magazine'
    __author__ = 'Kovid Goyal'
    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'  # noqa
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    keep_only_tags = [dict(name='main', attrs={'class': 'main'})]
    remove_tags = [
        # Drop the duplicate teasers that are only shown at other screen sizes
        dict(attrs={'class': lambda x: x and set(x.split()).intersection(
            {'hidden-phone', 'hidden-tablet', 'hidden-desktop'})}),
        dict(attrs={'class': ['slideshow-nav', 'associated-container']}),
    ]
    remove_tags_after = dict(
        name='div', attrs={'class': lambda x: x and 'article-body' in x.split()})
    remove_javascript = True
    no_stylesheets = True

    def parse_index(self):
        # Find the most recent issue in the archive
        soup = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/?no-ist')
        li = soup.find('li', attrs={'class': 'issue'})
        url_prefix = 'http://www.smithsonianmag.com'
        current_issue_url = url_prefix + li.find('a', href=True)['href'] + '?no-ist'
        self.log('Downloading issue:', current_issue_url)
        soup = self.index_to_soup(current_issue_url)

        # Go to the main body
        div = soup.find('div', id='Page-Content')

        # Find the issue date (the text after the colon in the page heading)
        date = re.sub(r'.*:\W*', '', self.tag_to_string(div.find('h1')).strip())
        self.timefmt = ' [%s]' % date

        # Find the cover image
        self.cover_url = div.find('img', alt=lambda x: x and 'Cover' in x, src=True)['src']

        # Group the article teasers by the section headings on the issue page
        feeds = OrderedDict()
        for div in soup.findAll('div', attrs={'class': 'article-list'}):
            section_title = self.tag_to_string(
                div.find('h2', attrs={'class': 'headline'})).capitalize()
            self.log('\n\nFound section:', section_title)
            articles = feeds[section_title] = []
            for sec in div.findAll(
                    'section', attrs={'class': lambda x: x and 'article-teaser' in x.split()}):
                head = sec.find(attrs={'class': 'headline'})
                url = head.find('a', href=True)['href'] + '?all&no-ist'
                if url.startswith('/'):
                    url = url_prefix + url
                title = self.tag_to_string(head)
                desc = sec.find(attrs={'class': 'sub-title'})
                desc = '' if desc is None else self.tag_to_string(desc)
                self.log('Found article:', title)
                self.log('\t', url)
                articles.append({'title': title, 'url': url, 'description': desc})

        # calibre expects a list of (section title, list of article dicts) pairs
        return [(key, val) for key, val in feeds.items()]
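
# A quick way to exercise parse_index() during development is calibre's command
# line converter. A minimal sketch, assuming this file is saved as
# Smithsonian.recipe (the file name and output path are assumptions); --test
# restricts the fetch to a couple of sections and articles per section:
#
#   ebook-convert Smithsonian.recipe smithsonian.epub --test -vv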