#!/usr/bin/env python2

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal '
'''
nymag.com
'''
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Return a tag-matching spec for elements carrying any of the given classes.

    ``classes`` is a space-separated string of CSS class names. The result is
    a dict of the form ``{'attrs': {'class': <predicate>}}``; the predicate
    receives a tag's ``class`` attribute value and is truthy when that value
    shares at least one whitespace-separated name with ``classes``.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        # ``x and ...`` keeps the predicate falsy for tags with no class.
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class NewYorkMagazine(BasicNewsRecipe):
    """Recipe that builds an issue from the nymag.com table of contents."""

    title = 'New York Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Food, culture, arts and entertainment in New York'
    language = 'en'
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    # Multi-page articles link to index2.html, index3.html, ...; following
    # one level of links matching this pattern pulls those pages in.
    recursions = 1
    match_regexps = [r'http://nymag.com/.+/index[0-9]{1,2}.html$']
    keep_only_tags = [
        classes('lede-text headline-primary article-timestamp by-authors'),
        dict(id='main'),
        dict(itemprop='articleBody'),
    ]
    remove_tags = [
        classes('related-stories start-discussion'),
        dict(id=['minibrowserbox', 'article-related', 'article-tools'])
    ]
    remove_attributes = ['srcset']
    handle_gzip = True

    # Prepended to site-relative article links found in the index.
    PREFIX = 'http://nymag.com'

    def nymag_get_index(self):
        """Fetch and parse the table-of-contents page."""
        return self.index_to_soup('http://nymag.com/includes/tableofcontents.htm')

    def parse_index(self):
        """Return ``[(section_title, [article_dict, ...]), ...]`` for the issue.

        ``h4`` elements start a new section and ``h5`` elements are articles
        of the current section. Articles that appear before the first ``h4``
        are filed under 'Cover Story'. Sections without articles are omitted.
        As a side effect, ``self.cover_url`` is set when a cover image is
        present on the page.
        """
        soup = self.nymag_get_index()

        # A missing cover is cosmetic; do not abort the download over it.
        cover = soup.find(attrs={'class': 'cover'})
        cover_img = cover.find('img', src=True) if cover is not None else None
        if cover_img is not None:
            self.cover_url = cover_img.get('src')
        else:
            self.log('\tCover image not found')

        feeds = []
        current_section = 'Cover Story'
        current_articles = []
        for h in soup.findAll(['h4', 'h5']):
            if h.name == 'h4':
                # A new section heading closes the previous section.
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(h)
                self.log('\tFound section:', current_section)
                current_articles = []
                continue

            # Only h4/h5 were requested, so anything else here is an h5.
            title = self.tag_to_string(h)
            a = h.find('a', href=True)
            if a is None:
                continue
            url = a.get('href')
            if url.startswith('/'):
                url = self.PREFIX + url
            if not (title and url):
                continue
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', url)
            desc = ''
            p = h.findNextSibling('p')
            if p is not None:
                desc = self.tag_to_string(p)
                self.log('\t\t\t', desc)
            current_articles.append({
                'title': title,
                'url': url,
                'date': '',
                'description': desc,
            })

        # Sections are otherwise only flushed when the next h4 is seen, so
        # the final one must be appended here or it would be lost.
        if current_section and current_articles:
            feeds.append((current_section, current_articles))
        return feeds

    def postprocess_html(self, soup, first):
        """Strip pagination widgets, and header spacing on follow-up pages.

        ``first`` is true for the first page of an article; elements with
        class 'header-spacing' are removed only from the other pages.
        Returns the modified ``soup``.
        """
        for x in soup.findAll(attrs={'class': 'page-navigation'}):
            x.extract()
        if not first:
            for x in soup.findAll(attrs={'class': 'header-spacing'}):
                x.extract()
        return soup