#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal'
__license__ = 'GPL v3'

'''
calibre recipe for slate.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


def classes(classes):
    # Match any tag whose class attribute shares at least one name with the
    # space-separated list given here, e.g. classes('a b') matches class="b c".
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class Slate(BasicNewsRecipe):
    title = 'Slate'
    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    language = 'en'
    encoding = 'utf-8'
    remove_attributes = ['style', 'height', 'width']
    INDEX = 'https://slate.com/'
    resolve_internal_links = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}

    extra_css = '''
        .article__rubric { font-size:small; color:#404040; }
        .article__byline, time { font-size:small; }
        .article__top-image, .image__credit, .image__meta { font-size:small; text-align:center; }
        em, blockquote, .alternativeHeadline { color:#202020; }
    '''

    keep_only_tags = [
        classes('article__header article__top-image article__content article-hotseats__body'),
    ]
    remove_tags = [
        dict(name=['aside', 'svg', 'iframe']),
        classes('social-share slate-ad newsletter-signup in-article-recirc'),
    ]

    def preprocess_html(self, soup):
        # Demote sub-headings so they do not compete with the article title.
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        # Images are lazy-loaded: copy the real URL from data-src into src and
        # request a 600px-wide rendition.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src'] + '&width=600'
        return soup

    def parse_index(self):
        ans = []
        for sectitle, slug in (
            ('News & Politics', 'news-and-politics'),
            ('Culture', 'culture'),
            ('Technology', 'technology'),
            ('Business', 'business'),
            ('Life', 'life'),
            ('Advice', 'advice'),
        ):
            url = self.INDEX + slug
            self.log('\nFound section:', sectitle, url)
            articles = self.slate_section_articles(url)
            if articles:
                ans.append((sectitle, articles))
        return ans

    def slate_section_articles(self, url):
        from datetime import date
        soup = self.index_to_soup(url)
        ans = []
        # Only follow links dated the current year, e.g. https://slate.com/culture/2024/...
        dt = date.today().strftime('/%Y/')
        for a in soup.findAll('a', attrs={'href': lambda x: x and x.startswith(url + dt)}):
            href = a['href']
            head = a.find(attrs={'class': [
                'section-feed-two-column__card-headline',
                'section-feed-three-column__teaser-headline',
                'section-feed-two-column__teaser-headline',
                'topic-story__hed',
            ]})
            if head:
                title = self.tag_to_string(head).strip()
                self.log('\t' + title)
                self.log('\t\t' + href)
                ans.append({'title': title, 'url': href})
        return ans

    def populate_article_metadata(self, article, soup, first):
        # Use the article dek (standfirst) as the summary shown in the index.
        summ = soup.find(attrs={'class': 'article__dek'})
        if summ:
            article.summary = article.text_summary = self.tag_to_string(summ)
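
# A minimal sketch of how to try this recipe locally with calibre's command-line
# tools (assumes calibre is installed and this file is saved as Slate.recipe;
# --test fetches only a couple of articles per section):
#
#   ebook-convert Slate.recipe .epub --test -vv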