#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal

import json
import re
from collections import defaultdict

from calibre import browser
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Return a tag matcher that accepts any of the given CSS classes.

    ``classes`` is a space separated string of class names. The result is a
    dict with a single ``attrs`` entry whose ``class`` predicate is truthy
    when the tag's class attribute shares at least one name with ``classes``.
    """
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})


def absurl(x):
    """Prefix site-relative paths (``/foo``) with the New Yorker origin.

    Values that do not start with a single ``/`` (including ``//host/...``)
    are returned unchanged.
    """
    if x.startswith('/') and not x.startswith('//'):
        x = 'https://www.newyorker.com' + x
    return x


class NewYorker(BasicNewsRecipe):
    """Recipe that builds an issue from the New Yorker magazine index page."""

    title = u'New Yorker Magazine'
    description = u'Content from the New Yorker website'

    url_list = []
    language = 'en'
    __author__ = 'Kovid Goyal'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
    .byline { font-size:xx-small; font-weight: bold;}
    h3 { margin-bottom: 6px; }
    .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
    '''

    # Class names on the site carry a generated suffix after the triple
    # underscore, hence the substring tests instead of exact matches.
    keep_only_tags = [
        dict(attrs={'class': lambda x: x and 'ArticleHeader__hed___' in x}),
        dict(attrs={'class': lambda x: x and 'ArticleHeader__dek___' in x}),
        dict(attrs={'class': lambda x: x and 'Byline__articleHeader___' in x}),
        dict(attrs={'class': lambda x: x and 'ArticleLedeImage__container___' in x}),
        dict(itemprop=['headline', 'alternativeHeadline']),
        dict(name='h1'),
        classes(
            'featured-image byline-and-date inset-mobile-crop-image hero-image-caption'
        ),
        dict(id=['articleBody', 'article-content']),
        dict(attrs={'class': lambda x: x and 'ArticleDisclaimer__articleDisclaimer___' in x}),
        dict(attrs={'class': lambda x: x and 'ArticleContributors__bio___' in x}),
    ]
    remove_tags = [
        classes('content-ad-wrapper social-hover background-image'),
        dict(id=['newsletter-signup']),
        dict(attrs={'class': lambda x: x and 'ImageEmbed__button___' in x}),
        dict(attrs={'class': lambda x: x and 'ArticleLedeImage__button___' in x}),
        dict(name='links source'.split()),
    ]
    remove_attributes = ['style']

    def parse_index(self):
        """Scrape the magazine index page.

        Sets ``self.cover_url`` when a cover image is found and returns a
        list of ``(section, articles)`` tuples sorted by section name, where
        each article is a dict with ``title``, ``url`` and ``description``.
        Index items without a usable headline link are logged and skipped.
        """
        soup = self.index_to_soup(
            'https://www.newyorker.com/magazine?intcid=magazine')
        # soup = self.index_to_soup('file:///t/raw.html')
        cover_img = soup.find(
            attrs={'class': lambda x: x and 'MagazineCover__cover___' in x})
        if cover_img is not None:
            cover_img = cover_img.find('img')
            if cover_img is not None:
                self.cover_url = cover_img.get('src')
                self.log('Found cover:', self.cover_url)

        stories = defaultdict(list)
        last_section = 'Unknown'
        for story in soup.findAll(
                attrs={'class': lambda x: x and 'River__riverItemContent___' in x}):
            # The section name is read from the title attribute of the first
            # link; items without one inherit the previous item's section.
            first_link = story.find('a')
            section_name = None
            if first_link is not None:
                section_name = first_link.get('title')
            section = self.tag_to_string(section_name) or last_section
            last_section = section

            h4 = story.find('h4')
            if h4 is None:
                self.log('Skipping index item without a headline')
                continue
            title = self.tag_to_string(h4)
            # The headline is expected to be wrapped in the article link.
            a = h4.parent
            href = a.get('href') if a is not None else None
            if not href:
                self.log('Skipping index item without a link:', title)
                continue
            url = absurl(href)

            desc = ''
            body = story.find(attrs={'class': 'River__dek___CayIg'})
            if body is not None and body.contents:
                # The first child may be a tag rather than text; convert it so
                # the string concatenation in the log call below cannot fail.
                desc = self.tag_to_string(body.contents[0])
            self.log('Found article:', title)
            self.log('\t' + url)
            self.log('\t' + desc)
            self.log('')
            stories[section].append({
                'title': title,
                'url': url,
                'description': desc})

        return [(k, stories[k]) for k in sorted(stories)]

    def preprocess_raw_html(self, html, url):
        """Record the featured image URL embedded in the page's JSON, if any.

        Stores the decoded URL (or None) in ``self.featured_image`` and
        returns ``html`` unchanged.
        """
        self.featured_image = None
        m = re.search(r'"featured_image".+?,"url":("https[^"]+")', html)
        if m is not None:
            # The capture includes the quotes, so it is a valid JSON string.
            self.featured_image = json.loads(m.group(1))
            self.log('Found featured image in JSON at', url, ':', self.featured_image)
        return html

    def preprocess_html(self, soup):
        """Add a missing headline and normalise image sources.

        When the body has no ``<h1>``, one is created from the
        ``<meta itemprop="name">`` content, followed by the featured image if
        one was recorded. Every ``<img>`` then has its ``srcset`` and
        ``data-src-mobile`` attributes removed, with the first
        whitespace-separated token of each used as ``src``.
        """
        body = soup.find('body')
        if body is not None and not body.find('h1'):
            title = soup.find('meta', itemprop='name')
            heading = title.get('content') if title else None
            if heading:
                featured_image = getattr(self, 'featured_image', None)
                if featured_image:
                    img = Tag(soup, 'img')
                    img['src'] = featured_image
                    div = Tag(soup, 'div')
                    div.append(img)
                    body.insert(0, div)
                # Inserted after the image so the headline ends up first.
                h1 = Tag(soup, 'h1')
                h1.append(heading)
                body.insert(0, h1)

        images = soup.findAll('img')
        for attr in 'srcset data-src-mobile'.split():
            for img in images:
                try:
                    value = img[attr]
                except KeyError:
                    continue
                del img[attr]
                candidates = value.split()
                if candidates:
                    img['src'] = candidates[0]
        return soup

    # The New Yorker changes the content it delivers based on cookies, so the
    # following ensures that we send no cookies

    def get_browser(self, *args, **kwargs):
        """Return the recipe itself, which stands in for the browser object."""
        return self

    def clone_browser(self, *args, **kwargs):
        """Return the same stand-in object as ``get_browser``."""
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        """Open the request with a newly created browser on every call."""
        br = browser()
        return br.open_novisit(*args, **kwargs)

    open = open_novisit