import json

try:
    from urllib import unquote  # Python 2
except ImportError:
    from urllib.parse import unquote  # Python 3

try:  # Python 2 text types
    _text_type = unicode
    _string_types = basestring
except NameError:  # Python 3
    _text_type = _string_types = str


class Tag(list):
    """A minimal HTML element: a list of children plus a name and attributes.

    Children are appended with the normal ``list`` API and may be strings or
    other ``Tag`` instances. Converting the tag to text serializes it, and
    all of its children recursively, as ``<name attrs>children</name>``.
    """

    def __init__(self, name, **attrs):
        """Create an empty element called *name* with attributes *attrs*."""
        list.__init__(self)
        self.name = name
        self.attrs = attrs

    def __str__(self):
        """Return the element serialized as markup."""
        parts = ['<' + self.name]
        for key, value in self.attrs.items():
            # A raw double quote would terminate the attribute value early
            # and corrupt the rest of the tag.
            escaped = _text_type(value).replace('"', '&quot;')
            parts.append(' ' + key + '="' + escaped + '"')
        parts.append('>')
        # NOTE(review): string children are emitted verbatim; this assumes
        # the incoming text is already markup-safe -- confirm with the feed.
        parts.extend(_text_type(child) for child in self)
        # The end tag needs the leading '</', otherwise the output is
        # '<p>text p>' rather than '<p>text</p>'.
        parts.append('</' + self.name + '>')
        return ''.join(parts)


def deserialize(node):
    """Convert a decoded JSON node into a ``Tag`` tree.

    *node* is a list whose first item is the element name and whose
    remaining items are children: nested lists are converted recursively,
    strings become text children, and anything else is ignored. A node named
    ``'inline-embed'`` carries a metadata dict as its second item; its
    ``'type'`` selects the element: ``'image'``/``'cartoon'`` give an ``img``
    whose ``src`` is read from the URL-quoted JSON stored under ``'meta'``,
    ``'section'`` gives a ``div`` and every other type gives a ``span``.

    The input is read by index and slice, never popped, so the caller's data
    is left intact and the same node can be deserialized more than once.

    Raises ``IndexError`` if *node* is empty or an ``'inline-embed'`` node
    has no metadata item, and ``KeyError`` if the metadata lacks an expected
    key.
    """
    name = node[0]
    if name == 'inline-embed':
        meta = node[1]
        children = node[2:]
        kind = meta['type']
        if kind in ('image', 'cartoon'):
            meta = json.loads(unquote(meta['meta']))
            ans = Tag('img', src=absurl(meta['url']))
        elif kind == 'section':
            ans = Tag('div')
        else:
            ans = Tag('span')
    else:
        children = node[1:]
        ans = Tag(name)
    for child in children:
        if isinstance(child, list):
            ans.append(deserialize(child))
        elif isinstance(child, _string_types):
            ans.append(child)
    return ans
dict(attrs={'class':lambda x: x and 'ArticleContributors__bio___' in x}), @@ -43,9 +90,18 @@ class NewYorker(BasicNewsRecipe): dict(attrs={'class': lambda x: x and set(x.split()).intersection( {'content-ad-wrapper', 'social-hover', 'background-image'})}), dict(id=['newsletter-signup']), - dict(name='meta link'.split()), + dict(name='meta links source'.split()), ] + # def preprocess_raw_html(self, raw, url): + # import re + # try: + # raw = re.search(r'window.__TNY__.INITIAL_STATE = ({.+?)