More New Yorker updates

This commit is contained in:
Kovid Goyal 2017-06-17 01:07:30 +05:30
parent 530aef002a
commit 417abafe73
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -2,6 +2,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
import json
from urllib import unquote
from collections import defaultdict from collections import defaultdict
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser from calibre import browser
@@ -13,6 +16,45 @@ def absurl(x):
return x return x
class Tag(list):
    '''
    A minimal HTML element: the list contents are the element's children
    (strings or other Tag instances) and ``attrs`` holds its attributes.
    Serializes itself (recursively) to markup via ``__str__``.
    '''

    def __init__(self, name, **attrs):
        self.name = name    # tag name, e.g. 'img', 'div', 'span'
        self.attrs = attrs  # attribute name -> value (values are not escaped)

    def __str__(self):
        ans = ['<' + self.name]
        # items() instead of the Python-2-only iteritems() so the class
        # also works under Python 3; behavior is identical on Python 2.
        for k, v in self.attrs.items():
            ans.append(' {}="{}"'.format(k, v))
        ans.append('>')
        for child in self:
            # type(u'') is unicode on py2 and str on py3 -- portable
            # replacement for the py2-only unicode() builtin.
            ans.append(type(u'')(child))
        # Note: no self-closing handling; even void elements like <img>
        # get an explicit closing tag, matching the original behavior.
        ans.append('</{}>'.format(self.name))
        return ''.join(ans)
def deserialize(node):
    # Rebuild a Tag tree from a serialized article-body node: a list whose
    # first element is the tag name, optionally followed by a metadata dict
    # (for 'inline-embed' nodes), then child nodes (nested lists) and text.
    name = node.pop(0)
    if name != 'inline-embed':
        ans = Tag(name)
    else:
        meta = node.pop(0)
        embed_type = meta['type']
        if embed_type in ('image', 'cartoon'):
            # The embed's real payload is a URL-encoded JSON blob.
            info = json.loads(unquote(meta['meta']))
            ans = Tag('img', src=absurl(info['url']))
        else:
            ans = Tag('div' if embed_type == 'section' else 'span')
    for child in node:
        if isinstance(child, list):
            ans.append(deserialize(child))
        elif isinstance(child, basestring):
            ans.append(child)
        # anything else (numbers, None, dicts) is deliberately dropped
    return ans
class NewYorker(BasicNewsRecipe): class NewYorker(BasicNewsRecipe):
title = u'New Yorker Magazine' title = u'New Yorker Magazine'
@@ -35,6 +77,11 @@ class NewYorker(BasicNewsRecipe):
dict(attrs={'class':lambda x: x and 'ArticleHeader__dek___' in x}), dict(attrs={'class':lambda x: x and 'ArticleHeader__dek___' in x}),
dict(attrs={'class':lambda x: x and 'Byline__articleHeader___' in x}), dict(attrs={'class':lambda x: x and 'Byline__articleHeader___' in x}),
dict(attrs={'class':lambda x: x and 'ArticleLedeImage__container___' in x}), dict(attrs={'class':lambda x: x and 'ArticleLedeImage__container___' in x}),
dict(itemprop=['headline', 'alternativeHeadline']),
dict(name='h1'),
dict(attrs={'class':lambda x: x and 'byline-and-date' in x}),
dict(attrs={'class':lambda x: x and 'inset-mobile-crop-image' in x}),
dict(attrs={'class':lambda x: x and 'hero-image-caption' in x}),
dict(id='articleBody'), dict(id='articleBody'),
dict(attrs={'class':lambda x: x and 'ArticleDisclaimer__articleDisclaimer___' in x}), dict(attrs={'class':lambda x: x and 'ArticleDisclaimer__articleDisclaimer___' in x}),
dict(attrs={'class':lambda x: x and 'ArticleContributors__bio___' in x}), dict(attrs={'class':lambda x: x and 'ArticleContributors__bio___' in x}),
@@ -43,9 +90,18 @@ class NewYorker(BasicNewsRecipe):
dict(attrs={'class': lambda x: x and set(x.split()).intersection( dict(attrs={'class': lambda x: x and set(x.split()).intersection(
{'content-ad-wrapper', 'social-hover', 'background-image'})}), {'content-ad-wrapper', 'social-hover', 'background-image'})}),
dict(id=['newsletter-signup']), dict(id=['newsletter-signup']),
dict(name='meta link'.split()), dict(name='meta links source'.split()),
] ]
# def preprocess_raw_html(self, raw, url):
# import re
# try:
# raw = re.search(r'window.__TNY__.INITIAL_STATE = ({.+?)</script', raw).group(1)
# except AttributeError:
# return raw
# data = json.loads(raw.strip().rstrip(';'))
# return '<html><body><div id="articleBody">' + unicode(deserialize(data['primary']['body']))
#
def parse_index(self): def parse_index(self):
soup = self.index_to_soup( soup = self.index_to_soup(
'https://www.newyorker.com/magazine?intcid=magazine') 'https://www.newyorker.com/magazine?intcid=magazine')
@@ -80,10 +136,11 @@ class NewYorker(BasicNewsRecipe):
return [(k, stories[k]) for k in sorted(stories)] return [(k, stories[k]) for k in sorted(stories)]
def preprocess_html(self, soup): def preprocess_html(self, soup):
for attr in 'srcset data-src-mobile'.split():
for img in soup.findAll('img'): for img in soup.findAll('img'):
try: try:
ds = img['srcset'].split()[0] ds = img[attr].split()[0]
del img['srcset'] del img[attr]
except KeyError: except KeyError:
continue continue
if ds: if ds: