Update The Atlantic

This commit is contained in:
Kovid Goyal 2021-08-12 14:45:46 +05:30
parent d7d329554b
commit 42b04ddeeb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 154 additions and 14 deletions

View File

@ -1,18 +1,72 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import unicode_literals from __future__ import unicode_literals
__license__ = 'GPL v3' import json
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' from xml.sax.saxutils import escape, quoteattr
'''
Recipe for web and magazine versions of the The Atlantic
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
web_version = False web_version = False
test_article = None test_article = None
# test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed' # test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
# {{{ parse article JSON
def process_image_block(lines, block):
    """Append a centered <div> containing the image (and an optional
    italic caption with attribution) from *block* to *lines*."""
    cap = block.get('captionText')
    cap_html = []
    if cap:
        attribution = block.get('attributionText', '')
        if attribution.strip():
            cap = cap + ' (' + attribution + ')'
        cap_html.append('<p style="font-style: italic">' + cap + '</p>')
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
    lines.extend(cap_html)
    lines.append('</div>')
def json_to_html(raw):
    """Convert the raw __NEXT_DATA__ JSON payload of an article page
    into a self-contained HTML document string.

    The urqlState mapping holds several GraphQL result blobs; the
    longest one is assumed to contain the article data.
    """
    data = json.loads(raw)
    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
    article = json.loads(data)['article']
    lines = []
    lines.append('<h1 style="align: center">' + escape(article['title']) + '</h1>')
    lines.append('<h2 style="align: center">' + escape(article['dek']) + '</h2>')
    auts = ', '.join(x['displayName'] for x in article['authors'])
    if auts:
        lines.append('<p style="align: center">by ' + escape(auts) + '</p>')
    if article.get('leadArt') and 'image' in article['leadArt']:
        process_image_block(lines, article['leadArt']['image'])
    for item in article['content']:
        tn = item.get('__typename', '')
        if tn.endswith('Image'):
            process_image_block(lines, item)
            continue
        html = item.get('innerHtml')
        # Skip items with no usable HTML, and embedded iframes which do
        # not render in an ebook. (A separate `'innerHtml' not in item`
        # check was redundant: get() already returns None in that case.)
        if html is None or '</iframe>' in html:
            continue
        tagname = item.get('tagName', 'P').lower()
        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
class NoJSON(ValueError):
    """Raised when a page contains no __NEXT_DATA__ JSON script tag."""
def extract_html(soup):
    """Find the page's __NEXT_DATA__ script tag and render its JSON
    payload as article HTML.

    Raises NoJSON when the tag is absent (i.e. not a JSON-driven page).
    """
    tags = soup.findAll('script', id='__NEXT_DATA__')
    if not tags:
        raise NoJSON('No script tag with JSON data found')
    return json_to_html(tags[0].contents[0])
# }}}
def classes(classes): def classes(classes):
q = frozenset(classes.split(' ')) q = frozenset(classes.split(' '))
return dict( return dict(
@ -58,7 +112,7 @@ class TheAtlantic(BasicNewsRecipe):
), ),
dict(itemprop='articleBody'), dict(itemprop='articleBody'),
# these are for photos articles # these are for photos articles
dict(id='article-header'), dict(id=['article-header', 'from-json-by-calibre']),
classes('photos'), classes('photos'),
] ]
remove_tags = [ remove_tags = [
@ -104,6 +158,15 @@ class TheAtlantic(BasicNewsRecipe):
br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com') br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
return br return br
def preprocess_raw_html(self, raw_html, url):
    # Prefer building the article from the page's embedded JSON payload;
    # on any failure return the raw HTML unchanged so the legacy
    # CSS-selector based scraping path still applies.
    try:
        return extract_html(self.index_to_soup(raw_html))
    except NoJSON:
        # No __NEXT_DATA__ tag: not a JSON-driven page, expected for some articles.
        self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
    except Exception:
        # Unexpected parse failure: log with traceback but keep the download usable.
        self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
    return raw_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'data-srcset': True}): for img in soup.findAll('img', attrs={'data-srcset': True}):
# img['src'] = img['data-srcset'].split()[0] # img['src'] = img['data-srcset'].split()[0]
@ -203,3 +266,10 @@ class TheAtlantic(BasicNewsRecipe):
if current_articles: if current_articles:
feeds.append((current_section, current_articles)) feeds.append((current_section, current_articles))
return feeds return feeds
if __name__ == '__main__':
    # Debug helper: run against a saved article page to print the
    # extracted HTML. Use a context manager so the file handle is closed
    # promptly instead of leaking until interpreter exit.
    import sys
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    with open(sys.argv[-1]) as f:
        print(extract_html(BeautifulSoup(f.read())))

View File

@ -1,18 +1,72 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import unicode_literals from __future__ import unicode_literals
__license__ = 'GPL v3' import json
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' from xml.sax.saxutils import escape, quoteattr
'''
Recipe for web and magazine versions of the The Atlantic
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
web_version = True web_version = True
test_article = None test_article = None
# test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed' # test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
# {{{ parse article JSON
def process_image_block(lines, block):
    """Append a centered <div> containing the image (and an optional
    italic caption with attribution) from *block* to *lines*."""
    cap = block.get('captionText')
    cap_html = []
    if cap:
        attribution = block.get('attributionText', '')
        if attribution.strip():
            cap = cap + ' (' + attribution + ')'
        cap_html.append('<p style="font-style: italic">' + cap + '</p>')
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
    lines.extend(cap_html)
    lines.append('</div>')
def json_to_html(raw):
    """Convert the raw __NEXT_DATA__ JSON payload of an article page
    into a self-contained HTML document string.

    The urqlState mapping holds several GraphQL result blobs; the
    longest one is assumed to contain the article data.
    """
    data = json.loads(raw)
    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
    article = json.loads(data)['article']
    lines = []
    lines.append('<h1 style="align: center">' + escape(article['title']) + '</h1>')
    lines.append('<h2 style="align: center">' + escape(article['dek']) + '</h2>')
    auts = ', '.join(x['displayName'] for x in article['authors'])
    if auts:
        lines.append('<p style="align: center">by ' + escape(auts) + '</p>')
    if article.get('leadArt') and 'image' in article['leadArt']:
        process_image_block(lines, article['leadArt']['image'])
    for item in article['content']:
        tn = item.get('__typename', '')
        if tn.endswith('Image'):
            process_image_block(lines, item)
            continue
        html = item.get('innerHtml')
        # Skip items with no usable HTML, and embedded iframes which do
        # not render in an ebook. (A separate `'innerHtml' not in item`
        # check was redundant: get() already returns None in that case.)
        if html is None or '</iframe>' in html:
            continue
        tagname = item.get('tagName', 'P').lower()
        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
class NoJSON(ValueError):
    """Raised when a page contains no __NEXT_DATA__ JSON script tag."""
def extract_html(soup):
    """Find the page's __NEXT_DATA__ script tag and render its JSON
    payload as article HTML.

    Raises NoJSON when the tag is absent (i.e. not a JSON-driven page).
    """
    tags = soup.findAll('script', id='__NEXT_DATA__')
    if not tags:
        raise NoJSON('No script tag with JSON data found')
    return json_to_html(tags[0].contents[0])
# }}}
def classes(classes): def classes(classes):
q = frozenset(classes.split(' ')) q = frozenset(classes.split(' '))
return dict( return dict(
@ -58,7 +112,7 @@ class TheAtlantic(BasicNewsRecipe):
), ),
dict(itemprop='articleBody'), dict(itemprop='articleBody'),
# these are for photos articles # these are for photos articles
dict(id='article-header'), dict(id=['article-header', 'from-json-by-calibre']),
classes('photos'), classes('photos'),
] ]
remove_tags = [ remove_tags = [
@ -104,6 +158,15 @@ class TheAtlantic(BasicNewsRecipe):
br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com') br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
return br return br
def preprocess_raw_html(self, raw_html, url):
    # Prefer building the article from the page's embedded JSON payload;
    # on any failure return the raw HTML unchanged so the legacy
    # CSS-selector based scraping path still applies.
    try:
        return extract_html(self.index_to_soup(raw_html))
    except NoJSON:
        # No __NEXT_DATA__ tag: not a JSON-driven page, expected for some articles.
        self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
    except Exception:
        # Unexpected parse failure: log with traceback but keep the download usable.
        self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
    return raw_html
def preprocess_html(self, soup): def preprocess_html(self, soup):
for img in soup.findAll('img', attrs={'data-srcset': True}): for img in soup.findAll('img', attrs={'data-srcset': True}):
# img['src'] = img['data-srcset'].split()[0] # img['src'] = img['data-srcset'].split()[0]
@ -203,3 +266,10 @@ class TheAtlantic(BasicNewsRecipe):
if current_articles: if current_articles:
feeds.append((current_section, current_articles)) feeds.append((current_section, current_articles))
return feeds return feeds
if __name__ == '__main__':
    # Debug helper: run against a saved article page to print the
    # extracted HTML. Use a context manager so the file handle is closed
    # promptly instead of leaking until interpreter exit.
    import sys
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    with open(sys.argv[-1]) as f:
        print(extract_html(BeautifulSoup(f.read())))