From 42b04ddeeb3da954aa79b645d5d5d9d557e97e89 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 12 Aug 2021 14:45:46 +0530
Subject: [PATCH] Update The Atlantic

---
 recipes/atlantic.recipe     | 84 +++++++++++++++++++++++++++++++++----
 recipes/atlantic_com.recipe | 84 +++++++++++++++++++++++++++++++++----
 2 files changed, 154 insertions(+), 14 deletions(-)

diff --git a/recipes/atlantic.recipe b/recipes/atlantic.recipe
index 612ed029a4..ed9ae4eb6f 100644
--- a/recipes/atlantic.recipe
+++ b/recipes/atlantic.recipe
@@ -1,18 +1,72 @@
 #!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
 from __future__ import unicode_literals
 
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
-'''
-Recipe for web and magazine versions of the The Atlantic
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
+import json
+from xml.sax.saxutils import escape, quoteattr
 
+from calibre.web.feeds.news import BasicNewsRecipe
 
 web_version = False
 test_article = None
 # test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
 
+
+# {{{ parse article JSON
+def process_image_block(lines, block):
+    caption = block.get('captionText')
+    caption_lines = []
+    if caption:
+        if block.get('attributionText', '').strip():
+            caption += ' (' + block['attributionText'] + ')'
+        caption_lines.append('<div style="text-align: center; font-size: smaller">' + caption + '</div>')
+    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
+    lines.extend(caption_lines)
+    lines.append('</div>')
+
+
+def json_to_html(raw):
+    data = json.loads(raw)
+    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
+    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
+    article = json.loads(data)['article']
+    lines = []
+    lines.append('<h1 style="align: center">' + escape(article['title']) + '</h1>')
+    lines.append('<h2 style="align: center">' + escape(article['dek']) + '</h2>')
+    auts = ', '.join(x['displayName'] for x in article['authors'])
+    if auts:
+        lines.append('<p style="align: center">by ' + escape(auts) + '</p>')
+    if article.get('leadArt') and 'image' in article['leadArt']:
+        process_image_block(lines, article['leadArt']['image'])
+    for item in article['content']:
+        tn = item.get('__typename', '')
+        if tn.endswith('Image'):
+            process_image_block(lines, item)
+            continue
+        html = item.get('innerHtml')
+        if html is None or '</iframe>' in html:
+            continue
+        if 'innerHtml' not in item:
+            continue
+        tagname = item.get('tagName', 'P').lower()
+        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
+    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
+
+
+class NoJSON(ValueError):
+    pass
+
+
+def extract_html(soup):
+    script = soup.findAll('script', id='__NEXT_DATA__')
+    if not script:
+        raise NoJSON('No script tag with JSON data found')
+    raw = script[0].contents[0]
+    return json_to_html(raw)
+
+# }}}
+
+
 def classes(classes):
     q = frozenset(classes.split(' '))
     return dict(
@@ -58,7 +112,7 @@ class TheAtlantic(BasicNewsRecipe):
         ),
         dict(itemprop='articleBody'),
         # these are for photos articles
-        dict(id='article-header'),
+        dict(id=['article-header', 'from-json-by-calibre']),
         classes('photos'),
     ]
     remove_tags = [
@@ -104,6 +158,15 @@
             br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
         return br
 
+    def preprocess_raw_html(self, raw_html, url):
+        try:
+            return extract_html(self.index_to_soup(raw_html))
+        except NoJSON:
+            self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
+        except Exception:
+            self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
+        return raw_html
+
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'data-srcset': True}):
             # img['src'] = img['data-srcset'].split()[0]
@@ -203,3 +266,10 @@ class TheAtlantic(BasicNewsRecipe):
         if current_articles:
             feeds.append((current_section, current_articles))
         return feeds
+
+
+if __name__ == '__main__':
+    import sys
+
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+    print(extract_html(BeautifulSoup(open(sys.argv[-1]).read())))
diff --git a/recipes/atlantic_com.recipe b/recipes/atlantic_com.recipe
index 2a9e6169d4..74688cddb1 100644
--- a/recipes/atlantic_com.recipe
+++ b/recipes/atlantic_com.recipe
@@ -1,18 +1,72 @@
 #!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
 from __future__ import unicode_literals
 
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
-'''
-Recipe for web and magazine versions of the The Atlantic
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
+import json
+from xml.sax.saxutils import escape, quoteattr
 
+from calibre.web.feeds.news import BasicNewsRecipe
 
 web_version = True
 test_article = None
 # test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
 
+
+# {{{ parse article JSON
+def process_image_block(lines, block):
+    caption = block.get('captionText')
+    caption_lines = []
+    if caption:
+        if block.get('attributionText', '').strip():
+            caption += ' (' + block['attributionText'] + ')'
+        caption_lines.append('<div style="text-align: center; font-size: smaller">' + caption + '</div>')
+    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
+    lines.extend(caption_lines)
+    lines.append('</div>')
+
+
+def json_to_html(raw):
+    data = json.loads(raw)
+    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
+    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
+    article = json.loads(data)['article']
+    lines = []
+    lines.append('<h1 style="align: center">' + escape(article['title']) + '</h1>')
+    lines.append('<h2 style="align: center">' + escape(article['dek']) + '</h2>')
+    auts = ', '.join(x['displayName'] for x in article['authors'])
+    if auts:
+        lines.append('<p style="align: center">by ' + escape(auts) + '</p>')
+    if article.get('leadArt') and 'image' in article['leadArt']:
+        process_image_block(lines, article['leadArt']['image'])
+    for item in article['content']:
+        tn = item.get('__typename', '')
+        if tn.endswith('Image'):
+            process_image_block(lines, item)
+            continue
+        html = item.get('innerHtml')
+        if html is None or '</iframe>' in html:
+            continue
+        if 'innerHtml' not in item:
+            continue
+        tagname = item.get('tagName', 'P').lower()
+        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
+    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
+
+
+class NoJSON(ValueError):
+    pass
+
+
+def extract_html(soup):
+    script = soup.findAll('script', id='__NEXT_DATA__')
+    if not script:
+        raise NoJSON('No script tag with JSON data found')
+    raw = script[0].contents[0]
+    return json_to_html(raw)
+
+# }}}
+
+
 def classes(classes):
     q = frozenset(classes.split(' '))
     return dict(
@@ -58,7 +112,7 @@ class TheAtlantic(BasicNewsRecipe):
         ),
         dict(itemprop='articleBody'),
         # these are for photos articles
-        dict(id='article-header'),
+        dict(id=['article-header', 'from-json-by-calibre']),
         classes('photos'),
     ]
     remove_tags = [
@@ -104,6 +158,15 @@
             br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
         return br
 
+    def preprocess_raw_html(self, raw_html, url):
+        try:
+            return extract_html(self.index_to_soup(raw_html))
+        except NoJSON:
+            self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
+        except Exception:
+            self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
+        return raw_html
+
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'data-srcset': True}):
             # img['src'] = img['data-srcset'].split()[0]
@@ -203,3 +266,10 @@ class TheAtlantic(BasicNewsRecipe):
         if current_articles:
             feeds.append((current_section, current_articles))
         return feeds
+
+
+if __name__ == '__main__':
+    import sys
+
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+    print(extract_html(BeautifulSoup(open(sys.argv[-1]).read())))