#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal

import json
from xml.sax.saxutils import escape, quoteattr

from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.web.feeds.news import prefixed_classes as prefix_classes

web_version = False
test_article = None
# test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'


# parse article JSON {{{
def process_image_block(lines, block):
    '''Append the HTML for one image block (img plus optional caption) to *lines*.

    :param lines: list of HTML fragments being accumulated by json_to_html
    :param block: dict with 'url' and optionally 'captionText'/'attributionText'
    '''
    caption = block.get('captionText')
    caption_lines = []
    if caption:
        if block.get('attributionText', '').strip():
            caption += ' (' + block['attributionText'] + ')'
        # NOTE(review): the original markup in these literals was lost to an
        # HTML-stripping pass; reconstructed to match the '.caption' rule in
        # this recipe's extra_css — confirm against upstream calibre.
        caption_lines.append('<div class="caption">' + caption + '</div>')
    # '.lead-img' is styled display:block in extra_css below
    lines.append('<div class="lead-img"><img src={}/>'.format(quoteattr(block['url'])))
    lines.extend(caption_lines)
    lines.append('</div>')


def json_to_html(raw):
    '''Convert an article page's __NEXT_DATA__ JSON payload into an HTML document.

    Picks the longest entry in the urqlState GraphQL cache (the full article
    record), then emits title, dek, byline, lead image and the body blocks.

    :param raw: the JSON text of the __NEXT_DATA__ script tag
    :return: a complete HTML document as a string
    '''
    data = json.loads(raw)
    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
    # The full article record is the longest of the cached GraphQL responses
    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
    article = json.loads(data)['article']
    lines = []
    # id="article-header" is whitelisted in keep_only_tags below
    lines.append('<h1 id="article-header">' + escape(article['title']) + '</h1>')
    lines.append('<div class="dek"><i>' + escape(article['dek']) + '</i></div>')
    auts = ', '.join(x['displayName'] for x in article['authors'])
    if auts:
        lines.append('<p class="byline">by ' + escape(auts) + '</p>')
    if article.get('leadArt') and 'image' in article['leadArt']:
        process_image_block(lines, article['leadArt']['image'])
    for item in article['content']:
        tn = item.get('__typename', '')
        if tn.endswith('Image'):
            process_image_block(lines, item)
            continue
        html = item.get('innerHtml')
        # BUG fix: the garbled test `'' in html` is true for EVERY string, so
        # all body content was skipped. The intent is to drop embedded iframes
        # — TODO(review): confirm the exact marker against upstream calibre.
        if html is None or '<iframe' in html:
            continue
        tagname = item.get('tagName', 'P').lower()
        # Closing tag reconstructed; it was eaten by the same markup-stripping
        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
    # id="from-json-by-calibre" is whitelisted in keep_only_tags below
    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
class NoJSON(ValueError):
    # Raised when an article page carries no __NEXT_DATA__ JSON payload
    pass


def extract_html(soup):
    '''Extract article HTML from a soup by way of its __NEXT_DATA__ JSON.

    :raises NoJSON: when the page has no __NEXT_DATA__ script tag
    '''
    script = soup.findAll('script', id='__NEXT_DATA__')
    if not script:
        raise NoJSON('No script tag with JSON data found')
    raw = script[0].contents[0]
    return json_to_html(raw)
# }}}


class TheAtlantic(BasicNewsRecipe):

    # web_version switches between the rolling website feeds and the
    # monthly print-magazine table of contents
    if web_version:
        title = 'TheAtlantic.com'
        description = 'News and editorial about politics, culture, entertainment, tech, etc. Contains many articles not seen in The Atlantic magazine'
    else:
        title = 'The Atlantic'
        description = 'Current affairs and politics focussed on the US'
        INDEX = 'https://www.theatlantic.com/magazine/'

    __author__ = 'Kovid Goyal'
    language = 'en'
    encoding = 'utf-8'

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (YYYY/MM format)',
            'long': 'For example, 2024/05'
        }
    }

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        # Allow downloading a specific back issue via the 'date' option
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            self.INDEX = 'https://www.theatlantic.com/magazine/toc/' + d + '/'

    keep_only_tags = [
        dict(itemprop=['headline']),
        classes(
            'c-article-header__hed c-rubric article-header c-article-meta c-lead-media'
            ' lead-img article-cover-extra article-body article-magazine article-cover-content'
        ),
        prefix_classes(
            'ArticleHeader_root__ ArticleLayoutSection_main__ ArticleBody_root__'
        ),
        dict(itemprop='articleBody'),
        # these are for photos articles
        dict(id=['article-header', 'from-json-by-calibre']),
        classes('photos'),
    ]
    remove_tags = [
        classes(
            'c-ad c-share-social c-recirculation-link social-kit-top letter-writer-info callout secondary-byline embed-wrapper'
            ' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social'
        ),
        prefix_classes('ArticleRecirc_inline__'),
        {'name': ['meta', 'link', 'noscript', 'aside', 'h3']},
        {'attrs': {'class': ['offset-wrapper', 'boxtop-most-popular']}},
        {'attrs': {'class': lambda x: x and 'article-tools' in x}},
        {'src': lambda x: x and 'spotxchange.com' in x},
    ]
    remove_tags_after = classes('article-body')

    no_stylesheets = True
    remove_attributes = ['style']
    extra_css = '''
        .credit { text-align: right; font-size: 75%; display: block }
        .figcaption { font-size: 75% }
        .caption { font-size: 75% }
        .lead-img { display: block }
        p.dropcap:first-letter {
            float: left; text-transform: uppercase; font-weight: bold; font-size: 5.55em; line-height: 0.83;
            margin: 0; padding-right: 7px; margin-bottom: -2px; text-align: center;
        }
    '''

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # Avoid the EU consent interstitial
        br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
        return br

    def preprocess_raw_html(self, raw_html, url):
        # Prefer the clean JSON-derived HTML; fall back to the raw page markup
        try:
            return extract_html(self.index_to_soup(raw_html))
        except NoJSON:
            self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
        except Exception:
            self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
        return raw_html

    def preprocess_html(self, soup):
        for img in soup.findAll('img', attrs={'data-srcset': True}):
            # A srcset is comma-separated "URL [descriptor]" entries. Take the
            # URL token of the first entry only — the original kept the width
            # descriptor (e.g. ' 640w') in src when a comma was present.
            img['src'] = img['data-srcset'].split(',')[0].strip().split()[0]
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def print_version(self, url):
        # Video pages have no printable single-page version
        ans = url.partition('?')[0] + '?single_page=true'
        if '/video/' in ans:
            ans = None
        return ans

    if web_version and not test_article:
        use_embedded_content = False

        feeds = [
            ('The Atlantic', 'https://www.theatlantic.com/feed/all/'),
            ('Best of The Atlantic', 'https://www.theatlantic.com/feed/best-of/'),
            ('Politics | The Atlantic', 'https://www.theatlantic.com/feed/channel/politics/'),
            ('Business | The Atlantic', 'https://www.theatlantic.com/feed/channel/business/'),
            ('Culture | The Atlantic', 'https://www.theatlantic.com/feed/channel/entertainment/'),
            ('Global | The Atlantic', 'https://www.theatlantic.com/feed/channel/international/'),
            ('Technology | The Atlantic', 'https://www.theatlantic.com/feed/channel/technology/'),
            ('U.S. | The Atlantic', 'https://www.theatlantic.com/feed/channel/national/'),
            ('Health | The Atlantic', 'https://www.theatlantic.com/feed/channel/health/'),
            ('Video | The Atlantic', 'https://www.theatlantic.com/feed/channel/video/'),
            ('Sexes | The Atlantic', 'https://www.theatlantic.com/feed/channel/sexes/'),
            ('Education | The Atlantic', 'https://www.theatlantic.com/feed/channel/education/'),
            ('Science | The Atlantic', 'https://www.theatlantic.com/feed/channel/science/'),
            ('News | The Atlantic', 'https://www.theatlantic.com/feed/channel/news/'),
            ('Press Releases | The Atlantic', 'https://www.theatlantic.com/feed/channel/press-releases/'),
            ('Newsletters | The Atlantic', 'https://www.theatlantic.com/feed/channel/newsletters/'),
            ('The Atlantic Photo', 'https://feeds.feedburner.com/theatlantic/infocus'),
            ('Notes | The Atlantic', 'https://feeds.feedburner.com/TheAtlanticNotes'),
        ]
    else:
        def parse_index(self):
            # Build the feed list from the magazine's table-of-contents page
            if test_article:
                return [('Articles', [{'title': 'Test article', 'url': test_article}])]
            soup = self.index_to_soup(self.INDEX)
            img = soup.find(**prefix_classes('IssueDescription_cover__'))
            if img is not None:
                self.cover_url = img['src']
            current_section, current_articles = 'Cover Story', []
            feeds = []
            for x in soup.findAll(**prefix_classes(
                'TocFeaturedSection_heading__ TocSection_heading__ TocHeroGridItem_hedLink__ TocGridItem_hedLink__ RiverGridItem_hedLink__'
            )):
                cls = x['class']
                if not isinstance(cls, str):
                    cls = ' '.join(cls)
                title = self.tag_to_string(x).strip()
                # Section headings start a new feed; everything else is an
                # article link within the current section
                if 'Section' in cls:
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_section, current_articles = title, []
                    self.log(current_section)
                    continue
                url = x['href']
                current_articles.append({'title': title, 'url': url})
                self.log('\t', title, url)
            if current_articles:
                feeds.append((current_section, current_articles))
            return feeds


if __name__ == '__main__':
    import sys
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    # Debug harness: dump extracted HTML for a saved article page.
    # Fix: close the file deterministically instead of leaking the handle.
    with open(sys.argv[-1]) as f:
        print(extract_html(BeautifulSoup(f.read())))