mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
241 lines
9.6 KiB
Python
241 lines
9.6 KiB
Python
#!/usr/bin/env python
|
|
# vim:fileencoding=utf-8
|
|
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
|
import json
|
|
from xml.sax.saxutils import escape, quoteattr
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
|
from calibre.web.feeds.news import prefixed_classes as prefix_classes
|
|
|
|
# When True the recipe is built around the website's RSS feeds; when False it
# is built around the magazine issue index (see the TheAtlantic class body).
web_version = False
# Set to an article URL to make parse_index() return only that one article,
# which is useful when debugging the article conversion.
test_article = None
# test_article = 'https://www.theatlantic.com/health/archive/2020/12/covid-19-second-surge/617415/?utm_source=feed'
|
|
|
|
|
|
# parse article JSON {{{
|
|
def process_image_block(lines, block):
    """Append the HTML for a single image block to ``lines``.

    Emits a centered ``<div>`` containing an ``<img>`` whose ``src`` is
    ``block['url']``. When the block has a non-empty ``captionText`` an italic
    caption paragraph follows the image; a non-blank ``attributionText`` is
    appended to that caption in parentheses. The caption text is inserted
    as-is (it is not escaped).

    :param lines: list of HTML fragments; it is extended in place.
    :param block: dict describing the image. Must contain ``url``.
    :raises KeyError: if ``block`` has no ``url`` key.
    """
    caption = block.get('captionText')
    caption_lines = []
    if caption:
        # The key can be present with a JSON null value, in which case a
        # .get() default alone does not guarantee a string to call .strip() on.
        attribution = block.get('attributionText') or ''
        if attribution.strip():
            caption += ' (' + attribution + ')'
        caption_lines.append('<p style="font-style: italic">' + caption + '</p>')
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
    lines.extend(caption_lines)
    lines.append('</div>')
|
|
|
|
|
|
def json_to_html(raw):
    """Convert the JSON embedded in an article page into a simple HTML document.

    ``raw`` is JSON text with the layout ``props -> pageProps -> urqlState``.
    Every ``urqlState`` entry holds a JSON string under ``data``; the longest
    of these strings is decoded and its ``article`` member is rendered.

    The output contains the title, the dek (sub-title) and author names when
    present, the lead image, and then every content item: image items are
    rendered via process_image_block(), items with ``innerHtml`` are wrapped
    in their ``tagName`` (``p`` when absent). Items without ``innerHtml`` and
    items embedding an iframe are dropped.

    :param raw: JSON text (str or bytes accepted by ``json.loads``).
    :return: an HTML document whose body is a single
        ``<div id="from-json-by-calibre">``.
    :raises KeyError, IndexError, ValueError: when the JSON does not have the
        expected layout. ``title`` and ``content`` are deliberately required so
        that a caller can detect an unusable payload instead of receiving an
        empty article.
    """
    data = json.loads(raw)
    # Debugging aid: open('/t/p.json', 'w').write(json.dumps(data, indent=2))
    payloads = (v['data'] for v in data['props']['pageProps']['urqlState'].values())
    # sorted()[-1] rather than max(): on a length tie the last entry wins.
    article = json.loads(sorted(payloads, key=len)[-1])['article']

    lines = ['<h1 style="text-align: center">' + escape(article['title']) + '</h1>']
    # dek and authors are decorations: tolerate them being missing or null.
    dek = article.get('dek')
    if dek:
        lines.append('<h2 style="text-align: center">' + escape(dek) + '</h2>')
    auts = ', '.join(x['displayName'] for x in article.get('authors') or ())
    if auts:
        lines.append('<p style="text-align: center">by ' + escape(auts) + '</p>')
    if article.get('leadArt') and 'image' in article['leadArt']:
        process_image_block(lines, article['leadArt']['image'])

    for item in article['content']:
        tn = item.get('__typename') or ''
        if tn.endswith('Image'):
            process_image_block(lines, item)
            continue
        html = item.get('innerHtml')
        # Nothing to render, or an embedded iframe that should not be kept.
        if html is None or '</iframe>' in html:
            continue
        tagname = (item.get('tagName') or 'P').lower()
        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
|
|
|
|
|
|
class NoJSON(ValueError):
    """Signals that a page carries no embedded JSON article data."""
|
|
|
|
|
|
def extract_html(soup):
    """Build article HTML from the JSON embedded in a parsed page.

    Looks up the first ``<script id="__NEXT_DATA__">`` tag in ``soup`` and
    passes its first child (the JSON text) to json_to_html().

    :param soup: parsed page supporting ``findAll``.
    :return: the HTML document produced by json_to_html().
    :raises NoJSON: if the script tag is absent or has no contents.
    """
    scripts = soup.findAll('script', id='__NEXT_DATA__')
    if not scripts:
        raise NoJSON('No script tag with JSON data found')
    contents = scripts[0].contents
    if not contents:
        # An empty tag would otherwise surface as a bare IndexError, which
        # callers cannot tell apart from a genuine conversion failure.
        raise NoJSON('Script tag with JSON data is empty')
    return json_to_html(contents[0])
|
|
|
|
# }}}
|
|
|
|
|
|
class TheAtlantic(BasicNewsRecipe):
    """Recipe for The Atlantic.

    The module-level ``web_version`` flag selects between two variants at
    class-creation time: the website variant defines ``feeds``, the magazine
    variant defines parse_index(), which scrapes the issue page at ``INDEX``.
    Article pages are converted from their embedded JSON when possible, with
    a fallback to the page's own HTML.
    """

    if web_version:
        title = 'TheAtlantic.com'
        description = 'News and editorial about politics, culture, entertainment, tech, etc. Contains many articles not seen in The Atlantic magazine'
    else:
        title = 'The Atlantic'
        description = 'Current affairs and politics focussed on the US'
    # Issue table of contents read by parse_index(); __init__ may point it at
    # a specific dated issue instead.
    INDEX = 'https://www.theatlantic.com/magazine/'

    __author__ = 'Kovid Goyal'
    language = 'en'
    encoding = 'utf-8'

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (YYYY/MM format)',
            'long': 'For example, 2024/05'
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the recipe and honour the ``date`` recipe option."""
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('date')
        # The class-level default is the dict of help strings above, so only a
        # plain string is treated as a requested edition.
        # NOTE(review): presumably the base class stores the user's value
        # here - confirm against BasicNewsRecipe.
        if d and isinstance(d, str):
            self.INDEX = 'https://www.theatlantic.com/magazine/toc/' + d + '/'

    keep_only_tags = [
        dict(itemprop=['headline']),
        classes(
            'c-article-header__hed c-rubric article-header c-article-meta c-lead-media'
            ' lead-img article-cover-extra article-body article-magazine article-cover-content'
        ),
        prefix_classes(
            'ArticleHeader_root__ ArticleLayoutSection_main__ ArticleBody_root__'
        ),
        dict(itemprop='articleBody'),
        # these are for photos articles
        # ('from-json-by-calibre' is the wrapper id emitted by json_to_html)
        dict(id=['article-header', 'from-json-by-calibre']),
        classes('photos'),
    ]
    remove_tags = [
        classes(
            'c-ad c-share-social c-recirculation-link social-kit-top letter-writer-info callout secondary-byline embed-wrapper'
            ' offset-wrapper boxtop-most-popular social-icons hints read-more c-article-writer__social'
        ),
        prefix_classes('ArticleRecirc_inline__'),
        {
            'name': ['meta', 'link', 'noscript', 'aside', 'h3']
        },
        {
            'attrs': {
                'class': ['offset-wrapper', 'boxtop-most-popular']
            }
        },
        {
            'attrs': {
                'class': lambda x: x and 'article-tools' in x
            }
        },
        {
            'src': lambda x: x and 'spotxchange.com' in x
        },
    ]
    remove_tags_after = classes('article-body')

    no_stylesheets = True
    remove_attributes = ['style']
    extra_css = '''
        .credit { text-align: right; font-size: 75%; display: block }
        .figcaption { font-size: 75% }
        .caption { font-size: 75% }
        .lead-img { display: block }
        p.dropcap:first-letter {
        float: left; text-transform: uppercase; font-weight: bold; font-size: 5.55em; line-height: 0.83;
        margin: 0; padding-right: 7px; margin-bottom: -2px; text-align: center;
        }
    '''

    def get_browser(self):
        """Return the base browser with the ``inEuropeanUnion`` cookie set to 0."""
        br = BasicNewsRecipe.get_browser(self)
        br.set_cookie('inEuropeanUnion', '0', '.theatlantic.com')
        return br

    def preprocess_raw_html(self, raw_html, url):
        """Replace a downloaded page with HTML built from its embedded JSON.

        Any failure is logged and the page's original HTML is returned
        unchanged, so the tag based cleanup still has something to work with.
        """
        try:
            return extract_html(self.index_to_soup(raw_html))
        except NoJSON:
            self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
        except Exception:
            self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
        return raw_html

    def preprocess_html(self, soup):
        """Give lazily loaded images a real ``src`` attribute.

        ``data-srcset`` images get the URL of their first candidate and
        ``data-src`` images get that value verbatim. The ``data-src`` pass
        runs second, so it wins when an image has both attributes.
        """
        for img in soup.findAll('img', attrs={'data-srcset': True}):
            # Candidates are comma separated and each may be followed by a
            # width/density descriptor ("url 2x"); keep only the bare URL of
            # the first one. An empty attribute leaves the image untouched
            # instead of raising IndexError.
            first_candidate = img['data-srcset'].split(',')[0].split()
            if first_candidate:
                img['src'] = first_candidate[0]
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

    def print_version(self, url):
        """Return the single-page variant of ``url``, or None for video URLs.

        The query string of ``url`` is discarded and replaced by
        ``?single_page=true``.
        """
        ans = url.partition('?')[0] + '?single_page=true'
        if '/video/' in ans:
            ans = None
        return ans

    if web_version and not test_article:

        use_embedded_content = False

        feeds = [
            ('The Atlantic', 'https://www.theatlantic.com/feed/all/'),
            ('Best of The Atlantic', 'https://www.theatlantic.com/feed/best-of/'),
            ('Politics | The Atlantic', 'https://www.theatlantic.com/feed/channel/politics/'),
            ('Business | The Atlantic', 'https://www.theatlantic.com/feed/channel/business/'),
            ('Culture | The Atlantic', 'https://www.theatlantic.com/feed/channel/entertainment/'),
            ('Global | The Atlantic', 'https://www.theatlantic.com/feed/channel/international/'),
            ('Technology | The Atlantic', 'https://www.theatlantic.com/feed/channel/technology/'),
            ('U.S. | The Atlantic', 'https://www.theatlantic.com/feed/channel/national/'),
            ('Health | The Atlantic', 'https://www.theatlantic.com/feed/channel/health/'),
            ('Video | The Atlantic', 'https://www.theatlantic.com/feed/channel/video/'),
            ('Sexes | The Atlantic', 'https://www.theatlantic.com/feed/channel/sexes/'),
            ('Education | The Atlantic', 'https://www.theatlantic.com/feed/channel/education/'),
            ('Science | The Atlantic', 'https://www.theatlantic.com/feed/channel/science/'),
            ('News | The Atlantic', 'https://www.theatlantic.com/feed/channel/news/'),
            ('Press Releases | The Atlantic', 'https://www.theatlantic.com/feed/channel/press-releases/'),
            ('Newsletters | The Atlantic', 'https://www.theatlantic.com/feed/channel/newsletters/'),
            ('The Atlantic Photo', 'https://feeds.feedburner.com/theatlantic/infocus'),
            ('Notes | The Atlantic', 'https://feeds.feedburner.com/TheAtlanticNotes'),
        ]
    else:
        def parse_index(self):
            """Return ``[(section title, [article dict, ...]), ...]`` for the issue.

            When ``test_article`` is set only that article is returned.
            Otherwise the page at ``INDEX`` is scanned in document order:
            section headings start a new section, article links are added to
            the current one. Articles seen before the first heading are filed
            under 'Cover Story'. The issue cover image, when found, is stored
            in ``self.cover_url``.
            """
            if test_article:
                return [('Articles', [{'title': 'Test article', 'url': test_article}])]
            soup = self.index_to_soup(self.INDEX)
            img = soup.find(**prefix_classes('IssueDescription_cover__'))
            if img is not None:
                self.cover_url = img['src']
            current_section, current_articles = 'Cover Story', []
            feeds = []
            for x in soup.findAll(**prefix_classes(
                'TocFeaturedSection_heading__ TocSection_heading__ TocHeroGridItem_hedLink__ TocGridItem_hedLink__ RiverGridItem_hedLink__'
            )):
                cls = x['class']
                # The class attribute may come back as a list of names.
                if not isinstance(cls, str):
                    cls = ' '.join(cls)
                title = self.tag_to_string(x).strip()
                # Of the prefixes searched for, only the heading ones contain
                # 'Section'; everything else is an article link.
                if 'Section' in cls:
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_section, current_articles = title, []
                    self.log(current_section)
                    continue
                url = x['href']
                current_articles.append({'title': title, 'url': url})
                self.log('\t', title, url)
            # Flush the section that was still open when the page ended.
            if current_articles:
                feeds.append((current_section, current_articles))
            return feeds
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual test entry point: convert a saved article page, whose path is the
    # last command line argument, and print the resulting HTML.
    import sys

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    # A context manager closes the file deterministically; the explicit
    # encoding matches the recipe's declared 'utf-8' instead of depending on
    # the platform default.
    with open(sys.argv[-1], encoding='utf-8') as f:
        markup = f.read()
    print(extract_html(BeautifulSoup(markup)))
|