#!/usr/bin/env python # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2015, Kovid Goyal from __future__ import unicode_literals import json import re from xml.sax.saxutils import escape, quoteattr from calibre.utils.iso8601 import parse_iso8601 from calibre.web.feeds.news import BasicNewsRecipe # {{{ parse NYT JSON def key_startswith(key, obj): for q, val in obj.items(): if q.startswith(key): return val def is_heading(tn): return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block') def process_inline_text(lines, block, data): text = '' if 'text@stripHtml' in block: text = escape(block['text@stripHtml']) elif 'renderedRepresentation' in block: # happens in byline blocks text = block['renderedRepresentation'] elif 'text' in block: text = block['text'] if text: for fmt in block.get('formats', ()): tn = fmt['typename'] if tn == 'LinkFormat': ab = data[fmt['id']] text = '{}'.format(ab['url'], ab.get('title') or '', text) elif tn == 'BoldFormat': text = '' + text + '' lines.append(text) def process_paragraph(lines, block, data, content_key='content'): tn = block['__typename'] m = re.match(r'Heading([1-6])Block', tn) if m is not None: tag = 'h' + m.group(1) else: tag = 'p' ta = block.get('textAlign') or 'LEFT' style = 'text-align: {}'.format(ta.lower()) lines.append('<{} style="{}">'.format(tag, style)) for item in block[content_key]: tn = item['typename'] if tn in ('TextInline', 'Byline'): process_inline_text(lines, data[item['id']], data) lines.append('') def process_timestamp(lines, block, data): ts = block['timestamp'] dt = parse_iso8601(ts, as_utc=False) lines.append('

' + escape(dt.strftime('%b %d, %Y')) + '

') def process_header(lines, block, data): label = block.get('label') if label: process_paragraph(lines, data[label['id']], data) headline = block.get('headline') if headline: process_paragraph(lines, data[headline['id']], data) summary = block.get('summary') if summary: process_paragraph(lines, data[summary['id']], data) lm = block.get('ledeMedia') if lm and lm.get('typename') == 'ImageBlock': process_image_block(lines, data[lm['id']], data) byline = block.get('byline') if byline: process_paragraph(lines, data[byline['id']], data, content_key='bylines') timestamp = block.get('timestampBlock') if timestamp: process_timestamp(lines, data[timestamp['id']], data) def process_image_block(lines, block, data): media = data[block['media']['id']] caption = media.get('caption') caption_lines = [] if caption: process_inline_text(caption_lines, data[caption['id']], data) crops = key_startswith('crops({', media) renditions = data[crops[0]['id']]['renditions'] img = data[renditions[0]['id']]['url'] lines.append('

'.format(quoteattr(img))) lines.extend(caption_lines) lines.append('

') def json_to_html(raw): data = json.loads(raw) data = data['initialState'] article = next(iter(data.values())) body = data[article['sprinkledBody']['id']] lines = [] for item in body['content@filterEmpty']: tn = item['typename'] if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock'): process_header(lines, data[item['id']], data) elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn): process_paragraph(lines, data[item['id']], data) elif tn == 'ImageBlock': process_image_block(lines, data[item['id']], data) return '' + '\n'.join(lines) + '' def extract_html(soup): script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0] script = type(u'')(script) raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';') return json_to_html(raw) # }}} def classes(classes): q = frozenset(classes.split(' ')) return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}) def absolutize(url): if url.startswith('/'): url = 'https://www.nytimes.com' + url return url class NewYorkTimesBookReview(BasicNewsRecipe): title = u'New York Times Book Review' language = 'en' description = 'The New York Times Sunday Book Review' __author__ = 'Kovid Goyal' no_stylesheets = True no_javascript = True ignore_duplicate_articles = {'title', 'url'} encoding = 'utf-8' def preprocess_raw_html(self, raw_html, url): html = extract_html(self.index_to_soup(raw_html)) return html def parse_index(self): soup = self.index_to_soup( 'https://www.nytimes.com/pages/books/review/index.html') # Find TOC toc = soup.find('section', id='collection-book-review').find('section').find('ol') main_articles, articles = [], [] feeds = [('Features', main_articles), ('Latest', articles)] for li in toc.findAll('li'): h2 = li.find('h2') a = h2.find('a', href=True) if a is not None: title = self.tag_to_string(a) url = absolutize(a['href']) desc = '' p = h2.findNextSibling('p') if p: desc = self.tag_to_string(p) main_articles.append( {'title': title, 'url': url, 'description': desc}) self.log('Found:', title, 'at', url) if desc: self.log('\t', desc) for li in soup.find(id='stream-panel').find('ol').findAll('li'): h2 = li.find('h2') a = h2.findParent('a') url = absolutize(a['href']) p = h2.findNextSibling('p') title = self.tag_to_string(h2) desc = '' if p: desc = self.tag_to_string(p) articles.append({'title': title, 'url': url, 'description': desc}) self.log('Found:', title, 'at', url) if desc: self.log('\t', desc) return feeds if __name__ == '__main__': import sys from calibre.ebooks.BeautifulSoup import BeautifulSoup print(extract_html(BeautifulSoup(open(sys.argv[-1]).read())))