diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe
index 424a6ddda6..475e55b05b 100644
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@@ -3,123 +3,10 @@
 # License: GPLv3 Copyright: 2015, Kovid Goyal
 from __future__ import unicode_literals
 
-import json
-import re
-from xml.sax.saxutils import escape, quoteattr
-from calibre.utils.iso8601 import parse_iso8601
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
-# {{{ parse NYT JSON
-def is_heading(tn):
-    return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
-
-
-def process_inline_text(lines, block):
-    text = ''
-    if 'text@stripHtml' in block:
-        text = escape(block['text@stripHtml'])
-    elif 'renderedRepresentation' in block:  # happens in byline blocks
-        text = block['renderedRepresentation']
-    elif 'text' in block:
-        text = block['text']
-    if text:
-        for fmt in block.get('formats', ()):
-            tn = fmt['__typename']
-            if tn == 'LinkFormat':
-                ab = fmt
-                text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
-            elif tn == 'BoldFormat':
-                text = '<b>' + text + '</b>'
-        lines.append(text)
-
-
-def process_paragraph(lines, block, content_key='content'):
-    tn = block['__typename']
-    m = re.match(r'Heading([1-6])Block', tn)
-    if m is not None:
-        tag = 'h' + m.group(1)
-    else:
-        tag = 'p'
-    ta = block.get('textAlign') or 'LEFT'
-    style = 'text-align: {}'.format(ta.lower())
-    lines.append('<{} style="{}">'.format(tag, style))
-    for item in block[content_key]:
-        tn = item['__typename']
-        if tn in ('TextInline', 'Byline'):
-            process_inline_text(lines, item)
-    lines.append('</' + tag + '>')
-
-
-def process_timestamp(lines, block):
-    ts = block['timestamp']
-    dt = parse_iso8601(ts, as_utc=False)
-    lines.append('<p class="timestamp">' + escape(dt.strftime('%b %d, %Y')) + '</p>')
-
-
-def process_header(lines, block):
-    label = block.get('label')
-    if label:
-        process_paragraph(lines, label)
-    headline = block.get('headline')
-    if headline:
-        process_paragraph(lines, headline)
-    summary = block.get('summary')
-    if summary:
-        process_paragraph(lines, summary)
-    lm = block.get('ledeMedia')
-    if lm and lm.get('__typename') == 'ImageBlock':
-        process_image_block(lines, lm)
-    byline = block.get('byline')
-    if byline:
-        process_paragraph(lines, byline, content_key='bylines')
-    timestamp = block.get('timestampBlock')
-    if timestamp:
-        process_timestamp(lines, timestamp)
-
-
-def process_image_block(lines, block):
-    media = block['media']
-    caption = media.get('caption')
-    caption_lines = []
-    if caption:
-        process_inline_text(caption_lines, caption)
-    crops = media['crops']
-    renditions = crops[0]['renditions']
-    img = renditions[0]['url']
-    lines.append('<div style="text-align: center"><img src={}/><div style="font-size: smaller">'.format(quoteattr(img)))
-    lines.extend(caption_lines)
-    lines.append('</div></div>')
-
-
-def json_to_html(raw):
-    data = json.loads(raw.replace(':undefined', ':null'))
-    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
-    data = data['initialData']['data']
-    article = next(iter(data.values()))
-    body = article['sprinkledBody']['content']
-    lines = []
-    for item in body:
-        tn = item['__typename']
-        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
-            process_header(lines, item)
-        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
-            process_paragraph(lines, item)
-        elif tn == 'ImageBlock':
-            process_image_block(lines, item)
-    return '<html><body>' + '\n'.join(lines) + '</body></html>'
-
-
-def extract_html(soup):
-    script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
-    script = type(u'')(script)
-    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
-    return json_to_html(raw)
-
-# }}}
-
-
 def classes(classes):
     q = frozenset(classes.split(' '))
     return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
@@ -143,7 +30,11 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
     encoding = 'utf-8'
 
     def preprocess_raw_html(self, raw_html, url):
-        html = extract_html(self.index_to_soup(raw_html))
+        if not hasattr(self, 'nyt_parser'):
+            from calibre.live import load_module
+            m = load_module('calibre.web.site_parsers.nytimes')
+            self.nyt_parser = m
+        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
         return html
 
     def parse_index(self):
@@ -185,9 +76,3 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
                     self.log('\t', desc)
 
         return feeds
-
-
-if __name__ == '__main__':
-    import sys
-    from calibre.ebooks.BeautifulSoup import BeautifulSoup
-    print(extract_html(BeautifulSoup(open(sys.argv[-1]).read())))
diff --git a/src/calibre/web/site_parsers/__init__.py b/src/calibre/web/site_parsers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
new file mode 100644
index 0000000000..4e49724234
--- /dev/null
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2022, Kovid Goyal
+
+import json
+import re
+from xml.sax.saxutils import escape, quoteattr
+
+from calibre.utils.iso8601 import parse_iso8601
+
+
+def is_heading(tn):
+    return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
+
+
+def process_inline_text(lines, block):
+    text = ''
+    if 'text@stripHtml' in block:
+        text = escape(block['text@stripHtml'])
+    elif 'renderedRepresentation' in block:  # happens in byline blocks
+        text = block['renderedRepresentation']
+    elif 'text' in block:
+        text = block['text']
+    if text:
+        for fmt in block.get('formats', ()):
+            tn = fmt['__typename']
+            if tn == 'LinkFormat':
+                ab = fmt
+                text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
+            elif tn == 'BoldFormat':
+                text = '<b>' + text + '</b>'
+        lines.append(text)
+
+
+def process_paragraph(lines, block, content_key='content'):
+    tn = block['__typename']
+    m = re.match(r'Heading([1-6])Block', tn)
+    if m is not None:
+        tag = 'h' + m.group(1)
+    else:
+        tag = 'p'
+    ta = block.get('textAlign') or 'LEFT'
+    style = 'text-align: {}'.format(ta.lower())
+    lines.append('<{} style="{}">'.format(tag, style))
+    for item in block[content_key]:
+        tn = item['__typename']
+        if tn in ('TextInline', 'Byline'):
+            process_inline_text(lines, item)
+    lines.append('</' + tag + '>')
+
+
+def process_timestamp(lines, block):
+    ts = block['timestamp']
+    dt = parse_iso8601(ts, as_utc=False)
+    lines.append('<p class="timestamp">' + escape(dt.strftime('%b %d, %Y')) + '</p>')
+
+
+def process_header(lines, block):
+    label = block.get('label')
+    if label:
+        process_paragraph(lines, label)
+    headline = block.get('headline')
+    if headline:
+        process_paragraph(lines, headline)
+    summary = block.get('summary')
+    if summary:
+        process_paragraph(lines, summary)
+    lm = block.get('ledeMedia')
+    if lm and lm.get('__typename') == 'ImageBlock':
+        process_image_block(lines, lm)
+    byline = block.get('byline')
+    if byline:
+        process_paragraph(lines, byline, content_key='bylines')
+    timestamp = block.get('timestampBlock')
+    if timestamp:
+        process_timestamp(lines, timestamp)
+
+
+def process_image_block(lines, block):
+    media = block['media']
+    caption = media.get('caption')
+    caption_lines = []
+    if caption:
+        process_inline_text(caption_lines, caption)
+    crops = media['crops']
+    renditions = crops[0]['renditions']
+    img = renditions[0]['url']
+    if 'web.archive.org' in img:
+        img = img.partition('/')[-1]
+        img = img[img.find('https://'):]
+    lines.append('<div style="text-align: center"><img src={}/><div style="font-size: smaller">'.format(quoteattr(img)))
+    lines.extend(caption_lines)
+    lines.append('</div></div>')
+
+
+def json_to_html(raw):
+    data = json.loads(raw.replace(':undefined', ':null'))
+    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
+    data = data['initialData']['data']
+    article = next(iter(data.values()))
+    body = article['sprinkledBody']['content']
+    lines = []
+    for item in body:
+        tn = item['__typename']
+        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
+            process_header(lines, item)
+        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
+            process_paragraph(lines, item)
+        elif tn == 'ImageBlock':
+            process_image_block(lines, item)
+    return '<html><body>' + '\n'.join(lines) + '</body></html>'
+
+
+def extract_html(soup):
+    script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+    script = type(u'')(script)
+    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
+    return json_to_html(raw)
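
Note: with the parser moved out of the recipe, the quickest local sanity check is to run it standalone against a saved NYT article page, mirroring the __main__ block this change removes from the recipe. The snippet below is only a sketch of that workflow and is not part of the patch; the file-path argument and the use of calibre's bundled BeautifulSoup are carried over from the removed block (in the recipe itself the module is obtained via calibre.live.load_module rather than a direct import).

# Sketch: print the simplified HTML produced by the relocated parser for a
# saved article page (one that contains the window.__preloadedData script).
# Mirrors the __main__ block removed from recipes/nytimesbook.recipe.
import sys

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.site_parsers.nytimes import extract_html

if __name__ == '__main__':
    # Last command-line argument is the path to a saved article HTML file.
    with open(sys.argv[-1]) as f:
        raw = f.read()
    print(extract_html(BeautifulSoup(raw)))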