Move NYT JSON parsing code into its own module so it can be shared

2025-07-09 03:04:10 -04:00 · 2022-09-11 21:21:02 +05:30 · 2022-09-11 21:21:02 +05:30 · 1b53ec7fdd
commit 1b53ec7fdd
parent 3eb4a41139
3 changed files with 123 additions and 120 deletions
--- a/recipes/nytimesbook.recipe
+++ b/recipes/nytimesbook.recipe
@ -3,123 +3,10 @@
 # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>

 from __future__ import unicode_literals
-import json
-import re
-from xml.sax.saxutils import escape, quoteattr

-from calibre.utils.iso8601 import parse_iso8601
 from calibre.web.feeds.news import BasicNewsRecipe


-# {{{ parse NYT JSON
-def is_heading(tn):
-    return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
-
-
-def process_inline_text(lines, block):
-    text = ''
-    if 'text@stripHtml' in block:
-        text = escape(block['text@stripHtml'])
-    elif 'renderedRepresentation' in block:  # happens in byline blocks
-        text = block['renderedRepresentation']
-    elif 'text' in block:
-        text = block['text']
-    if text:
-        for fmt in block.get('formats', ()):
-            tn = fmt['__typename']
-            if tn == 'LinkFormat':
-                ab = fmt
-                text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
-            elif tn == 'BoldFormat':
-                text = '<b>' + text + '</b>'
-        lines.append(text)
-
-
-def process_paragraph(lines, block, content_key='content'):
-    tn = block['__typename']
-    m = re.match(r'Heading([1-6])Block', tn)
-    if m is not None:
-        tag = 'h' + m.group(1)
-    else:
-        tag = 'p'
-    ta = block.get('textAlign') or 'LEFT'
-    style = 'text-align: {}'.format(ta.lower())
-    lines.append('<{} style="{}">'.format(tag, style))
-    for item in block[content_key]:
-        tn = item['__typename']
-        if tn in ('TextInline', 'Byline'):
-            process_inline_text(lines, item)
-    lines.append('</' + tag + '>')
-
-
-def process_timestamp(lines, block):
-    ts = block['timestamp']
-    dt = parse_iso8601(ts, as_utc=False)
-    lines.append('<p class="timestamp">' + escape(dt.strftime('%b %d, %Y')) + '</p>')
-
-
-def process_header(lines, block):
-    label = block.get('label')
-    if label:
-        process_paragraph(lines, label)
-    headline = block.get('headline')
-    if headline:
-        process_paragraph(lines, headline)
-    summary = block.get('summary')
-    if summary:
-        process_paragraph(lines, summary)
-    lm = block.get('ledeMedia')
-    if lm and lm.get('__typename') == 'ImageBlock':
-        process_image_block(lines, lm)
-    byline = block.get('byline')
-    if byline:
-        process_paragraph(lines, byline, content_key='bylines')
-    timestamp = block.get('timestampBlock')
-    if timestamp:
-        process_timestamp(lines, timestamp)
-
-
-def process_image_block(lines, block):
-    media = block['media']
-    caption = media.get('caption')
-    caption_lines = []
-    if caption:
-        process_inline_text(caption_lines, caption)
-    crops = media['crops']
-    renditions = crops[0]['renditions']
-    img = renditions[0]['url']
-    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(img)))
-    lines.extend(caption_lines)
-    lines.append('</div>')
-
-
-def json_to_html(raw):
-    data = json.loads(raw.replace(':undefined', ':null'))
-    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
-    data = data['initialData']['data']
-    article = next(iter(data.values()))
-    body = article['sprinkledBody']['content']
-    lines = []
-    for item in body:
-        tn = item['__typename']
-        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
-            process_header(lines, item)
-        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
-            process_paragraph(lines, item)
-        elif tn == 'ImageBlock':
-            process_image_block(lines, item)
-    return '<html><body>' + '\n'.join(lines) + '</body></html>'
-
-
-def extract_html(soup):
-    script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
-    script = type(u'')(script)
-    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
-    return json_to_html(raw)
-
-# }}}
-
-
 def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
@ -143,7 +30,11 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
    encoding = 'utf-8'

    def preprocess_raw_html(self, raw_html, url):
-        html = extract_html(self.index_to_soup(raw_html))
+        if not hasattr(self, 'nyt_parser'):
+            from calibre.live import load_module
+            m = load_module('calibre.web.site_parsers.nytimes')
+            self.nyt_parser = m
+        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

    def parse_index(self):
@ -185,9 +76,3 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
                self.log('\t', desc)

        return feeds
-
-
-if __name__ == '__main__':
-    import sys
-    from calibre.ebooks.BeautifulSoup import BeautifulSoup
-    print(extract_html(BeautifulSoup(open(sys.argv[-1]).read())))
--- a/src/calibre/web/site_parsers/init.py
+++ b/src/calibre/web/site_parsers/init.py
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
+
+import json
+import re
+from xml.sax.saxutils import escape, quoteattr
+
+from calibre.utils.iso8601 import parse_iso8601
+
+
+def is_heading(tn):
+    return tn in ('Heading1Block', 'Heading2Block', 'Heading3Block', 'Heading4Block')
+
+
+def process_inline_text(lines, block):
+    text = ''
+    if 'text@stripHtml' in block:
+        text = escape(block['text@stripHtml'])
+    elif 'renderedRepresentation' in block:  # happens in byline blocks
+        text = block['renderedRepresentation']
+    elif 'text' in block:
+        text = block['text']
+    if text:
+        for fmt in block.get('formats', ()):
+            tn = fmt['__typename']
+            if tn == 'LinkFormat':
+                ab = fmt
+                text = '<a href="{}" title="{}">{}</a>'.format(ab['url'], ab.get('title') or '', text)
+            elif tn == 'BoldFormat':
+                text = '<b>' + text + '</b>'
+        lines.append(text)
+
+
+def process_paragraph(lines, block, content_key='content'):
+    tn = block['__typename']
+    m = re.match(r'Heading([1-6])Block', tn)
+    if m is not None:
+        tag = 'h' + m.group(1)
+    else:
+        tag = 'p'
+    ta = block.get('textAlign') or 'LEFT'
+    style = 'text-align: {}'.format(ta.lower())
+    lines.append('<{} style="{}">'.format(tag, style))
+    for item in block[content_key]:
+        tn = item['__typename']
+        if tn in ('TextInline', 'Byline'):
+            process_inline_text(lines, item)
+    lines.append('</' + tag + '>')
+
+
+def process_timestamp(lines, block):
+    ts = block['timestamp']
+    dt = parse_iso8601(ts, as_utc=False)
+    lines.append('<p class="timestamp">' + escape(dt.strftime('%b %d, %Y')) + '</p>')
+
+
+def process_header(lines, block):
+    label = block.get('label')
+    if label:
+        process_paragraph(lines, label)
+    headline = block.get('headline')
+    if headline:
+        process_paragraph(lines, headline)
+    summary = block.get('summary')
+    if summary:
+        process_paragraph(lines, summary)
+    lm = block.get('ledeMedia')
+    if lm and lm.get('__typename') == 'ImageBlock':
+        process_image_block(lines, lm)
+    byline = block.get('byline')
+    if byline:
+        process_paragraph(lines, byline, content_key='bylines')
+    timestamp = block.get('timestampBlock')
+    if timestamp:
+        process_timestamp(lines, timestamp)
+
+
+def process_image_block(lines, block):
+    media = block['media']
+    caption = media.get('caption')
+    caption_lines = []
+    if caption:
+        process_inline_text(caption_lines, caption)
+    crops = media['crops']
+    renditions = crops[0]['renditions']
+    img = renditions[0]['url']
+    if 'web.archive.org' in img:
+        img = img.partition('/')[-1]
+        img = img[img.find('https://'):]
+    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(img)))
+    lines.extend(caption_lines)
+    lines.append('</div>')
+
+
+def json_to_html(raw):
+    data = json.loads(raw.replace(':undefined', ':null'))
+    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
+    data = data['initialData']['data']
+    article = next(iter(data.values()))
+    body = article['sprinkledBody']['content']
+    lines = []
+    for item in body:
+        tn = item['__typename']
+        if tn in ('HeaderBasicBlock', 'HeaderLegacyBlock', 'HeaderFullBleedVerticalBlock'):
+            process_header(lines, item)
+        elif tn in ('ParagraphBlock', 'LabelBlock', 'DetailBlock') or is_heading(tn):
+            process_paragraph(lines, item)
+        elif tn == 'ImageBlock':
+            process_image_block(lines, item)
+    return '<html><body>' + '\n'.join(lines) + '</body></html>'
+
+
+def extract_html(soup):
+    script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
+    script = type(u'')(script)
+    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
+    return json_to_html(raw)