From 1216e4558019840a32fa7df6f05368279bd7743a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 12 Sep 2022 13:20:45 +0530
Subject: [PATCH] Attempt to parse nytimes live markup

---
 recipes/nytimes.recipe                  |  2 -
 recipes/nytimes_sub.recipe              |  2 -
 src/calibre/web/site_parsers/nytimes.py | 81 ++++++++++++++++++++++++-
 3 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index 944502fa93..1f7826de1f 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -93,8 +93,6 @@ class NewYorkTimes(BasicNewsRecipe):
     conversion_options = {'flow_size': 0}
 
     def preprocess_raw_html(self, raw_html, url):
-        if '/live/' in url:
-            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
         if not hasattr(self, 'nyt_parser'):
             from calibre.live import load_module
             m = load_module('calibre.web.site_parsers.nytimes')
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 28f5e3582e..6b42573e88 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -93,8 +93,6 @@ class NewYorkTimes(BasicNewsRecipe):
     conversion_options = {'flow_size': 0}
 
     def preprocess_raw_html(self, raw_html, url):
-        if '/live/' in url:
-            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
         if not hasattr(self, 'nyt_parser'):
             from calibre.live import load_module
             m = load_module('calibre.web.site_parsers.nytimes')
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
index 1e1e6b881c..f849ac9098 100644
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -5,11 +5,13 @@
 import json
 import re
 from xml.sax.saxutils import escape, quoteattr
+from pprint import pprint
 
 from calibre.utils.iso8601 import parse_iso8601
 
 
-module_version = 1  # needed for live updates
+module_version = 2  # needed for live updates
+pprint
 
 
 def is_heading(tn):
@@ -99,7 +101,11 @@ def process_image_block(lines, block):
 def json_to_html(raw):
     data = json.loads(raw.replace(':undefined', ':null'))
     # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
-    data = data['initialData']['data']
+    try:
+        data = data['initialData']['data']
+    except TypeError:
+        data = data['initialState']
+        return live_json_to_html(data)
     article = next(iter(data.values()))
     body = article['sprinkledBody']['content']
     lines = []
@@ -114,6 +120,65 @@
     return '<html><body>' + '\n'.join(lines) + '</body></html>'
 
 
+def add_live_item(item, item_type, lines):
+    a = lines.append
+    if item_type == 'text':
+        a('<p>' + item['value'] + '</p>')
+    elif item_type == 'list':
+        a('<li>' + item['value'] + '</li>')
+    elif item_type == 'bulletedList':
+        a('<ul>')
+        for x in item['value']:
+            add_live_item({'value': x}, 'list', lines)
+        a('</ul>')
+    elif item_type == 'items':
+        for x in item['value']:
+            a('<h5>' + x['subtitle'] + '</h5>')
+            add_live_item({'value': x['text']}, 'text', lines)
+    elif item_type == 'section':
+        for item in item['value']:
+            add_live_item(item, item['type'], lines)
+    elif item_type == '':
+        b = item
+        if b.get('title'):
+            a('<h2>' + b['title'] + '</h2>')
+        if b.get('imageUrl'):
+            a('<div><img src=' + quoteattr(b['imageUrl']) + '/></div>')
+        if b.get('leadIn'):
+            a('<p>' + b['leadIn'] + '</p>')
+        if 'items' in b:
+            add_live_item({'value': b['items']}, 'items', lines)
+            return
+        if 'bulletedList' in b:
+            add_live_item({'value': b['bulletedList']}, 'bulletedList', lines)
+            return
+        if 'sections' in b:
+            for section in b['sections']:
+                add_live_item({'value': section['section']}, 'section', lines)
+            return
+        raise Exception('Unknown item: %s' % b)
+    else:
+        raise Exception('Unknown item: %s' % b)
+
+
+def live_json_to_html(data):
+    for k, v in data["ROOT_QUERY"].items():
+        if isinstance(v, dict) and 'id' in v:
+            root = data[v['id']]
+    s = data[root['storylines'][0]['id']]
+    s = data[s['storyline']['id']]
+    title = s['displayName']
+    lines = ['<h1>' + escape(title) + '</h1>']
+    for b in json.loads(s['experimentalJsonBlob'])['data'][0]['data']:
+        b = b['data']
+        if isinstance(b, list):
+            for x in b:
+                add_live_item(x, x['type'], lines)
+        else:
+            add_live_item(b, '', lines)
+    return '<html><body>' + '\n'.join(lines) + '</body></html>'
+
+
 def extract_html(soup):
     script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
     script = type(u'')(script)
@@ -134,3 +199,15 @@
     if not isinstance(raw, bytes):
         raw = raw.encode('utf-8')
     return raw
+
+
+if __name__ == '__main__':
+    import sys
+    f = sys.argv[-1]
+    raw = open(f).read()
+    if f.endswith('.html'):
+        from calibre.ebooks.BeautifulSoup import BeautifulSoup
+        soup = BeautifulSoup(raw)
+        print(extract_html(soup))
+    else:
+        print(json_to_html(raw))
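
Note (not part of the patch): the snippet below is a minimal synthetic payload that exercises the new live-article code path. Every key name is inferred from the parsing code above, not taken from a real NYT live page; in particular the ROOT_QUERY entry name and the id strings are made up, and the import assumes the patched module is importable in a calibre environment.

# Sketch only: a made-up payload shaped the way the new code reads it.
import json

from calibre.web.site_parsers.nytimes import json_to_html  # assumes the patched module is on the path

storyline = {
    'displayName': 'Example live briefing',
    'experimentalJsonBlob': json.dumps({'data': [{'data': [
        # list-valued block: each entry carries its own 'type'
        {'data': [{'type': 'text', 'value': 'First update.'}]},
        # dict-valued block: handled by the item_type == '' branch
        {'data': {'title': 'Summary',
                  'items': [{'subtitle': 'Point one', 'text': 'Details here.'}]}},
    ]}]}),
}
state = {
    'ROOT_QUERY': {'workOrLocation': {'id': 'Article:1'}},  # key name is a guess; only the nested 'id' lookup matters
    'Article:1': {'storylines': [{'id': 'StorylineRef:1'}]},
    'StorylineRef:1': {'storyline': {'id': 'Storyline:1'}},
    'Storyline:1': storyline,
}
# 'initialData': None is what triggers the new TypeError fallback in json_to_html()
raw = json.dumps({'initialData': None, 'initialState': state})
print(json_to_html(raw))

The new __main__ block at the end of the patch gives the same kind of smoke test from the command line: point the module at a saved .json payload (or a saved article .html page) and it prints the generated markup.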