From 1216e4558019840a32fa7df6f05368279bd7743a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 12 Sep 2022 13:20:45 +0530
Subject: [PATCH] Attempt to parse nytimes live markup

---
 recipes/nytimes.recipe                  |  2 -
 recipes/nytimes_sub.recipe              |  2 -
 src/calibre/web/site_parsers/nytimes.py | 81 ++++++++++++++++++++++++-
 3 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index 944502fa93..1f7826de1f 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -93,8 +93,6 @@ class NewYorkTimes(BasicNewsRecipe):
     conversion_options = {'flow_size': 0}
 
     def preprocess_raw_html(self, raw_html, url):
-        if '/live/' in url:
-            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
         if not hasattr(self, 'nyt_parser'):
             from calibre.live import load_module
             m = load_module('calibre.web.site_parsers.nytimes')
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 28f5e3582e..6b42573e88 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -93,8 +93,6 @@ class NewYorkTimes(BasicNewsRecipe):
     conversion_options = {'flow_size': 0}
 
     def preprocess_raw_html(self, raw_html, url):
-        if '/live/' in url:
-            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
         if not hasattr(self, 'nyt_parser'):
             from calibre.live import load_module
             m = load_module('calibre.web.site_parsers.nytimes')
diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py
index 1e1e6b881c..f849ac9098 100644
--- a/src/calibre/web/site_parsers/nytimes.py
+++ b/src/calibre/web/site_parsers/nytimes.py
@@ -5,11 +5,13 @@
 import json
 import re
 from xml.sax.saxutils import escape, quoteattr
+from pprint import pprint
 
 from calibre.utils.iso8601 import parse_iso8601
 
 
-module_version = 1  # needed for live updates
+module_version = 2  # needed for live updates
+pprint
 
 
 def is_heading(tn):
@@ -99,7 +101,11 @@ def process_image_block(lines, block):
 def json_to_html(raw):
     data = json.loads(raw.replace(':undefined', ':null'))
     # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
-    data = data['initialData']['data']
+    try:
+        data = data['initialData']['data']
+    except TypeError:
+        data = data['initialState']
+        return live_json_to_html(data)
     article = next(iter(data.values()))
     body = article['sprinkledBody']['content']
     lines = []
@@ -114,6 +120,65 @@
     return '<html><body>' + '\n'.join(lines) + '</body></html>'
 
 
+def add_live_item(item, item_type, lines):
+    a = lines.append
+    if item_type == 'text':
+        a('<p>' + item['value'] + '</p>')
+    elif item_type == 'list':
+        a('<li>' + item['value'] + '</li>')
+    elif item_type == 'bulletedList':
+        a('<ul>')
+        for x in item['value']:
+            add_live_item({'value': x}, 'list', lines)
+        a('</ul>')
+    elif item_type == 'items':
+        for x in item['value']:
+            a('<h5>' + x['subtitle'] + '</h5>')
+            add_live_item({'value': x['text']}, 'text', lines)
+    elif item_type == 'section':
+        for item in item['value']:
+            add_live_item(item, item['type'], lines)
+    elif item_type == '':
+        b = item
+        if b.get('title'):
+            a('<h2>' + b['title'] + '</h2>')
+        if b.get('imageUrl'):
+            a('<div><img src=' + quoteattr(b['imageUrl']) + '/></div>')
+        if b.get('leadIn'):
+            a('<p>' + b['leadIn'] + '</p>')
+        if 'items' in b:
+            add_live_item({'value': b['items']}, 'items', lines)
+            return
+        if 'bulletedList' in b:
+            add_live_item({'value': b['bulletedList']}, 'bulletedList', lines)
+            return
+        if 'sections' in b:
+            for section in b['sections']:
+                add_live_item({'value': section['section']}, 'section', lines)
+            return
+        raise Exception('Unknown item: %s' % b)
+    else:
+        raise Exception('Unknown item: %s' % b)
+
+
+def live_json_to_html(data):
+    for k, v in data["ROOT_QUERY"].items():
+        if isinstance(v, dict) and 'id' in v:
+            root = data[v['id']]
+    s = data[root['storylines'][0]['id']]
+    s = data[s['storyline']['id']]
+    title = s['displayName']
+    lines = ['<h1>' + escape(title) + '</h1>']
+    for b in json.loads(s['experimentalJsonBlob'])['data'][0]['data']:
+        b = b['data']
+        if isinstance(b, list):
+            for x in b:
+                add_live_item(x, x['type'], lines)
+        else:
+            add_live_item(b, '', lines)
+    return '<html><body>' + '\n'.join(lines) + '</body></html>'
+
+
 def extract_html(soup):
     script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
     script = type(u'')(script)
@@ -134,3 +199,15 @@
     if not isinstance(raw, bytes):
         raw = raw.encode('utf-8')
     return raw
+
+
+if __name__ == '__main__':
+    import sys
+    f = sys.argv[-1]
+    raw = open(f).read()
+    if f.endswith('.html'):
+        from calibre.ebooks.BeautifulSoup import BeautifulSoup
+        soup = BeautifulSoup(raw)
+        print(extract_html(soup))
+    else:
+        print(json_to_html(raw))
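
Note (not part of the patch): the snippet below is a minimal synthetic payload that exercises the new live-article code path. Every key name is inferred from the parsing code above, not taken from a real NYT live page; in particular the ROOT_QUERY entry name and the id strings are made up, and the import assumes the patched module is importable in a calibre environment.

# Sketch only: a made-up payload shaped the way the new code reads it.
import json

from calibre.web.site_parsers.nytimes import json_to_html  # assumes the patched module is on the path

storyline = {
    'displayName': 'Example live briefing',
    'experimentalJsonBlob': json.dumps({'data': [{'data': [
        # list-valued block: each entry carries its own 'type'
        {'data': [{'type': 'text', 'value': 'First update.'}]},
        # dict-valued block: handled by the item_type == '' branch
        {'data': {'title': 'Summary',
                  'items': [{'subtitle': 'Point one', 'text': 'Details here.'}]}},
    ]}]}),
}
state = {
    'ROOT_QUERY': {'workOrLocation': {'id': 'Article:1'}},  # key name is a guess; only the nested 'id' lookup matters
    'Article:1': {'storylines': [{'id': 'StorylineRef:1'}]},
    'StorylineRef:1': {'storyline': {'id': 'Storyline:1'}},
    'Storyline:1': storyline,
}
# 'initialData': None is what triggers the new TypeError fallback in json_to_html()
raw = json.dumps({'initialData': None, 'initialState': state})
print(json_to_html(raw))

The new __main__ block at the end of the patch gives the same kind of smoke test from the command line: point the module at a saved .json payload (or a saved article .html page) and it prints the generated markup.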