mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Attempt to parse nytimes live markup
This commit is contained in:
parent
8304e5911e
commit
1216e45580
@ -93,8 +93,6 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
conversion_options = {'flow_size': 0}
|
conversion_options = {'flow_size': 0}
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
if '/live/' in url:
|
|
||||||
self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
|
|
||||||
if not hasattr(self, 'nyt_parser'):
|
if not hasattr(self, 'nyt_parser'):
|
||||||
from calibre.live import load_module
|
from calibre.live import load_module
|
||||||
m = load_module('calibre.web.site_parsers.nytimes')
|
m = load_module('calibre.web.site_parsers.nytimes')
|
||||||
|
@ -93,8 +93,6 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
conversion_options = {'flow_size': 0}
|
conversion_options = {'flow_size': 0}
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
if '/live/' in url:
|
|
||||||
self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
|
|
||||||
if not hasattr(self, 'nyt_parser'):
|
if not hasattr(self, 'nyt_parser'):
|
||||||
from calibre.live import load_module
|
from calibre.live import load_module
|
||||||
m = load_module('calibre.web.site_parsers.nytimes')
|
m = load_module('calibre.web.site_parsers.nytimes')
|
||||||
|
@ -5,11 +5,13 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from xml.sax.saxutils import escape, quoteattr
|
from xml.sax.saxutils import escape, quoteattr
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
|
|
||||||
|
|
||||||
module_version = 1 # needed for live updates
|
module_version = 2 # needed for live updates
|
||||||
|
pprint
|
||||||
|
|
||||||
|
|
||||||
def is_heading(tn):
|
def is_heading(tn):
|
||||||
@ -99,7 +101,11 @@ def process_image_block(lines, block):
|
|||||||
def json_to_html(raw):
|
def json_to_html(raw):
|
||||||
data = json.loads(raw.replace(':undefined', ':null'))
|
data = json.loads(raw.replace(':undefined', ':null'))
|
||||||
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
|
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
|
||||||
|
try:
|
||||||
data = data['initialData']['data']
|
data = data['initialData']['data']
|
||||||
|
except TypeError:
|
||||||
|
data = data['initialState']
|
||||||
|
return live_json_to_html(data)
|
||||||
article = next(iter(data.values()))
|
article = next(iter(data.values()))
|
||||||
body = article['sprinkledBody']['content']
|
body = article['sprinkledBody']['content']
|
||||||
lines = []
|
lines = []
|
||||||
@ -114,6 +120,65 @@ def json_to_html(raw):
|
|||||||
return '<html><body>' + '\n'.join(lines) + '</body></html>'
|
return '<html><body>' + '\n'.join(lines) + '</body></html>'
|
||||||
|
|
||||||
|
|
||||||
|
def add_live_item(item, item_type, lines):
    """Append the HTML for one NYT "live" item to ``lines``.

    :param item: dict describing the item. For typed items the payload is
        under ``item['value']``; for untyped blocks (``item_type == ''``) the
        dict itself carries the optional keys ``title``, ``imageUrl`` and
        ``leadIn`` followed by one of ``items``, ``bulletedList`` or
        ``sections``.
    :param item_type: one of ``'text'``, ``'list'``, ``'bulletedList'``,
        ``'items'``, ``'section'`` or ``''`` (untyped block).
    :param lines: list of HTML fragments, appended to in place.
    :raises Exception: for an unrecognised ``item_type``, or for an untyped
        block that has none of ``items``/``bulletedList``/``sections``.

    Text values are concatenated into the markup as-is (only the image URL is
    attribute-quoted).
    NOTE(review): presumably the values already contain HTML -- confirm.
    """
    a = lines.append
    if item_type == 'text':
        a('<p>' + item['value'] + '</p>')
    elif item_type == 'list':
        a('<li>' + item['value'] + '</li>')
    elif item_type == 'bulletedList':
        a('<ul>')
        for x in item['value']:
            a('<li>' + x + '</li>')
        a('</ul>')
    elif item_type == 'items':
        # Each entry is a subtitle followed by a paragraph of text.
        for x in item['value']:
            a('<h5>' + x['subtitle'] + '</h5>')
            add_live_item({'value': x['text']}, 'text', lines)
    elif item_type == 'section':
        # Children carry their own 'type'; use a distinct loop name so the
        # ``item`` parameter is not rebound while being iterated.
        for child in item['value']:
            add_live_item(child, child['type'], lines)
    elif item_type == '':
        b = item
        if b.get('title'):
            a('<h3>' + b['title'] + '</h3>')
        if b.get('imageUrl'):
            a('<div><img src=' + quoteattr(b['imageUrl']) + '/></div>')
        if b.get('leadIn'):
            a('<p>' + b['leadIn'] + '</p>')
        # Exactly one content container is rendered, checked in this order.
        if 'items' in b:
            add_live_item({'value': b['items']}, 'items', lines)
            return
        if 'bulletedList' in b:
            add_live_item({'value': b['bulletedList']}, 'bulletedList', lines)
            return
        if 'sections' in b:
            for section in b['sections']:
                add_live_item({'value': section['section']}, 'section', lines)
            return
        raise Exception('Unknown item: %s' % (b,))
    else:
        # Report the offending item itself; ``b`` is not bound on this path.
        raise Exception('Unknown item: %s' % (item,))
|
||||||
|
|
||||||
|
|
||||||
|
def live_json_to_html(data):
    """Render the preloaded state of an NYT "live" page as an HTML document.

    :param data: mapping of object ids to records. ``data['ROOT_QUERY']`` is
        scanned for dict values with an ``'id'`` key; the record that id
        points to is the root. If several values qualify, the last one wins.
    :returns: ``'<html><body>...</body></html>'`` containing an ``<h1>`` with
        the escaped storyline ``displayName`` followed by the rendered blocks
        from the storyline's ``experimentalJsonBlob`` JSON string.
    :raises ValueError: if no value in ``ROOT_QUERY`` references a record.
    """
    root = None
    for v in data['ROOT_QUERY'].values():
        if isinstance(v, dict) and 'id' in v:
            root = data[v['id']]
    if root is None:
        # Fail with a clear message rather than an unbound-name error below.
        raise ValueError('No root object reference found in ROOT_QUERY')

    # root -> first storyline reference -> storyline record
    s = data[root['storylines'][0]['id']]
    s = data[s['storyline']['id']]
    title = s['displayName']
    lines = ['<h1>' + escape(title) + '</h1>']
    for b in json.loads(s['experimentalJsonBlob'])['data'][0]['data']:
        b = b['data']
        if isinstance(b, list):
            # A list holds typed items.
            for x in b:
                add_live_item(x, x['type'], lines)
        else:
            # Anything else is rendered as an untyped block.
            add_live_item(b, '', lines)
    return '<html><body>' + '\n'.join(lines) + '</body></html>'
|
||||||
|
|
||||||
|
|
||||||
def extract_html(soup):
|
def extract_html(soup):
|
||||||
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
|
||||||
script = type(u'')(script)
|
script = type(u'')(script)
|
||||||
@ -134,3 +199,15 @@ def download_url(url, br):
|
|||||||
if not isinstance(raw, bytes):
|
if not isinstance(raw, bytes):
|
||||||
raw = raw.encode('utf-8')
|
raw = raw.encode('utf-8')
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual test entry point: convert the file named by the last command line
    # argument and print the resulting HTML. A path ending in '.html' is
    # treated as a saved page and goes through extract_html(); anything else
    # is passed to json_to_html() as raw JSON.
    import sys
    f = sys.argv[-1]
    # Read as bytes and decode explicitly so the handle is closed promptly
    # and the result does not depend on the locale's default encoding.
    with open(f, 'rb') as src:
        raw = src.read().decode('utf-8')
    if f.endswith('.html'):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup
        soup = BeautifulSoup(raw)
        print(extract_html(soup))
    else:
        print(json_to_html(raw))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user