mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Don't fail to download interactive articles from the Economist
They don't work well and look pretty awful, but better than nothing
This commit is contained in:
parent
bff96f44c2
commit
23e52f2cba
@ -53,9 +53,16 @@ def safe_dict(data, *names):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
class JSONHasNoContent(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def load_article_from_json(raw, root):
|
def load_article_from_json(raw, root):
|
||||||
|
# open('/t/raw.json', 'w').write(raw)
|
||||||
|
try:
|
||||||
data = json.loads(raw)['props']['pageProps']['content']
|
data = json.loads(raw)['props']['pageProps']['content']
|
||||||
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
|
except KeyError as e:
|
||||||
|
raise JSONHasNoContent(e)
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
data = data[0]
|
data = data[0]
|
||||||
body = root.xpath('//body')[0]
|
body = root.xpath('//body')[0]
|
||||||
@ -77,6 +84,20 @@ def load_article_from_json(raw, root):
|
|||||||
process_node(node, article)
|
process_node(node, article)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_html_article(root):
|
||||||
|
main = root.xpath('//main')[0]
|
||||||
|
body = root.xpath('//body')[0]
|
||||||
|
for child in tuple(body):
|
||||||
|
body.remove(child)
|
||||||
|
body.append(main)
|
||||||
|
main.set('id', '')
|
||||||
|
main.tag = 'article'
|
||||||
|
for x in root.xpath('//*[@style]'):
|
||||||
|
x.set('style', '')
|
||||||
|
for x in root.xpath('//button'):
|
||||||
|
x.getparent().remove(x)
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
|
def classes(classes):
|
||||||
q = frozenset(classes.split(' '))
|
q = frozenset(classes.split(' '))
|
||||||
return dict(attrs={
|
return dict(attrs={
|
||||||
@ -104,6 +125,7 @@ class Economist(BasicNewsRecipe):
|
|||||||
|
|
||||||
title = 'The Economist'
|
title = 'The Economist'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
|
||||||
__author__ = "Kovid Goyal"
|
__author__ = "Kovid Goyal"
|
||||||
description = (
|
description = (
|
||||||
@ -208,7 +230,10 @@ class Economist(BasicNewsRecipe):
|
|||||||
root = parse(raw)
|
root = parse(raw)
|
||||||
script = root.xpath('//script[@id="__NEXT_DATA__"]')
|
script = root.xpath('//script[@id="__NEXT_DATA__"]')
|
||||||
if script:
|
if script:
|
||||||
|
try:
|
||||||
load_article_from_json(script[0].text, root)
|
load_article_from_json(script[0].text, root)
|
||||||
|
except JSONHasNoContent:
|
||||||
|
cleanup_html_article(root)
|
||||||
for div in root.xpath('//div[@class="lazy-image"]'):
|
for div in root.xpath('//div[@class="lazy-image"]'):
|
||||||
noscript = list(div.iter('noscript'))
|
noscript = list(div.iter('noscript'))
|
||||||
if noscript and noscript[0].text:
|
if noscript and noscript[0].text:
|
||||||
@ -248,7 +273,7 @@ class Economist(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
# return [('Articles', [{'title':'test',
|
# return [('Articles', [{'title':'test',
|
||||||
# 'url':'https://www.economist.com/economic-and-financial-indicators/2022/04/23/economic-data-commodities-and-markets'
|
# 'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress'
|
||||||
# }])]
|
# }])]
|
||||||
if edition_date:
|
if edition_date:
|
||||||
url = 'https://www.economist.com/weeklyedition/' + edition_date
|
url = 'https://www.economist.com/weeklyedition/' + edition_date
|
||||||
|
@ -53,9 +53,16 @@ def safe_dict(data, *names):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
class JSONHasNoContent(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def load_article_from_json(raw, root):
|
def load_article_from_json(raw, root):
|
||||||
|
# open('/t/raw.json', 'w').write(raw)
|
||||||
|
try:
|
||||||
data = json.loads(raw)['props']['pageProps']['content']
|
data = json.loads(raw)['props']['pageProps']['content']
|
||||||
# open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
|
except KeyError as e:
|
||||||
|
raise JSONHasNoContent(e)
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
data = data[0]
|
data = data[0]
|
||||||
body = root.xpath('//body')[0]
|
body = root.xpath('//body')[0]
|
||||||
@ -77,6 +84,20 @@ def load_article_from_json(raw, root):
|
|||||||
process_node(node, article)
|
process_node(node, article)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_html_article(root):
|
||||||
|
main = root.xpath('//main')[0]
|
||||||
|
body = root.xpath('//body')[0]
|
||||||
|
for child in tuple(body):
|
||||||
|
body.remove(child)
|
||||||
|
body.append(main)
|
||||||
|
main.set('id', '')
|
||||||
|
main.tag = 'article'
|
||||||
|
for x in root.xpath('//*[@style]'):
|
||||||
|
x.set('style', '')
|
||||||
|
for x in root.xpath('//button'):
|
||||||
|
x.getparent().remove(x)
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
|
def classes(classes):
|
||||||
q = frozenset(classes.split(' '))
|
q = frozenset(classes.split(' '))
|
||||||
return dict(attrs={
|
return dict(attrs={
|
||||||
@ -104,6 +125,7 @@ class Economist(BasicNewsRecipe):
|
|||||||
|
|
||||||
title = 'The Economist'
|
title = 'The Economist'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
encoding = 'utf-8'
|
||||||
|
|
||||||
__author__ = "Kovid Goyal"
|
__author__ = "Kovid Goyal"
|
||||||
description = (
|
description = (
|
||||||
@ -208,7 +230,10 @@ class Economist(BasicNewsRecipe):
|
|||||||
root = parse(raw)
|
root = parse(raw)
|
||||||
script = root.xpath('//script[@id="__NEXT_DATA__"]')
|
script = root.xpath('//script[@id="__NEXT_DATA__"]')
|
||||||
if script:
|
if script:
|
||||||
|
try:
|
||||||
load_article_from_json(script[0].text, root)
|
load_article_from_json(script[0].text, root)
|
||||||
|
except JSONHasNoContent:
|
||||||
|
cleanup_html_article(root)
|
||||||
for div in root.xpath('//div[@class="lazy-image"]'):
|
for div in root.xpath('//div[@class="lazy-image"]'):
|
||||||
noscript = list(div.iter('noscript'))
|
noscript = list(div.iter('noscript'))
|
||||||
if noscript and noscript[0].text:
|
if noscript and noscript[0].text:
|
||||||
@ -248,7 +273,7 @@ class Economist(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
# return [('Articles', [{'title':'test',
|
# return [('Articles', [{'title':'test',
|
||||||
# 'url':'https://www.economist.com/economic-and-financial-indicators/2022/04/23/economic-data-commodities-and-markets'
|
# 'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress'
|
||||||
# }])]
|
# }])]
|
||||||
if edition_date:
|
if edition_date:
|
||||||
url = 'https://www.economist.com/weeklyedition/' + edition_date
|
url = 'https://www.economist.com/weeklyedition/' + edition_date
|
||||||
|
Loading…
x
Reference in New Issue
Block a user