This commit is contained in:
unkn0w7n 2025-03-16 11:40:07 +05:30
parent 3ebc50d03a
commit 26bb850d62
2 changed files with 40 additions and 5 deletions

View File

@ -11,13 +11,23 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
def process_list(li_node):
    """Render the entries of a list node as a run of ``<li>`` elements.

    Each entry in ``li_node['items']`` contributes one ``<li>``: its
    ``textHtml`` value when that is present and non-empty, otherwise its
    ``text`` value (an empty string when ``text`` is absent as well).
    No enclosing ``<ul>``/``<ol>`` tag is emitted; only the items.

    Returns an empty string when the node has no ``items``.
    """
    # ``or ()`` covers both a missing key and an explicit null/empty value,
    # so a malformed node degrades to no output instead of raising.
    entries = li_node.get('items') or ()
    return ''.join(
        f'<li>{entry.get("textHtml") or entry.get("text", "")}</li>'
        for entry in entries
    )
def process_node(node): def process_node(node):
ntype = node.get('type', '') ntype = node.get('type', '')
if ntype == 'CROSSHEAD': if ntype == 'CROSSHEAD':
if node.get('textHtml'): if node.get('textHtml'):
return f'<h4>{node.get("textHtml")}</h4>' return f'<h4>{node.get("textHtml")}</h4>'
return f'<h4>{node.get("text", "")}</h4>' return f'<h4>{node.get("text", "")}</h4>'
elif ntype == 'PARAGRAPH': elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
if node.get('textHtml'): if node.get('textHtml'):
return f'<p>{node.get("textHtml")}</p>' return f'<p>{node.get("textHtml")}</p>'
return f'<p>{node.get("text", "")}</p>' return f'<p>{node.get("text", "")}</p>'
@ -34,9 +44,15 @@ def process_node(node):
return f'<blockquote>{node.get("text", "")}</blockquote>' return f'<blockquote>{node.get("text", "")}</blockquote>'
elif ntype == 'DIVIDER': elif ntype == 'DIVIDER':
return '<hr>' return '<hr>'
elif ntype == 'INFOGRAPHIC':
if node.get('fallback'):
return process_node(node['fallback'])
elif ntype == 'INFOBOX': elif ntype == 'INFOBOX':
for x in safe_dict(node, 'components'): for x in safe_dict(node, 'components'):
return f'<blockquote>{process_node(x)}</blockquote>' return f'<blockquote>{process_node(x)}</blockquote>'
elif ntype == 'UNORDERED_LIST':
if node.get('items'):
return process_list(node)
elif ntype: elif ntype:
print('** ', ntype) print('** ', ntype)
return '' return ''
@ -121,7 +137,7 @@ def process_url(url):
class Econ1843(BasicNewsRecipe): class Econ1843(BasicNewsRecipe):
title = 'Economist 1843' title = 'Economist 1843'
language = 'en' language = 'en_GB'
encoding = 'utf-8' encoding = 'utf-8'
masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png' masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'

View File

@ -12,13 +12,23 @@ from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
def process_list(li_node):
    """Build the ``<li>`` markup for every entry of a list node.

    For each entry in ``li_node['items']`` one ``<li>`` element is
    produced, filled with the entry's ``textHtml`` when that value is
    present and non-empty, and with its ``text`` value otherwise (an
    empty string when ``text`` is absent too).  The items are returned
    without a surrounding ``<ul>``/``<ol>`` tag.

    Returns an empty string when the node has no ``items``.
    """
    # Tolerate a missing or null ``items`` value rather than raising.
    list_items = li_node.get('items') or ()
    rendered = (
        f'<li>{item.get("textHtml") or item.get("text", "")}</li>'
        for item in list_items
    )
    return ''.join(rendered)
def process_node(node): def process_node(node):
ntype = node.get('type', '') ntype = node.get('type', '')
if ntype == 'CROSSHEAD': if ntype == 'CROSSHEAD':
if node.get('textHtml'): if node.get('textHtml'):
return f'<h4>{node.get("textHtml")}</h4>' return f'<h4>{node.get("textHtml")}</h4>'
return f'<h4>{node.get("text", "")}</h4>' return f'<h4>{node.get("text", "")}</h4>'
elif ntype == 'PARAGRAPH': elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
if node.get('textHtml'): if node.get('textHtml'):
return f'<p>{node.get("textHtml")}</p>' return f'<p>{node.get("textHtml")}</p>'
return f'<p>{node.get("text", "")}</p>' return f'<p>{node.get("text", "")}</p>'
@ -35,9 +45,15 @@ def process_node(node):
return f'<blockquote>{node.get("text", "")}</blockquote>' return f'<blockquote>{node.get("text", "")}</blockquote>'
elif ntype == 'DIVIDER': elif ntype == 'DIVIDER':
return '<hr>' return '<hr>'
elif ntype == 'INFOGRAPHIC':
if node.get('fallback'):
return process_node(node['fallback'])
elif ntype == 'INFOBOX': elif ntype == 'INFOBOX':
for x in safe_dict(node, 'components'): for x in safe_dict(node, 'components'):
return f'<blockquote>{process_node(x)}</blockquote>' return f'<blockquote>{process_node(x)}</blockquote>'
elif ntype == 'UNORDERED_LIST':
if node.get('items'):
return process_list(node)
elif ntype: elif ntype:
print('** ', ntype) print('** ', ntype)
return '' return ''
@ -57,7 +73,10 @@ class JSONHasNoContent(ValueError):
def load_article_from_json(raw): def load_article_from_json(raw):
# open('/t/raw.json', 'w').write(raw) # open('/t/raw.json', 'w').write(raw)
body = '' body = ''
data = json.loads(raw)['props']['pageProps']['cp2Content'] try:
data = json.loads(raw)['props']['pageProps']['cp2Content']
except Exception:
data = json.loads(raw)['props']['pageProps']['content']
body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>' body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
body += f'<h1>{data["headline"]}</h1>' body += f'<h1>{data["headline"]}</h1>'
body += f'<div style="font-style: italic; color:#202020;">{data.get("rubric", "")}</div>' body += f'<div style="font-style: italic; color:#202020;">{data.get("rubric", "")}</div>'
@ -118,7 +137,7 @@ def process_url(url):
class EconomistWorld(BasicNewsRecipe): class EconomistWorld(BasicNewsRecipe):
title = 'The Economist World Ahead' title = 'The Economist World Ahead'
language = 'en' language = 'en_GB'
encoding = 'utf-8' encoding = 'utf-8'
masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png' masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'