From a06773723401170d8062a979348a478ad539cb00 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 18 Jul 2025 21:28:16 +0530
Subject: [PATCH] Update economist_news.recipe
Enhanced the parse_txt function to handle 'subscript' and 'superscript' tags.
---
recipes/economist_news.recipe | 33 +++++++++++++++++++++++++++++----
1 file changed, 29 insertions(+), 4 deletions(-)
diff --git a/recipes/economist_news.recipe b/recipes/economist_news.recipe
index 793fc98d5c..7962a9067f 100644
--- a/recipes/economist_news.recipe
+++ b/recipes/economist_news.recipe
@@ -7,6 +7,9 @@ from datetime import datetime, timedelta
from urllib.parse import quote, urlencode
from uuid import uuid4
+from html5_parser import parse
+from lxml import etree
+
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
@@ -45,13 +48,13 @@ def parse_txt(ty):
tag_map = {
'text': lambda: [ty.get('value', '')],
'scaps': lambda: [
- f'{"".join(parse_txt(c))}'
+ f'{"".join(parse_txt(c))}'
for c in children
],
'bold': lambda: [f'{"".join(parse_txt(c))}' for c in children],
'drop_caps': lambda: [f'{"".join(parse_txt(c))}' for c in children],
'italic': lambda: [f'{"".join(parse_txt(c))}' for c in children],
- 'linebreak': lambda: ['
'],
+ 'linebreak': lambda: ['
'],
'external_link': lambda: [
f'{"".join(parse_txt(children[0]))}'
]
@@ -63,6 +66,8 @@ def parse_txt(ty):
if children
else [],
'ufinish': lambda: [text for c in children for text in parse_txt(c)],
+ 'subscript': lambda: [f'{"".join(parse_txt(c))}' for c in children],
+ 'superscript': lambda: [f'{"".join(parse_txt(c))}' for c in children],
}
if typ in tag_map:
@@ -239,7 +244,7 @@ class EconomistNews(BasicNewsRecipe):
def economist_test_article(self):
return [('Articles', [{'title': 'test',
- 'url': 'https://www.economist.com/1843/2025/05/16/the-rise-fall-and-contested-future-of-hizbullah'
+ 'url': 'https://www.economist.com/letters/2025/07/17/the-politicisation-of-the-federal-reserve'
}])]
def economist_return_index(self, ans):
@@ -313,7 +318,27 @@ class EconomistNews(BasicNewsRecipe):
def preprocess_raw_html(self, raw, url):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
- return load_article_from_web_json(raw)
+ html = load_article_from_web_json(raw)
+ root = parse(html)
+ for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
+ x.getparent().remove(x)
+ # the economist uses <small> for small caps with a custom font
+ for init in root.xpath('//span[@data-caps="initial"]'):
+ init.set('style', 'font-weight:bold;')
+ for x in root.xpath('//small'):
+ if x.text and len(x) == 0:
+ x.text = x.text.upper()
+ x.tag = 'span'
+ x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
+ for h2 in root.xpath('//h2'):
+ h2.tag = 'h4'
+ for x in root.xpath('//figcaption'):
+ x.set('style', 'text-align:center; font-size:small;')
+ for x in root.xpath('//cite'):
+ x.tag = 'blockquote'
+ x.set('style', 'color:#404040;')
+ raw = etree.tostring(root, encoding='unicode')
+ return raw
def get_article(self, url):
query = {