This commit is contained in:
Kovid Goyal 2025-07-18 21:31:25 +05:30
commit b0bc385250
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -7,6 +7,9 @@ from datetime import datetime, timedelta
from urllib.parse import quote, urlencode from urllib.parse import quote, urlencode
from uuid import uuid4 from uuid import uuid4
from html5_parser import parse
from lxml import etree
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -45,13 +48,13 @@ def parse_txt(ty):
tag_map = { tag_map = {
'text': lambda: [ty.get('value', '')], 'text': lambda: [ty.get('value', '')],
'scaps': lambda: [ 'scaps': lambda: [
f'<span style="font-variant: all-small-caps;">{"".join(parse_txt(c))}</span>' f'<span style="text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;">{"".join(parse_txt(c))}</span>'
for c in children for c in children
], ],
'bold': lambda: [f'<b>{"".join(parse_txt(c))}</b>' for c in children], 'bold': lambda: [f'<b>{"".join(parse_txt(c))}</b>' for c in children],
'drop_caps': lambda: [f'<b>{"".join(parse_txt(c))}</b>' for c in children], 'drop_caps': lambda: [f'<b>{"".join(parse_txt(c))}</b>' for c in children],
'italic': lambda: [f'<i>{"".join(parse_txt(c))}</i>' for c in children], 'italic': lambda: [f'<i>{"".join(parse_txt(c))}</i>' for c in children],
'linebreak': lambda: ['<hr>'], 'linebreak': lambda: ['<br>'],
'external_link': lambda: [ 'external_link': lambda: [
f'<a href="{attr}">{"".join(parse_txt(children[0]))}</a>' f'<a href="{attr}">{"".join(parse_txt(children[0]))}</a>'
] ]
@ -63,6 +66,8 @@ def parse_txt(ty):
if children if children
else [], else [],
'ufinish': lambda: [text for c in children for text in parse_txt(c)], 'ufinish': lambda: [text for c in children for text in parse_txt(c)],
'subscript': lambda: [f'<sub>{"".join(parse_txt(c))}</sub>' for c in children],
'superscript': lambda: [f'<sup>{"".join(parse_txt(c))}</sup>' for c in children],
} }
if typ in tag_map: if typ in tag_map:
@ -239,7 +244,7 @@ class EconomistNews(BasicNewsRecipe):
# Rendered diff hunk: every line below shows the pre-commit text followed by the post-commit text.
# economist_test_article returns a single 'Articles' section holding one entry titled 'test'.
# The only change in this hunk is that entry's URL: a /1843/ article before, a /letters/ page after.
def economist_test_article(self): def economist_test_article(self):
return [('Articles', [{'title': 'test', return [('Articles', [{'title': 'test',
'url': 'https://www.economist.com/1843/2025/05/16/the-rise-fall-and-contested-future-of-hizbullah' 'url': 'https://www.economist.com/letters/2025/07/17/the-politicisation-of-the-federal-reserve'
}])] }])]
def economist_return_index(self, ans): def economist_return_index(self, ans):
@ -313,7 +318,27 @@ class EconomistNews(BasicNewsRecipe):
# Rendered diff hunk: the first three lines show pre-commit text followed by post-commit text;
# every later line exists only in the post-commit version (indentation was lost in rendering).
# Before: preprocess_raw_html returned load_article_from_web_json(raw) directly.
# After: that HTML is parsed, cleaned up and restyled with inline CSS, then serialised back to a string.
def preprocess_raw_html(self, raw, url): def preprocess_raw_html(self, raw, url):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8')) # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
return load_article_from_web_json(raw) html = load_article_from_web_json(raw)
root = parse(html)
# strip script, style, source and meta elements from the tree
# NOTE(review): lxml's remove() also discards the removed element's tail text -- confirm no
# article text can directly follow one of these tags, or re-attach the tail before removing
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
x.getparent().remove(x)
# the economist uses <small> for small caps with a custom font
# spans marked data-caps="initial" are rendered bold
for init in root.xpath('//span[@data-caps="initial"]'):
init.set('style', 'font-weight:bold;')
# <small> becomes a styled <span>; the custom font is replaced by an inline uppercase style
for x in root.xpath('//small'):
# upper-case the text itself only when the element has text and no child elements
if x.text and len(x) == 0:
x.text = x.text.upper()
x.tag = 'span'
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
# demote every h2 heading to h4
for h2 in root.xpath('//h2'):
h2.tag = 'h4'
# centre figure captions and shrink their font
for x in root.xpath('//figcaption'):
x.set('style', 'text-align:center; font-size:small;')
# turn <cite> into a dark-grey <blockquote>
for x in root.xpath('//cite'):
x.tag = 'blockquote'
x.set('style', 'color:#404040;')
# serialise the modified tree back to a str (encoding='unicode' yields text, not bytes)
raw = etree.tostring(root, encoding='unicode')
return raw
def get_article(self, url): def get_article(self, url):
query = { query = {