1843 & econ news

This commit is contained in:
unkn0w7n 2025-07-31 16:54:40 +05:30
parent 0fcccff441
commit f7fd4b3c8a
2 changed files with 18 additions and 16 deletions

View File

@ -6,8 +6,10 @@ from urllib.parse import quote, urlencode
from uuid import uuid4
from html5_parser import parse
from mechanize import Request
from lxml import etree
from calibre import browser
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
@ -87,7 +89,7 @@ def process_web_node(node):
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
if node.get('textHtml'):
return f'\n<p>{node.get("textHtml")}</p>'
elif node.get('textJson'):
if node.get('textJson'):
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
return f'\n<p>{node.get("text", "")}</p>'
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
@ -104,13 +106,13 @@ def process_web_node(node):
elif ntype == 'PULL_QUOTE':
if node.get('textHtml'):
return f'<blockquote>{node.get("textHtml")}</blockquote>'
elif node.get('textJson'):
if node.get('textJson'):
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
return f'<blockquote>{node.get("text", "")}</blockquote>'
elif ntype == 'BLOCK_QUOTE':
if node.get('textHtml'):
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
elif node.get('textJson'):
if node.get('textJson'):
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
elif ntype == 'DIVIDER':
@ -162,9 +164,7 @@ class NoArticles(Exception):
def get_content(url_):
from mechanize import Request
from calibre import browser
headers = {
'User-Agent': 'TheEconomist-Liskov-android',
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
@ -287,8 +287,6 @@ class Econ1843(BasicNewsRecipe):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
html = load_article_from_web_json(raw)
root = parse(html)
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
x.getparent().remove(x)
# the economist uses <small> for small caps with a custom font
for init in root.xpath('//span[@data-caps="initial"]'):
init.set('style', 'font-weight:bold;')
@ -296,7 +294,10 @@ class Econ1843(BasicNewsRecipe):
if x.text and len(x) == 0:
x.text = x.text.upper()
x.tag = 'span'
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
x.set(
'style',
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;',
)
for h2 in root.xpath('//h2'):
h2.tag = 'h4'
for x in root.xpath('//figcaption'):

View File

@ -8,8 +8,10 @@ from urllib.parse import quote, urlencode
from uuid import uuid4
from html5_parser import parse
from mechanize import Request
from lxml import etree
from calibre import browser
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
@ -89,7 +91,7 @@ def process_web_node(node):
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
if node.get('textHtml'):
return f'\n<p>{node.get("textHtml")}</p>'
elif node.get('textJson'):
if node.get('textJson'):
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
return f'\n<p>{node.get("text", "")}</p>'
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
@ -106,13 +108,13 @@ def process_web_node(node):
elif ntype == 'PULL_QUOTE':
if node.get('textHtml'):
return f'<blockquote>{node.get("textHtml")}</blockquote>'
elif node.get('textJson'):
if node.get('textJson'):
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
return f'<blockquote>{node.get("text", "")}</blockquote>'
elif ntype == 'BLOCK_QUOTE':
if node.get('textHtml'):
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
elif node.get('textJson'):
if node.get('textJson'):
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
elif ntype == 'DIVIDER':
@ -164,9 +166,7 @@ class NoArticles(Exception):
def get_content(url_):
from mechanize import Request
from calibre import browser
headers = {
'User-Agent': 'TheEconomist-Liskov-android',
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
@ -326,8 +326,6 @@ class EconomistNews(BasicNewsRecipe):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
html = load_article_from_web_json(raw)
root = parse(html)
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
x.getparent().remove(x)
# the economist uses <small> for small caps with a custom font
for init in root.xpath('//span[@data-caps="initial"]'):
init.set('style', 'font-weight:bold;')
@ -335,7 +333,10 @@ class EconomistNews(BasicNewsRecipe):
if x.text and len(x) == 0:
x.text = x.text.upper()
x.tag = 'span'
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
x.set(
'style',
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;'
)
for h2 in root.xpath('//h2'):
h2.tag = 'h4'
for x in root.xpath('//figcaption'):