From f7fd4b3c8a5024c34679d662cfe60065edfb26c2 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:54:40 +0530 Subject: [PATCH] 1843 & econ news --- recipes/1843.recipe | 17 +++++++++-------- recipes/economist_news.recipe | 17 +++++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/recipes/1843.recipe b/recipes/1843.recipe index 44921e5d24..07f6659614 100644 --- a/recipes/1843.recipe +++ b/recipes/1843.recipe @@ -6,8 +6,10 @@ from urllib.parse import quote, urlencode from uuid import uuid4 from html5_parser import parse +from mechanize import Request from lxml import etree +from calibre import browser from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe @@ -87,7 +89,7 @@ def process_web_node(node): elif ntype in ['PARAGRAPH', 'BOOK_INFO']: if node.get('textHtml'): return f'\n
{node.get("textHtml")}
' - elif node.get('textJson'): + if node.get('textJson'): return f'\n{parse_textjson(node["textJson"])}
' return f'\n{node.get("text", "")}
' elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'): @@ -104,13 +106,13 @@ def process_web_node(node): elif ntype == 'PULL_QUOTE': if node.get('textHtml'): return f'{node.get("textHtml")}' - elif node.get('textJson'): + if node.get('textJson'): return f'
{parse_textjson(node["textJson"])}' return f'
{node.get("text", "")}' elif ntype == 'BLOCK_QUOTE': if node.get('textHtml'): return f'
{node.get("textHtml")}' - elif node.get('textJson'): + if node.get('textJson'): return f'
{parse_textjson(node["textJson"])}' return f'
{node.get("text", "")}' elif ntype == 'DIVIDER': @@ -162,9 +164,7 @@ class NoArticles(Exception): def get_content(url_): - from mechanize import Request - from calibre import browser headers = { 'User-Agent': 'TheEconomist-Liskov-android', 'accept': 'multipart/mixed; deferSpec=20220824, application/json', @@ -287,8 +287,6 @@ class Econ1843(BasicNewsRecipe): # open('/t/raw.html', 'wb').write(raw.encode('utf-8')) html = load_article_from_web_json(raw) root = parse(html) - for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'): - x.getparent().remove(x) # the economist uses for small caps with a custom font for init in root.xpath('//span[@data-caps="initial"]'): init.set('style', 'font-weight:bold;') @@ -296,7 +294,10 @@ class Econ1843(BasicNewsRecipe): if x.text and len(x) == 0: x.text = x.text.upper() x.tag = 'span' - x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;') + x.set( + 'style', + 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;', + ) for h2 in root.xpath('//h2'): h2.tag = 'h4' for x in root.xpath('//figcaption'): diff --git a/recipes/economist_news.recipe b/recipes/economist_news.recipe index af6747e721..4fad01311b 100644 --- a/recipes/economist_news.recipe +++ b/recipes/economist_news.recipe @@ -8,8 +8,10 @@ from urllib.parse import quote, urlencode from uuid import uuid4 from html5_parser import parse +from mechanize import Request from lxml import etree +from calibre import browser from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe @@ -89,7 +91,7 @@ def process_web_node(node): elif ntype in ['PARAGRAPH', 'BOOK_INFO']: if node.get('textHtml'): return f'\n
{node.get("textHtml")}
' - elif node.get('textJson'): + if node.get('textJson'): return f'\n{parse_textjson(node["textJson"])}
' return f'\n{node.get("text", "")}
' elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'): @@ -106,13 +108,13 @@ def process_web_node(node): elif ntype == 'PULL_QUOTE': if node.get('textHtml'): return f'{node.get("textHtml")}' - elif node.get('textJson'): + if node.get('textJson'): return f'
{parse_textjson(node["textJson"])}' return f'
{node.get("text", "")}' elif ntype == 'BLOCK_QUOTE': if node.get('textHtml'): return f'
{node.get("textHtml")}' - elif node.get('textJson'): + if node.get('textJson'): return f'
{parse_textjson(node["textJson"])}' return f'
{node.get("text", "")}' elif ntype == 'DIVIDER': @@ -164,9 +166,7 @@ class NoArticles(Exception): def get_content(url_): - from mechanize import Request - from calibre import browser headers = { 'User-Agent': 'TheEconomist-Liskov-android', 'accept': 'multipart/mixed; deferSpec=20220824, application/json', @@ -326,8 +326,6 @@ class EconomistNews(BasicNewsRecipe): # open('/t/raw.html', 'wb').write(raw.encode('utf-8')) html = load_article_from_web_json(raw) root = parse(html) - for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'): - x.getparent().remove(x) # the economist uses for small caps with a custom font for init in root.xpath('//span[@data-caps="initial"]'): init.set('style', 'font-weight:bold;') @@ -335,7 +333,10 @@ class EconomistNews(BasicNewsRecipe): if x.text and len(x) == 0: x.text = x.text.upper() x.tag = 'span' - x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;') + x.set( + 'style', + 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;' + ) for h2 in root.xpath('//h2'): h2.tag = 'h4' for x in root.xpath('//figcaption'):