1843 & econ news

This commit is contained in:
unkn0w7n 2025-07-31 16:54:40 +05:30
parent 0fcccff441
commit f7fd4b3c8a
2 changed files with 18 additions and 16 deletions

View File

@@ -6,8 +6,10 @@ from urllib.parse import quote, urlencode
from uuid import uuid4 from uuid import uuid4
from html5_parser import parse from html5_parser import parse
from mechanize import Request
from lxml import etree from lxml import etree
from calibre import browser
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@@ -87,7 +89,7 @@ def process_web_node(node):
elif ntype in ['PARAGRAPH', 'BOOK_INFO']: elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
if node.get('textHtml'): if node.get('textHtml'):
return f'\n<p>{node.get("textHtml")}</p>' return f'\n<p>{node.get("textHtml")}</p>'
elif node.get('textJson'): if node.get('textJson'):
return f'\n<p>{parse_textjson(node["textJson"])}</p>' return f'\n<p>{parse_textjson(node["textJson"])}</p>'
return f'\n<p>{node.get("text", "")}</p>' return f'\n<p>{node.get("text", "")}</p>'
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'): elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
@@ -104,13 +106,13 @@ def process_web_node(node):
elif ntype == 'PULL_QUOTE': elif ntype == 'PULL_QUOTE':
if node.get('textHtml'): if node.get('textHtml'):
return f'<blockquote>{node.get("textHtml")}</blockquote>' return f'<blockquote>{node.get("textHtml")}</blockquote>'
elif node.get('textJson'): if node.get('textJson'):
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>' return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
return f'<blockquote>{node.get("text", "")}</blockquote>' return f'<blockquote>{node.get("text", "")}</blockquote>'
elif ntype == 'BLOCK_QUOTE': elif ntype == 'BLOCK_QUOTE':
if node.get('textHtml'): if node.get('textHtml'):
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>' return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
elif node.get('textJson'): if node.get('textJson'):
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>' return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>' return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
elif ntype == 'DIVIDER': elif ntype == 'DIVIDER':
@@ -162,9 +164,7 @@ class NoArticles(Exception):
def get_content(url_): def get_content(url_):
from mechanize import Request
from calibre import browser
headers = { headers = {
'User-Agent': 'TheEconomist-Liskov-android', 'User-Agent': 'TheEconomist-Liskov-android',
'accept': 'multipart/mixed; deferSpec=20220824, application/json', 'accept': 'multipart/mixed; deferSpec=20220824, application/json',
@@ -287,8 +287,6 @@ class Econ1843(BasicNewsRecipe):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8')) # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
html = load_article_from_web_json(raw) html = load_article_from_web_json(raw)
root = parse(html) root = parse(html)
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
x.getparent().remove(x)
# the economist uses <small> for small caps with a custom font # the economist uses <small> for small caps with a custom font
for init in root.xpath('//span[@data-caps="initial"]'): for init in root.xpath('//span[@data-caps="initial"]'):
init.set('style', 'font-weight:bold;') init.set('style', 'font-weight:bold;')
@@ -296,7 +294,10 @@ class Econ1843(BasicNewsRecipe):
if x.text and len(x) == 0: if x.text and len(x) == 0:
x.text = x.text.upper() x.text = x.text.upper()
x.tag = 'span' x.tag = 'span'
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;') x.set(
'style',
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;',
)
for h2 in root.xpath('//h2'): for h2 in root.xpath('//h2'):
h2.tag = 'h4' h2.tag = 'h4'
for x in root.xpath('//figcaption'): for x in root.xpath('//figcaption'):

View File

@@ -8,8 +8,10 @@ from urllib.parse import quote, urlencode
from uuid import uuid4 from uuid import uuid4
from html5_parser import parse from html5_parser import parse
from mechanize import Request
from lxml import etree from lxml import etree
from calibre import browser
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@@ -89,7 +91,7 @@ def process_web_node(node):
elif ntype in ['PARAGRAPH', 'BOOK_INFO']: elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
if node.get('textHtml'): if node.get('textHtml'):
return f'\n<p>{node.get("textHtml")}</p>' return f'\n<p>{node.get("textHtml")}</p>'
elif node.get('textJson'): if node.get('textJson'):
return f'\n<p>{parse_textjson(node["textJson"])}</p>' return f'\n<p>{parse_textjson(node["textJson"])}</p>'
return f'\n<p>{node.get("text", "")}</p>' return f'\n<p>{node.get("text", "")}</p>'
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'): elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
@@ -106,13 +108,13 @@ def process_web_node(node):
elif ntype == 'PULL_QUOTE': elif ntype == 'PULL_QUOTE':
if node.get('textHtml'): if node.get('textHtml'):
return f'<blockquote>{node.get("textHtml")}</blockquote>' return f'<blockquote>{node.get("textHtml")}</blockquote>'
elif node.get('textJson'): if node.get('textJson'):
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>' return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
return f'<blockquote>{node.get("text", "")}</blockquote>' return f'<blockquote>{node.get("text", "")}</blockquote>'
elif ntype == 'BLOCK_QUOTE': elif ntype == 'BLOCK_QUOTE':
if node.get('textHtml'): if node.get('textHtml'):
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>' return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
elif node.get('textJson'): if node.get('textJson'):
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>' return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>' return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
elif ntype == 'DIVIDER': elif ntype == 'DIVIDER':
@@ -164,9 +166,7 @@ class NoArticles(Exception):
def get_content(url_): def get_content(url_):
from mechanize import Request
from calibre import browser
headers = { headers = {
'User-Agent': 'TheEconomist-Liskov-android', 'User-Agent': 'TheEconomist-Liskov-android',
'accept': 'multipart/mixed; deferSpec=20220824, application/json', 'accept': 'multipart/mixed; deferSpec=20220824, application/json',
@@ -326,8 +326,6 @@ class EconomistNews(BasicNewsRecipe):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8')) # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
html = load_article_from_web_json(raw) html = load_article_from_web_json(raw)
root = parse(html) root = parse(html)
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
x.getparent().remove(x)
# the economist uses <small> for small caps with a custom font # the economist uses <small> for small caps with a custom font
for init in root.xpath('//span[@data-caps="initial"]'): for init in root.xpath('//span[@data-caps="initial"]'):
init.set('style', 'font-weight:bold;') init.set('style', 'font-weight:bold;')
@@ -335,7 +333,10 @@ class EconomistNews(BasicNewsRecipe):
if x.text and len(x) == 0: if x.text and len(x) == 0:
x.text = x.text.upper() x.text = x.text.upper()
x.tag = 'span' x.tag = 'span'
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;') x.set(
'style',
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;'
)
for h2 in root.xpath('//h2'): for h2 in root.xpath('//h2'):
h2.tag = 'h4' h2.tag = 'h4'
for x in root.xpath('//figcaption'): for x in root.xpath('//figcaption'):