mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
1843 & econ news
This commit is contained in:
parent
0fcccff441
commit
f7fd4b3c8a
@@ -6,8 +6,10 @@ from urllib.parse import quote, urlencode
|
||||
from uuid import uuid4
|
||||
|
||||
from html5_parser import parse
|
||||
from mechanize import Request
|
||||
from lxml import etree
|
||||
|
||||
from calibre import browser
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
@@ -87,7 +89,7 @@ def process_web_node(node):
|
||||
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
|
||||
if node.get('textHtml'):
|
||||
return f'\n<p>{node.get("textHtml")}</p>'
|
||||
elif node.get('textJson'):
|
||||
if node.get('textJson'):
|
||||
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
|
||||
return f'\n<p>{node.get("text", "")}</p>'
|
||||
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
|
||||
@@ -104,13 +106,13 @@ def process_web_node(node):
|
||||
elif ntype == 'PULL_QUOTE':
|
||||
if node.get('textHtml'):
|
||||
return f'<blockquote>{node.get("textHtml")}</blockquote>'
|
||||
elif node.get('textJson'):
|
||||
if node.get('textJson'):
|
||||
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
|
||||
return f'<blockquote>{node.get("text", "")}</blockquote>'
|
||||
elif ntype == 'BLOCK_QUOTE':
|
||||
if node.get('textHtml'):
|
||||
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
|
||||
elif node.get('textJson'):
|
||||
if node.get('textJson'):
|
||||
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
|
||||
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
|
||||
elif ntype == 'DIVIDER':
|
||||
@@ -162,9 +164,7 @@ class NoArticles(Exception):
|
||||
|
||||
|
||||
def get_content(url_):
|
||||
from mechanize import Request
|
||||
|
||||
from calibre import browser
|
||||
headers = {
|
||||
'User-Agent': 'TheEconomist-Liskov-android',
|
||||
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
|
||||
@@ -287,8 +287,6 @@ class Econ1843(BasicNewsRecipe):
|
||||
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
|
||||
html = load_article_from_web_json(raw)
|
||||
root = parse(html)
|
||||
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
|
||||
x.getparent().remove(x)
|
||||
# the economist uses <small> for small caps with a custom font
|
||||
for init in root.xpath('//span[@data-caps="initial"]'):
|
||||
init.set('style', 'font-weight:bold;')
|
||||
@@ -296,7 +294,10 @@ class Econ1843(BasicNewsRecipe):
|
||||
if x.text and len(x) == 0:
|
||||
x.text = x.text.upper()
|
||||
x.tag = 'span'
|
||||
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
|
||||
x.set(
|
||||
'style',
|
||||
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;',
|
||||
)
|
||||
for h2 in root.xpath('//h2'):
|
||||
h2.tag = 'h4'
|
||||
for x in root.xpath('//figcaption'):
|
||||
|
@@ -8,8 +8,10 @@ from urllib.parse import quote, urlencode
|
||||
from uuid import uuid4
|
||||
|
||||
from html5_parser import parse
|
||||
from mechanize import Request
|
||||
from lxml import etree
|
||||
|
||||
from calibre import browser
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
@@ -89,7 +91,7 @@ def process_web_node(node):
|
||||
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
|
||||
if node.get('textHtml'):
|
||||
return f'\n<p>{node.get("textHtml")}</p>'
|
||||
elif node.get('textJson'):
|
||||
if node.get('textJson'):
|
||||
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
|
||||
return f'\n<p>{node.get("text", "")}</p>'
|
||||
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
|
||||
@@ -106,13 +108,13 @@ def process_web_node(node):
|
||||
elif ntype == 'PULL_QUOTE':
|
||||
if node.get('textHtml'):
|
||||
return f'<blockquote>{node.get("textHtml")}</blockquote>'
|
||||
elif node.get('textJson'):
|
||||
if node.get('textJson'):
|
||||
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
|
||||
return f'<blockquote>{node.get("text", "")}</blockquote>'
|
||||
elif ntype == 'BLOCK_QUOTE':
|
||||
if node.get('textHtml'):
|
||||
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
|
||||
elif node.get('textJson'):
|
||||
if node.get('textJson'):
|
||||
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
|
||||
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
|
||||
elif ntype == 'DIVIDER':
|
||||
@@ -164,9 +166,7 @@ class NoArticles(Exception):
|
||||
|
||||
|
||||
def get_content(url_):
|
||||
from mechanize import Request
|
||||
|
||||
from calibre import browser
|
||||
headers = {
|
||||
'User-Agent': 'TheEconomist-Liskov-android',
|
||||
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
|
||||
@@ -326,8 +326,6 @@ class EconomistNews(BasicNewsRecipe):
|
||||
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
|
||||
html = load_article_from_web_json(raw)
|
||||
root = parse(html)
|
||||
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
|
||||
x.getparent().remove(x)
|
||||
# the economist uses <small> for small caps with a custom font
|
||||
for init in root.xpath('//span[@data-caps="initial"]'):
|
||||
init.set('style', 'font-weight:bold;')
|
||||
@@ -335,7 +333,10 @@ class EconomistNews(BasicNewsRecipe):
|
||||
if x.text and len(x) == 0:
|
||||
x.text = x.text.upper()
|
||||
x.tag = 'span'
|
||||
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
|
||||
x.set(
|
||||
'style',
|
||||
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;'
|
||||
)
|
||||
for h2 in root.xpath('//h2'):
|
||||
h2.tag = 'h4'
|
||||
for x in root.xpath('//figcaption'):
|
||||
|
Loading…
x
Reference in New Issue
Block a user