mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
1843 & econ news
This commit is contained in:
parent
0fcccff441
commit
f7fd4b3c8a
@ -6,8 +6,10 @@ from urllib.parse import quote, urlencode
|
|||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
|
from mechanize import Request
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre import browser
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
@ -87,7 +89,7 @@ def process_web_node(node):
|
|||||||
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
|
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
|
||||||
if node.get('textHtml'):
|
if node.get('textHtml'):
|
||||||
return f'\n<p>{node.get("textHtml")}</p>'
|
return f'\n<p>{node.get("textHtml")}</p>'
|
||||||
elif node.get('textJson'):
|
if node.get('textJson'):
|
||||||
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
|
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
|
||||||
return f'\n<p>{node.get("text", "")}</p>'
|
return f'\n<p>{node.get("text", "")}</p>'
|
||||||
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
|
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
|
||||||
@ -104,13 +106,13 @@ def process_web_node(node):
|
|||||||
elif ntype == 'PULL_QUOTE':
|
elif ntype == 'PULL_QUOTE':
|
||||||
if node.get('textHtml'):
|
if node.get('textHtml'):
|
||||||
return f'<blockquote>{node.get("textHtml")}</blockquote>'
|
return f'<blockquote>{node.get("textHtml")}</blockquote>'
|
||||||
elif node.get('textJson'):
|
if node.get('textJson'):
|
||||||
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
|
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
|
||||||
return f'<blockquote>{node.get("text", "")}</blockquote>'
|
return f'<blockquote>{node.get("text", "")}</blockquote>'
|
||||||
elif ntype == 'BLOCK_QUOTE':
|
elif ntype == 'BLOCK_QUOTE':
|
||||||
if node.get('textHtml'):
|
if node.get('textHtml'):
|
||||||
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
|
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
|
||||||
elif node.get('textJson'):
|
if node.get('textJson'):
|
||||||
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
|
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
|
||||||
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
|
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
|
||||||
elif ntype == 'DIVIDER':
|
elif ntype == 'DIVIDER':
|
||||||
@ -162,9 +164,7 @@ class NoArticles(Exception):
|
|||||||
|
|
||||||
|
|
||||||
def get_content(url_):
|
def get_content(url_):
|
||||||
from mechanize import Request
|
|
||||||
|
|
||||||
from calibre import browser
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'TheEconomist-Liskov-android',
|
'User-Agent': 'TheEconomist-Liskov-android',
|
||||||
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
|
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
|
||||||
@ -287,8 +287,6 @@ class Econ1843(BasicNewsRecipe):
|
|||||||
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
|
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
|
||||||
html = load_article_from_web_json(raw)
|
html = load_article_from_web_json(raw)
|
||||||
root = parse(html)
|
root = parse(html)
|
||||||
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
|
|
||||||
x.getparent().remove(x)
|
|
||||||
# the economist uses <small> for small caps with a custom font
|
# the economist uses <small> for small caps with a custom font
|
||||||
for init in root.xpath('//span[@data-caps="initial"]'):
|
for init in root.xpath('//span[@data-caps="initial"]'):
|
||||||
init.set('style', 'font-weight:bold;')
|
init.set('style', 'font-weight:bold;')
|
||||||
@ -296,7 +294,10 @@ class Econ1843(BasicNewsRecipe):
|
|||||||
if x.text and len(x) == 0:
|
if x.text and len(x) == 0:
|
||||||
x.text = x.text.upper()
|
x.text = x.text.upper()
|
||||||
x.tag = 'span'
|
x.tag = 'span'
|
||||||
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
|
x.set(
|
||||||
|
'style',
|
||||||
|
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;',
|
||||||
|
)
|
||||||
for h2 in root.xpath('//h2'):
|
for h2 in root.xpath('//h2'):
|
||||||
h2.tag = 'h4'
|
h2.tag = 'h4'
|
||||||
for x in root.xpath('//figcaption'):
|
for x in root.xpath('//figcaption'):
|
||||||
|
@ -8,8 +8,10 @@ from urllib.parse import quote, urlencode
|
|||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
|
from mechanize import Request
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre import browser
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
@ -89,7 +91,7 @@ def process_web_node(node):
|
|||||||
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
|
elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
|
||||||
if node.get('textHtml'):
|
if node.get('textHtml'):
|
||||||
return f'\n<p>{node.get("textHtml")}</p>'
|
return f'\n<p>{node.get("textHtml")}</p>'
|
||||||
elif node.get('textJson'):
|
if node.get('textJson'):
|
||||||
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
|
return f'\n<p>{parse_textjson(node["textJson"])}</p>'
|
||||||
return f'\n<p>{node.get("text", "")}</p>'
|
return f'\n<p>{node.get("text", "")}</p>'
|
||||||
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
|
elif (ntype == 'IMAGE') or (node.get('__typename', '') == 'ImageComponent'):
|
||||||
@ -106,13 +108,13 @@ def process_web_node(node):
|
|||||||
elif ntype == 'PULL_QUOTE':
|
elif ntype == 'PULL_QUOTE':
|
||||||
if node.get('textHtml'):
|
if node.get('textHtml'):
|
||||||
return f'<blockquote>{node.get("textHtml")}</blockquote>'
|
return f'<blockquote>{node.get("textHtml")}</blockquote>'
|
||||||
elif node.get('textJson'):
|
if node.get('textJson'):
|
||||||
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
|
return f'<blockquote>{parse_textjson(node["textJson"])}</blockquote>'
|
||||||
return f'<blockquote>{node.get("text", "")}</blockquote>'
|
return f'<blockquote>{node.get("text", "")}</blockquote>'
|
||||||
elif ntype == 'BLOCK_QUOTE':
|
elif ntype == 'BLOCK_QUOTE':
|
||||||
if node.get('textHtml'):
|
if node.get('textHtml'):
|
||||||
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
|
return f'<blockquote><i>{node.get("textHtml")}</i></blockquote>'
|
||||||
elif node.get('textJson'):
|
if node.get('textJson'):
|
||||||
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
|
return f'<blockquote><i>{parse_textjson(node["textJson"])}</i></blockquote>'
|
||||||
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
|
return f'<blockquote><i>{node.get("text", "")}</i></blockquote>'
|
||||||
elif ntype == 'DIVIDER':
|
elif ntype == 'DIVIDER':
|
||||||
@ -164,9 +166,7 @@ class NoArticles(Exception):
|
|||||||
|
|
||||||
|
|
||||||
def get_content(url_):
|
def get_content(url_):
|
||||||
from mechanize import Request
|
|
||||||
|
|
||||||
from calibre import browser
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'TheEconomist-Liskov-android',
|
'User-Agent': 'TheEconomist-Liskov-android',
|
||||||
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
|
'accept': 'multipart/mixed; deferSpec=20220824, application/json',
|
||||||
@ -326,8 +326,6 @@ class EconomistNews(BasicNewsRecipe):
|
|||||||
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
|
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
|
||||||
html = load_article_from_web_json(raw)
|
html = load_article_from_web_json(raw)
|
||||||
root = parse(html)
|
root = parse(html)
|
||||||
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
|
|
||||||
x.getparent().remove(x)
|
|
||||||
# the economist uses <small> for small caps with a custom font
|
# the economist uses <small> for small caps with a custom font
|
||||||
for init in root.xpath('//span[@data-caps="initial"]'):
|
for init in root.xpath('//span[@data-caps="initial"]'):
|
||||||
init.set('style', 'font-weight:bold;')
|
init.set('style', 'font-weight:bold;')
|
||||||
@ -335,7 +333,10 @@ class EconomistNews(BasicNewsRecipe):
|
|||||||
if x.text and len(x) == 0:
|
if x.text and len(x) == 0:
|
||||||
x.text = x.text.upper()
|
x.text = x.text.upper()
|
||||||
x.tag = 'span'
|
x.tag = 'span'
|
||||||
x.set('style', 'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;')
|
x.set(
|
||||||
|
'style',
|
||||||
|
'text-transform: uppercase; font-size: 0.85em; letter-spacing: 0.05em;'
|
||||||
|
)
|
||||||
for h2 in root.xpath('//h2'):
|
for h2 in root.xpath('//h2'):
|
||||||
h2.tag = 'h4'
|
h2.tag = 'h4'
|
||||||
for x in root.xpath('//figcaption'):
|
for x in root.xpath('//figcaption'):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user