Update The World Ahead

unkn0w7n 2024-05-06 16:08:06 +05:30
parent 6d9d698ab2
commit 621d7d33c8
2 changed files with 230 additions and 120 deletions


@@ -2,13 +2,20 @@
 # License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
 import json
+import time
+from collections import defaultdict
+from datetime import datetime, timedelta
+from urllib.parse import quote, urlencode
 
 from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
+from calibre.ptempfile import PersistentTemporaryFile
+from calibre.utils.date import parse_only_date
 from calibre.web.feeds.news import BasicNewsRecipe
 from html5_parser import parse
 from lxml import etree
 
+use_archive = True
+
 
 def E(parent, name, text='', **attrs):
     ans = parent.makeelement(name, **attrs)
@@ -47,6 +54,35 @@ class JSONHasNoContent(ValueError):
     pass
 
 
+if use_archive:
+    def load_article_from_json(raw, root):
+        # open('/t/raw.json', 'w').write(raw)
+        data = json.loads(raw)
+        body = root.xpath('//body')[0]
+        article = E(body, 'article')
+        E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;')
+        E(article, 'h1', data['title'], title=safe_dict(data, "url", "canonical") or '')
+        E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;')
+        try:
+            date = data['dateModified']
+        except Exception:
+            date = data['datePublished']
+        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
+        dt = dt.strftime('%b %d, %Y, %I:%M %p')
+        if data['dateline'] is None:
+            E(article, 'p', dt, style='color: gray; font-size:small;')
+        else:
+            E(article, 'p', dt + ' | ' + (data['dateline']), style='color: gray; font-size:small;')
+        main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
+        if main_image_url:
+            div = E(article, 'div')
+            try:
+                E(div, 'img', src=main_image_url)
+            except Exception:
+                pass
+        for node in data.get('text') or ():
+            process_node(node, article)
+else:
     def load_article_from_json(raw, root):
         # open('/t/raw.json', 'w').write(raw)
         try:
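
A minimal standalone sketch of what the archive branch above does: the E() helper creates a child element, sets its text and attributes, and appends it to the parent, so the article DOM is composed field by field from the JSON. The helper body beyond the two lines shown in the first hunk, and the sample field values, are assumptions for illustration.

    from lxml import etree

    def E(parent, name, text='', **attrs):
        # assumed body: the diff shows only the first two lines of E()
        ans = parent.makeelement(name, **attrs)
        ans.text = text
        parent.append(ans)
        return ans

    root = etree.fromstring('<html><body></body></html>')
    body = root.xpath('//body')[0]
    article = E(body, 'article')
    E(article, 'div', 'Sample fly title', style='color: red; font-size:small; font-weight:bold;')
    E(article, 'h1', 'Sample headline', title='https://www.economist.com/example')
    print(etree.tostring(root, pretty_print=True, encoding='unicode'))
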
@@ -59,10 +95,13 @@ def load_article_from_json(raw, root):
         for child in tuple(body):
             body.remove(child)
         article = E(body, 'article')
-    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
-    E(article, 'h1', data['headline'], style='font-size: x-large')
-    E(article, 'div', data['description'], style='font-style: italic; color: #202020;')
-    E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em')
+        E(article, 'div', replace_entities(data['subheadline']), style='color: red; font-size:small; font-weight:bold;')
+        E(article, 'h1', replace_entities(data['headline']))
+        E(article, 'div', replace_entities(data['description']), style='font-style: italic; color:#202020;')
+        if data['dateline'] is None:
+            E(article, 'p', (data['datePublishedString'] or ''), style='color: gray; font-size:small;')
+        else:
+            E(article, 'p', (data['datePublishedString'] or '') + ' | ' + (data['dateline']), style='color: gray; font-size:small;')
         main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
         if main_image_url:
             div = E(article, 'div')
@@ -116,6 +155,7 @@ class Economist(BasicNewsRecipe):
     title = 'The Economist World Ahead'
     language = 'en'
     encoding = 'utf-8'
+    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'
 
     __author__ = "Kovid Goyal"
     description = (
@@ -124,29 +164,7 @@ class Economist(BasicNewsRecipe):
         ' Best downloaded in late November.'
     )
     extra_css = '''
-        .headline {font-size: x-large;}
-        h2 { font-size: small; }
-        h1 { font-size: medium; }
-        em.Bold {font-weight:bold;font-style:normal;}
-        em.Italic {font-style:italic;}
-        p.xhead {font-weight:bold;}
-        .pullquote {
-            float: right;
-            font-size: larger;
-            font-weight: bold;
-            font-style: italic;
-            page-break-inside:avoid;
-            border-bottom: 3px solid black;
-            border-top: 3px solid black;
-            width: 228px;
-            margin: 0px 0px 10px 15px;
-            padding: 7px 0px 9px;
-        }
-        .flytitle-and-title__flytitle {
-            display: block;
-            font-size: smaller;
-            color: red;
-        }
+        em { color:#202020; }
         img {display:block; margin:0 auto;}
     '''
     oldest_article = 7.0
@@ -181,6 +199,83 @@ class Economist(BasicNewsRecipe):
     needs_subscription = False
 
+    def get_browser(self, *args, **kwargs):
+        # Needed to bypass cloudflare
+        kwargs['user_agent'] = 'common_words/based'
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8')]
+        return br
+
+    def economist_test_article(self):
+        return [('Articles', [{'title':'test',
+            'url':'https://www.economist.com/the-americas/2024/04/14/elon-musk-is-feuding-with-brazils-powerful-supreme-court'
+        }])]
+
+    def economist_return_index(self, ans):
+        if not ans:
+            raise NoArticles(
+                'Could not find any articles, either the '
+                'economist.com server is having trouble and you should '
+                'try later or the website format has changed and the '
+                'recipe needs to be updated.'
+            )
+        return ans
+
+    if use_archive:
+        def parse_index(self):
+            # return self.economist_test_article()
+            soup = self.index_to_soup('https://www.economist.com/the-world-ahead')
+            script_tag = soup.find("script", id="__NEXT_DATA__")
+            if script_tag is None:
+                raise ValueError('No script tag with JSON data found in the weeklyedition archive')
+            data = json.loads(script_tag.string)
+            content_id = data['props']['pageProps']['content']['tegID'].split('/')[-1]
+            query = {
+                'query': 'query HubsDataQuery($id:String!$size:Int!){canonical(ref:$id){id headline description url{canonical __typename}image{ident{url{canonical __typename}width height __typename}__typename}text(mode:"hub" format:"json")hasPart(size:$size){parts{id title:headline isPartOf{context{title:headline __typename}__typename}hasPart{parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}__typename}__typename}}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}',  # noqa
+                'operationName': 'HubsDataQuery',
+                'variables': '{{"id":"/content/{}","size":40}}'.format(content_id),
+            }
+            url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
+            try:
+                raw = self.index_to_soup(url, raw=True)
+            except Exception:
+                raise ValueError('Server is not reachable, try again some other time.')
+            ans = self.economist_parse_index(raw)
+            return self.economist_return_index(ans)
+
+        def economist_parse_index(self, raw):
+            data = json.loads(raw)['data']['canonical']
+            self.description = data['description']
+
+            feeds_dict = defaultdict(list)
+            for part in safe_dict(data, "hasPart", "parts"):
+                section = part['title']
+                self.log(section)
+                for art in safe_dict(part, "hasPart", "parts"):
+                    title = safe_dict(art, "title")
+                    desc = safe_dict(art, "rubric") or ''
+                    sub = safe_dict(art, "flyTitle") or ''
+                    if sub and section != sub:
+                        desc = sub + ' :: ' + desc
+                    pt = PersistentTemporaryFile('.html')
+                    pt.write(json.dumps(art).encode('utf-8'))
+                    pt.close()
+                    url = 'file:///' + pt.name
+                    feeds_dict[section].append({"title": title, "url": url, "description": desc})
+                    self.log('\t', title, '\n\t\t', desc)
+            return [(section, articles) for section, articles in feeds_dict.items()]
+
+        def populate_article_metadata(self, article, soup, first):
+            article.url = soup.find('h1')['title']
+
+        def preprocess_html(self, soup):
+            for img in soup.findAll('img', src=True):
+                img['src'] = img['src'].replace('economist.com/',
+                    'economist.com/cdn-cgi/image/width=600,quality=80,format=auto/')
+            return soup
+
+    else:  # Load articles from individual article pages {{{
         def __init__(self, *args, **kwargs):
             BasicNewsRecipe.__init__(self, *args, **kwargs)
             if self.output_profile.short_name.startswith('kindle'):
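
The notable trick in economist_parse_index() above: each article's JSON, already fetched via the GraphQL hub query, is written to a temporary file and queued as a file:/// URL, so the download stage reads the saved JSON back instead of fetching each article from economist.com. A rough sketch of that handoff, using stdlib tempfile in place of calibre's PersistentTemporaryFile and a stand-in article dict:

    import json
    import tempfile

    art = {'title': 'test', 'text': []}  # stand-in for one article dict from the feed
    pt = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
    pt.write(json.dumps(art).encode('utf-8'))
    pt.close()
    url = 'file:///' + pt.name
    # the recipe queues this URL; "downloading" it hands the JSON straight
    # to preprocess_raw_html(), which builds the article HTML from it
    print(url)
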
@@ -190,52 +285,6 @@ class Economist(BasicNewsRecipe):
                 self.web2disk_options.compress_news_images_auto_size = 5
                 self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')
 
-    def get_browser(self, *args, **kwargs):
-        # Needed to bypass cloudflare
-        kwargs['user_agent'] = 'common_words/based'
-        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
-        br.addheaders += [('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8')]
-        return br
-
-    def preprocess_raw_html(self, raw, url):
-        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
-        root = parse(raw)
-        if '/interactive/' in url:
-            return '<html><body><article><h1 class="headline">' + root.xpath('//h1')[0].text + '</h1><em>' \
-                + 'This article is supposed to be read in a browser' \
-                + '</em></article></body></html>'
-        script = root.xpath('//script[@id="__NEXT_DATA__"]')
-        if script:
-            try:
-                load_article_from_json(script[0].text, root)
-            except JSONHasNoContent:
-                cleanup_html_article(root)
-        for div in root.xpath('//div[@class="lazy-image"]'):
-            noscript = list(div.iter('noscript'))
-            if noscript and noscript[0].text:
-                img = list(parse(noscript[0].text).iter('img'))
-                if img:
-                    p = noscript[0].getparent()
-                    idx = p.index(noscript[0])
-                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
-                    p.remove(noscript[0])
-        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
-            x.getparent().remove(x)
-        # the economist uses <small> for small caps with a custom font
-        for x in root.xpath('//small'):
-            if x.text and len(x) == 0:
-                x.text = x.text.upper()
-                x.tag = 'span'
-                x.set('style', 'font-variant: small-caps')
-        for x in root.xpath('//figcaption'):
-            x.set('style', 'text-align:center; font-size:small;')
-        for x in root.xpath('//cite'):
-            x.tag = 'blockquote'
-            x.set('style', 'color:#404040;')
-        raw = etree.tostring(root, encoding='unicode')
-        return raw
-
         def parse_index(self):
             # return [('Articles', [{'title':'test',
             # 'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress'
@@ -291,6 +340,58 @@ class Economist(BasicNewsRecipe):
             feeds.append((section, articles))
             return feeds
+    # }}}
+
+    def preprocess_raw_html(self, raw, url):
+        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
+        if use_archive:
+            body = '<html><body><article></article></body></html>'
+            root = parse(body)
+            load_article_from_json(raw, root)
+        else:
+            root = parse(raw)
+            script = root.xpath('//script[@id="__NEXT_DATA__"]')
+            if script:
+                try:
+                    load_article_from_json(script[0].text, root)
+                except JSONHasNoContent:
+                    cleanup_html_article(root)
+        if '/interactive/' in url:
+            return '<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>' \
+                + 'This article is supposed to be read in a browser' \
+                + '</em></article></body></html>'
+        for div in root.xpath('//div[@class="lazy-image"]'):
+            noscript = list(div.iter('noscript'))
+            if noscript and noscript[0].text:
+                img = list(parse(noscript[0].text).iter('img'))
+                if img:
+                    p = noscript[0].getparent()
+                    idx = p.index(noscript[0])
+                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
+                    p.remove(noscript[0])
+        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
+            x.getparent().remove(x)
+        # the economist uses <small> for small caps with a custom font
+        for init in root.xpath('//span[@data-caps="initial"]'):
+            init.set('style', 'font-weight:bold;')
+        for x in root.xpath('//small'):
+            if x.text and len(x) == 0:
+                x.text = x.text.upper()
+                x.tag = 'span'
+                x.set('style', 'font-variant: small-caps')
+        for h2 in root.xpath('//h2'):
+            h2.tag = 'h4'
+        for x in root.xpath('//figcaption'):
+            x.set('style', 'text-align:center; font-size:small;')
+        for x in root.xpath('//cite'):
+            x.tag = 'blockquote'
+            x.set('style', 'color:#404040;')
+        raw = etree.tostring(root, encoding='unicode')
+        return raw
+
     def eco_find_image_tables(self, soup):
         for x in soup.findAll('table', align=['right', 'center']):
             if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
@@ -318,3 +419,12 @@ class Economist(BasicNewsRecipe):
         if url.endswith('/print'):
             url = url.rpartition('/')[0]
         return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
+
+
+def get_login_cookies(username, password):
+    print(33333333333, username, password)
+
+
+if __name__ == '__main__':
+    import sys
+    get_login_cookies(sys.argv[-2], sys.argv[-1])
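
For reference, this is how the archive query URL in parse_index() is assembled: urlencode() with quote_via=quote percent-encodes the GraphQL document with %20 rather than +, while safe='()!' leaves those characters literal. A sketch with the long query document elided and an illustrative content id (both placeholders, not values from the commit):

    from urllib.parse import quote, urlencode

    content_id = 'example-id'  # illustrative; the recipe extracts this from __NEXT_DATA__
    query = {
        'query': 'query HubsDataQuery($id:String!$size:Int!){...}',  # full document as in the diff
        'operationName': 'HubsDataQuery',
        'variables': '{{"id":"/content/{}","size":40}}'.format(content_id),
    }
    url = ('https://cp2-graphql-gateway.p.aws.economist.com/graphql?'
           + urlencode(query, safe='()!', quote_via=quote))
    print(url)
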

Binary file not shown (762 B before, 2.6 KiB after).