From 87391bff5c266af44102b9eca2d63d70ea7a8581 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sat, 19 Apr 2025 20:05:51 +0530
Subject: [PATCH 1/2] Update economist_news.recipe
---
recipes/economist_news.recipe | 199 ++++++++++++++++++++--------------
1 file changed, 115 insertions(+), 84 deletions(-)
diff --git a/recipes/economist_news.recipe b/recipes/economist_news.recipe
index cf2223dc85..28a927dbc4 100644
--- a/recipes/economist_news.recipe
+++ b/recipes/economist_news.recipe
@@ -10,38 +10,10 @@ from uuid import uuid4
from html5_parser import parse
from lxml import etree
-from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
-def E(parent, name, text='', **attrs):
- ans = parent.makeelement(name, **attrs)
- ans.text = text
- parent.append(ans)
- return ans
-
-
-def process_node(node, html_parent):
- ntype = node.get('type')
- if ntype == 'tag':
- c = html_parent.makeelement(node['name'])
- c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
- html_parent.append(c)
- for nc in node.get('children', ()):
- process_node(nc, c)
- elif ntype == 'text':
- text = node.get('data')
- if text:
- text = replace_entities(text)
- if len(html_parent):
- t = html_parent[-1]
- t.tail = (t.tail or '') + text
- else:
- html_parent.text = (html_parent.text or '') + text
-
-
def safe_dict(data, *names):
ans = data
for x in names:
@@ -49,37 +21,86 @@ def safe_dict(data, *names):
return ans
-class JSONHasNoContent(ValueError):
- pass
+def process_web_list(li_node):
+ li_html = ''
+ for li in li_node['items']:
+ if li.get('textHtml'):
+ li_html += f'
{li.get("textHtml")}'
+ else:
+ li_html += f'{li.get("text", "")}'
+ return li_html
-def load_article_from_json(raw, root):
+def process_info_box(bx):
+ info = ''
+ for x in safe_dict(bx, 'components'):
+ info += f'{process_web_node(x)}
'
+ return info
+
+
+def process_web_node(node):
+ ntype = node.get('type', '')
+ if ntype == 'CROSSHEAD':
+ if node.get('textHtml'):
+ return f'{node.get("textHtml")}
'
+ return f'{node.get("text", "")}
'
+ elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
+ if node.get('textHtml'):
+ return f'{node.get("textHtml")}
'
+ return f'{node.get("text", "")}
'
+ elif ntype == 'IMAGE':
+ alt = '' if node.get('altText') is None else node.get('altText')
+ cap = ''
+ if node.get('caption'):
+ if node['caption'].get('textHtml') is not None:
+ cap = node['caption']['textHtml']
+ return f'{cap}
'
+ elif ntype == 'PULL_QUOTE':
+ if node.get('textHtml'):
+ return f'{node.get("textHtml")}
'
+ return f'{node.get("text", "")}
'
+ elif ntype == 'DIVIDER':
+ return '
'
+ elif ntype == 'INFOGRAPHIC':
+ if node.get('fallback'):
+ return process_web_node(node['fallback'])
+ elif ntype == 'INFOBOX':
+ return process_info_box(node)
+ elif ntype == 'UNORDERED_LIST':
+ if node.get('items'):
+ return process_web_list(node)
+ elif ntype:
+ print('** ', ntype)
+ return ''
+
+
+def load_article_from_web_json(raw):
# open('/t/raw.json', 'w').write(raw)
- data = json.loads(raw)
- body = root.xpath('//body')[0]
- article = E(body, 'article')
- E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;')
- E(article, 'h1', data['title'], title=safe_dict(data, 'url', 'canonical') or '')
- E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;')
+ body = ''
+ try:
+ data = json.loads(raw)['props']['pageProps']['cp2Content']
+ except Exception:
+ data = json.loads(raw)['props']['pageProps']['content']
+ body += f'{data.get("flyTitle", "")}
'
+ body += f'{data["headline"]}
'
+ if data.get('rubric') and data.get('rubric') is not None:
+ body += f'{data.get("rubric", "")}
'
try:
date = data['dateModified']
except Exception:
date = data['datePublished']
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b %d, %Y %I:%M %p')
- if data['dateline'] is None:
- E(article, 'p', dt, style='color: gray; font-size:small;')
+ if data.get('dateline') is None:
+ body += f'{dt}
'
else:
- E(article, 'p', dt + ' | ' + (data['dateline']), style='color: gray; font-size:small;')
- main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
+ body += f'{dt + " | " + (data["dateline"])}
'
+ main_image_url = safe_dict(data, 'leadComponent') or ''
if main_image_url:
- div = E(article, 'div')
- try:
- E(div, 'img', src=main_image_url)
- except Exception:
- pass
- for node in data.get('text') or ():
- process_node(node, article)
+ body += process_web_node(data['leadComponent'])
+ for node in data.get('body'):
+ body += process_web_node(node)
+ return '' + body + ''
def cleanup_html_article(root):
@@ -172,6 +193,8 @@ class EconomistNews(BasicNewsRecipe):
needs_subscription = False
+ from_web = False
+
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
@@ -192,16 +215,25 @@ class EconomistNews(BasicNewsRecipe):
self.oldest_article = float(d)
def get_browser(self, *args, **kwargs):
- kwargs['user_agent'] = 'TheEconomist-Lamarr-android'
- br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
- br.addheaders += [
- ('accept', '*/*'),
- ('content-type', 'application/json'),
- ('apollographql-client-name', 'mobile-app-apollo'),
- ('apollographql-client-version', '3.50.0'),
- ('x-request-id', str(uuid4())),
- ]
- return br
+ if self.from_web:
+ kwargs['user_agent'] = (
+ 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
+ )
+ br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+ else:
+ kwargs['user_agent'] = 'TheEconomist-Liskov-android'
+ br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+ br.addheaders += [
+ ('accept', 'multipart/mixed; deferSpec=20220824, application/json'),
+ ('accept-encoding', 'gzip'),
+ ('content-type', 'application/json'),
+ ('x-app-trace-id', str(uuid4())),
+ ('x-economist-consumer', 'TheEconomist-Liskov-android'),
+ ('x-teg-client-name', 'Economist-Android'),
+ ('x-teg-client-os', 'Android'),
+ ('x-teg-client-version', '4.30.0'),
+ ]
+ return br
def economist_return_index(self, ans):
if not ans:
@@ -215,9 +247,9 @@ class EconomistNews(BasicNewsRecipe):
def parse_index(self):
query = {
- 'query': 'query HomeQuery($homeId:String!$relatedId:String!){canonical(ref:$homeId){hasPart{parts{id title:headline cta{text url __typename}type hasPart{parts{...ArticleFragment ...VideoFragment hasPart{parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}__typename}__typename}relatedTopStories:canonical(ref:$relatedId){id title:headline hasPart(size:2 sort:"datePublished:desc"){parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment VideoFragment on Content{video{playlist{playlistId __typename}__typename}__typename}', # noqa: E501
- 'operationName': 'HomeQuery',
- 'variables': '{"homeId":"/content/mgo2tcc3u3002m4gndvffg3kqv7n5n3g","relatedId":"/content/bp252fp9p9dvkn6pcjog5cks9hhnrf96"}',
+ 'operationName': 'FindHomepage',
+ 'variables': '{"homepageType":"MOBILE"}',
+ 'query': 'query FindHomepage($homepageType: HomepageType!) { findHomepage(homepageType: $homepageType) { __typename ...HomepageFragment } } fragment CtaFragment on Cta { link text } fragment ContentIdentityFragment on ContentIdentity { articleType forceAppWebView leadMediaType } fragment ImageTeaserFragment on ImageComponent { altText height imageType source url width } fragment NarrationFragment on Narration { album bitrate duration filename id provider url } fragment PodcastAudioFragment on PodcastEpisode { id audio { url durationInSeconds } } fragment ArticleTeaserFragment on Article { id tegId url rubric headline flyTitle brand byline dateFirstPublished dateline dateModified datePublished dateRevised estimatedReadTime wordCount printHeadline contentIdentity { __typename ...ContentIdentityFragment } section { tegId name } teaserImage { __typename type ...ImageTeaserFragment } leadComponent { __typename type ...ImageTeaserFragment } narration { __typename ...NarrationFragment } podcast { __typename ...PodcastAudioFragment } } fragment OverridesFragment on Overrides { flyTitle headline rubric teaserImage { __typename ...ImageTeaserFragment } } fragment CollectionItemFragment on CollectionItem { __typename type ... on CollectionArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionRelatedArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionExternalLinkItem { url overrides { __typename ...OverridesFragment } } } fragment HomepageFragment on Homepage { components { __typename id headline type ... on StandardCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on TakeoverCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on DiscoverRailCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on TopStoriesCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on EmbedsCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on CarouselCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } imageLayout variant } ... on VideoCarouselCollection { cta { __typename ...CtaFragment } playlistId source fallbackStoryLink { sourceId } } ... on CoverPackageCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on LatestEditionCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on Newsletter { slug items { __typename ...CollectionItemFragment } } } }', # noqa: E501
}
url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
try:
@@ -228,41 +260,39 @@ class EconomistNews(BasicNewsRecipe):
return self.economist_return_index(ans)
def economist_parse_index(self, raw):
- data = json.loads(raw)['data']['canonical']['hasPart']['parts']
+ data = json.loads(raw)['data']['findHomepage']['components']
feeds = []
for part in data:
- section = part.get('title', 'Articles')
+ if not part.get('items'):
+ continue
+ section = part.get('headline', 'Articles')
self.log(section)
articles = []
- for art in part['hasPart']['parts']:
- title = safe_dict(art, 'title')
- desc = safe_dict(art, 'rubric') or ''
- sub = safe_dict(art, 'flyTitle') or ''
+ for art in part['items']:
+ if not art.get('article'):
+ continue
+ arts = art['article']
+ title = safe_dict(arts, 'headline')
+ desc = safe_dict(arts, 'rubric') or ''
+ sub = safe_dict(arts, 'flyTitle') or ''
if sub and section != sub:
desc = sub + ' :: ' + desc
- if not art.get('text'):
- continue
- date_ = art['datePublished']
+ date_ = arts['datePublished']
dt = datetime.fromisoformat(date_[:-1]) + timedelta(seconds=time.timezone)
if (datetime.now() - dt) > timedelta(self.oldest_article):
continue
- pt = PersistentTemporaryFile('.html')
- pt.write(json.dumps(art).encode('utf-8'))
- pt.close()
- url = 'file:///' + pt.name
+ url = process_url(arts['url'])
articles.append({'title': title, 'url': url, 'description': desc})
self.log('\t', title, '\n\t\t', desc)
if articles:
feeds.append((section, articles))
+ self.from_web = True
return feeds
- def populate_article_metadata(self, article, soup, first):
- article.url = soup.find('h1')['title']
-
def preprocess_html(self, soup):
width = '600'
w = self.recipe_specific_options.get('res')
@@ -275,16 +305,17 @@ class EconomistNews(BasicNewsRecipe):
def preprocess_raw_html(self, raw, url):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
-
- body = ''
- root = parse(body)
- load_article_from_json(raw, root)
-
+ root_ = parse(raw)
if '/interactive/' in url:
- return ('' + root.xpath('//h1')[0].text + '
'
- 'This article is supposed to be read in a browser.'
+ return ('' + root_.xpath('//h1')[0].text + '
'
+ 'This article is supposed to be read in a browser'
'')
+ script = root_.xpath('//script[@id="__NEXT_DATA__"]')
+
+ html = load_article_from_web_json(script[0].text)
+
+ root = parse(html)
for div in root.xpath('//div[@class="lazy-image"]'):
noscript = list(div.iter('noscript'))
if noscript and noscript[0].text:
From cd862d28d7aa2f7610c449e56c6e2e628c9adb48 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sat, 19 Apr 2025 20:06:56 +0530
Subject: [PATCH 2/2] econ
update user
---
recipes/economist.recipe | 2 +-
recipes/economist_espresso.recipe | 2 +-
recipes/economist_free.recipe | 2 +-
recipes/economist_search.recipe | 2 +-
recipes/economist_world_ahead.recipe | 2 +-
5 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 2a2c58ec3b..25770f4313 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe):
def get_browser(self, *args, **kwargs):
if self.from_archive:
kwargs['user_agent'] = (
- 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+ 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
)
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
else:
diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe
index 70447e844d..4a3c0b3e63 100644
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -77,6 +77,6 @@ class Espresso(BasicNewsRecipe):
def get_browser(self, *args, **kwargs):
kwargs['user_agent'] = (
- 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+ 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
)
return BasicNewsRecipe.get_browser(self, *args, **kwargs)
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index 2a2c58ec3b..25770f4313 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe):
def get_browser(self, *args, **kwargs):
if self.from_archive:
kwargs['user_agent'] = (
- 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+ 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
)
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
else:
diff --git a/recipes/economist_search.recipe b/recipes/economist_search.recipe
index 386c03f78e..09740561e4 100644
--- a/recipes/economist_search.recipe
+++ b/recipes/economist_search.recipe
@@ -185,7 +185,7 @@ class econ_search(BasicNewsRecipe):
def get_browser(self, *args, **kwargs):
kwargs['user_agent'] = (
- 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+ 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
)
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
return br
diff --git a/recipes/economist_world_ahead.recipe b/recipes/economist_world_ahead.recipe
index 8e7b59536d..37b9f0e054 100644
--- a/recipes/economist_world_ahead.recipe
+++ b/recipes/economist_world_ahead.recipe
@@ -203,7 +203,7 @@ class EconomistWorld(BasicNewsRecipe):
def get_browser(self, *args, **kwargs):
kwargs['user_agent'] = (
- 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+ 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
)
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
return br