This commit is contained in:
Kovid Goyal 2025-04-19 20:12:31 +05:30
commit b21ae6f411
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 120 additions and 89 deletions

View File

@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe):
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
if self.from_archive: if self.from_archive:
kwargs['user_agent'] = ( kwargs['user_agent'] = (
'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
) )
br = BasicNewsRecipe.get_browser(self, *args, **kwargs) br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
else: else:

View File

@ -77,6 +77,6 @@ class Espresso(BasicNewsRecipe):
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
kwargs['user_agent'] = ( kwargs['user_agent'] = (
'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
) )
return BasicNewsRecipe.get_browser(self, *args, **kwargs) return BasicNewsRecipe.get_browser(self, *args, **kwargs)

View File

@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe):
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
if self.from_archive: if self.from_archive:
kwargs['user_agent'] = ( kwargs['user_agent'] = (
'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
) )
br = BasicNewsRecipe.get_browser(self, *args, **kwargs) br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
else: else:

View File

@ -10,38 +10,10 @@ from uuid import uuid4
from html5_parser import parse from html5_parser import parse
from lxml import etree from lxml import etree
from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
def E(parent, name, text='', **attrs):
ans = parent.makeelement(name, **attrs)
ans.text = text
parent.append(ans)
return ans
def process_node(node, html_parent):
ntype = node.get('type')
if ntype == 'tag':
c = html_parent.makeelement(node['name'])
c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
html_parent.append(c)
for nc in node.get('children', ()):
process_node(nc, c)
elif ntype == 'text':
text = node.get('data')
if text:
text = replace_entities(text)
if len(html_parent):
t = html_parent[-1]
t.tail = (t.tail or '') + text
else:
html_parent.text = (html_parent.text or '') + text
def safe_dict(data, *names): def safe_dict(data, *names):
ans = data ans = data
for x in names: for x in names:
@ -49,37 +21,86 @@ def safe_dict(data, *names):
return ans return ans
class JSONHasNoContent(ValueError): def process_web_list(li_node):
pass li_html = ''
for li in li_node['items']:
if li.get('textHtml'):
li_html += f'<li>{li.get("textHtml")}</li>'
else:
li_html += f'<li>{li.get("text", "")}</li>'
return li_html
def load_article_from_json(raw, root): def process_info_box(bx):
info = ''
for x in safe_dict(bx, 'components'):
info += f'<blockquote>{process_web_node(x)}</blockquote>'
return info
def _node_markup(node):
    """Return the inline markup of a content node as a string.

    Prefers a truthy ``textHtml`` value and falls back to ``text``.  A missing
    or ``None`` ``text`` yields ``''`` so a JSON ``null`` is never rendered as
    the literal string ``"None"``.
    """
    rich = node.get('textHtml')
    if rich:
        return rich
    plain = node.get('text')
    return '' if plain is None else plain


def process_web_node(node):
    """Convert one article content node (a dict) into an HTML fragment.

    The node's ``type`` key selects the output:

    * ``CROSSHEAD`` -> ``<h4>``
    * ``PARAGRAPH`` / ``BOOK_INFO`` -> ``<p>``
    * ``PULL_QUOTE`` -> ``<blockquote>``
    * ``IMAGE`` -> an ``<img>`` wrapper followed by a centred caption ``<div>``
    * ``DIVIDER`` -> ``<hr>``
    * ``INFOGRAPHIC`` -> whatever its ``fallback`` node renders to
    * ``INFOBOX`` -> delegated to ``process_info_box``
    * ``UNORDERED_LIST`` -> delegated to ``process_web_list`` when it has items

    Any other non-empty type is reported on stdout and produces ``''``.

    :param node: mapping describing one content component.
    :return: HTML string; ``''`` when nothing can be rendered.
    """
    # Imported locally so the block does not depend on edits to the file's
    # import section.
    from html import escape

    ntype = node.get('type', '')

    if ntype == 'CROSSHEAD':
        return f'<h4>{_node_markup(node)}</h4>'

    if ntype in ('PARAGRAPH', 'BOOK_INFO'):
        return f'<p>{_node_markup(node)}</p>'

    if ntype == 'PULL_QUOTE':
        return f'<blockquote>{_node_markup(node)}</blockquote>'

    if ntype == 'IMAGE':
        url = node.get('url')
        if not url:
            # Without a source there is nothing to show; previously this
            # raised KeyError and aborted the whole article.
            return ''
        # Both values land inside double-quoted attributes, so quotes and
        # ampersands must be escaped to keep the tag well formed.
        src = escape(url, quote=True)
        alt = escape(node.get('altText') or '', quote=True)
        caption = node.get('caption') or {}
        cap = caption.get('textHtml')
        if cap is None:
            cap = ''
        return (
            f'<div><img src="{src}" title="{alt}"></div>'
            f'<div style="text-align:center; font-size:small;">{cap}</div>'
        )

    if ntype == 'DIVIDER':
        return '<hr>'

    if ntype == 'INFOGRAPHIC':
        fallback = node.get('fallback')
        # An infographic with no fallback is silently dropped.
        return process_web_node(fallback) if fallback else ''

    if ntype == 'INFOBOX':
        return process_info_box(node)

    if ntype == 'UNORDERED_LIST':
        return process_web_list(node) if node.get('items') else ''

    if ntype:
        # Unknown component type: surface it so support can be added later.
        print('** ', ntype)
    return ''
def load_article_from_web_json(raw):
# open('/t/raw.json', 'w').write(raw) # open('/t/raw.json', 'w').write(raw)
data = json.loads(raw) body = ''
body = root.xpath('//body')[0] try:
article = E(body, 'article') data = json.loads(raw)['props']['pageProps']['cp2Content']
E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;') except Exception:
E(article, 'h1', data['title'], title=safe_dict(data, 'url', 'canonical') or '') data = json.loads(raw)['props']['pageProps']['content']
E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;') body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
body += f'<h1>{data["headline"]}</h1>'
if data.get('rubric') and data.get('rubric') is not None:
body += f'<div style="font-style: italic; color:#202020;">{data.get("rubric", "")}</div>'
try: try:
date = data['dateModified'] date = data['dateModified']
except Exception: except Exception:
date = data['datePublished'] date = data['datePublished']
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone) dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b %d, %Y %I:%M %p') dt = dt.strftime('%b %d, %Y %I:%M %p')
if data['dateline'] is None: if data.get('dateline') is None:
E(article, 'p', dt, style='color: gray; font-size:small;') body += f'<p style="color: gray; font-size: small;">{dt}</p>'
else: else:
E(article, 'p', dt + ' | ' + (data['dateline']), style='color: gray; font-size:small;') body += f'<p style="color: gray; font-size: small;">{dt + " | " + (data["dateline"])}</p>'
main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') main_image_url = safe_dict(data, 'leadComponent') or ''
if main_image_url: if main_image_url:
div = E(article, 'div') body += process_web_node(data['leadComponent'])
try: for node in data.get('body'):
E(div, 'img', src=main_image_url) body += process_web_node(node)
except Exception: return '<html><body><article>' + body + '</article></body></html>'
pass
for node in data.get('text') or ():
process_node(node, article)
def cleanup_html_article(root): def cleanup_html_article(root):
@ -172,6 +193,8 @@ class EconomistNews(BasicNewsRecipe):
needs_subscription = False needs_subscription = False
from_web = False
recipe_specific_options = { recipe_specific_options = {
'days': { 'days': {
'short': 'Oldest article to download from this news source. In days ', 'short': 'Oldest article to download from this news source. In days ',
@ -192,14 +215,23 @@ class EconomistNews(BasicNewsRecipe):
self.oldest_article = float(d) self.oldest_article = float(d)
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
kwargs['user_agent'] = 'TheEconomist-Lamarr-android' if self.from_web:
kwargs['user_agent'] = (
'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
)
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
else:
kwargs['user_agent'] = 'TheEconomist-Liskov-android'
br = BasicNewsRecipe.get_browser(self, *args, **kwargs) br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
br.addheaders += [ br.addheaders += [
('accept', '*/*'), ('accept', 'multipart/mixed; deferSpec=20220824, application/json'),
('accept-encoding', 'gzip'),
('content-type', 'application/json'), ('content-type', 'application/json'),
('apollographql-client-name', 'mobile-app-apollo'), ('x-app-trace-id', str(uuid4())),
('apollographql-client-version', '3.50.0'), ('x-economist-consumer', 'TheEconomist-Liskov-android'),
('x-request-id', str(uuid4())), ('x-teg-client-name', 'Economist-Android'),
('x-teg-client-os', 'Android'),
('x-teg-client-version', '4.30.0'),
] ]
return br return br
@ -215,9 +247,9 @@ class EconomistNews(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
query = { query = {
'query': 'query HomeQuery($homeId:String!$relatedId:String!){canonical(ref:$homeId){hasPart{parts{id title:headline cta{text url __typename}type hasPart{parts{...ArticleFragment ...VideoFragment hasPart{parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}__typename}__typename}relatedTopStories:canonical(ref:$relatedId){id title:headline hasPart(size:2 sort:"datePublished:desc"){parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical 
__typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment VideoFragment on Content{video{playlist{playlistId __typename}__typename}__typename}', # noqa: E501 'operationName': 'FindHomepage',
'operationName': 'HomeQuery', 'variables': '{"homepageType":"MOBILE"}',
'variables': '{"homeId":"/content/mgo2tcc3u3002m4gndvffg3kqv7n5n3g","relatedId":"/content/bp252fp9p9dvkn6pcjog5cks9hhnrf96"}', 'query': 'query FindHomepage($homepageType: HomepageType!) { findHomepage(homepageType: $homepageType) { __typename ...HomepageFragment } } fragment CtaFragment on Cta { link text } fragment ContentIdentityFragment on ContentIdentity { articleType forceAppWebView leadMediaType } fragment ImageTeaserFragment on ImageComponent { altText height imageType source url width } fragment NarrationFragment on Narration { album bitrate duration filename id provider url } fragment PodcastAudioFragment on PodcastEpisode { id audio { url durationInSeconds } } fragment ArticleTeaserFragment on Article { id tegId url rubric headline flyTitle brand byline dateFirstPublished dateline dateModified datePublished dateRevised estimatedReadTime wordCount printHeadline contentIdentity { __typename ...ContentIdentityFragment } section { tegId name } teaserImage { __typename type ...ImageTeaserFragment } leadComponent { __typename type ...ImageTeaserFragment } narration { __typename ...NarrationFragment } podcast { __typename ...PodcastAudioFragment } } fragment OverridesFragment on Overrides { flyTitle headline rubric teaserImage { __typename ...ImageTeaserFragment } } fragment CollectionItemFragment on CollectionItem { __typename type ... on CollectionArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionRelatedArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionExternalLinkItem { url overrides { __typename ...OverridesFragment } } } fragment HomepageFragment on Homepage { components { __typename id headline type ... on StandardCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... 
on TakeoverCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on DiscoverRailCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on TopStoriesCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on EmbedsCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on CarouselCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } imageLayout variant } ... on VideoCarouselCollection { cta { __typename ...CtaFragment } playlistId source fallbackStoryLink { sourceId } } ... on CoverPackageCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on LatestEditionCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on Newsletter { slug items { __typename ...CollectionItemFragment } } } }', # noqa: E501
} }
url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote) url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
try: try:
@ -228,41 +260,39 @@ class EconomistNews(BasicNewsRecipe):
return self.economist_return_index(ans) return self.economist_return_index(ans)
def economist_parse_index(self, raw): def economist_parse_index(self, raw):
data = json.loads(raw)['data']['canonical']['hasPart']['parts'] data = json.loads(raw)['data']['findHomepage']['components']
feeds = [] feeds = []
for part in data: for part in data:
section = part.get('title', 'Articles') if not part.get('items'):
continue
section = part.get('headline', 'Articles')
self.log(section) self.log(section)
articles = [] articles = []
for art in part['hasPart']['parts']: for art in part['items']:
title = safe_dict(art, 'title') if not art.get('article'):
desc = safe_dict(art, 'rubric') or '' continue
sub = safe_dict(art, 'flyTitle') or '' arts = art['article']
title = safe_dict(arts, 'headline')
desc = safe_dict(arts, 'rubric') or ''
sub = safe_dict(arts, 'flyTitle') or ''
if sub and section != sub: if sub and section != sub:
desc = sub + ' :: ' + desc desc = sub + ' :: ' + desc
if not art.get('text'): date_ = arts['datePublished']
continue
date_ = art['datePublished']
dt = datetime.fromisoformat(date_[:-1]) + timedelta(seconds=time.timezone) dt = datetime.fromisoformat(date_[:-1]) + timedelta(seconds=time.timezone)
if (datetime.now() - dt) > timedelta(self.oldest_article): if (datetime.now() - dt) > timedelta(self.oldest_article):
continue continue
pt = PersistentTemporaryFile('.html') url = process_url(arts['url'])
pt.write(json.dumps(art).encode('utf-8'))
pt.close()
url = 'file:///' + pt.name
articles.append({'title': title, 'url': url, 'description': desc}) articles.append({'title': title, 'url': url, 'description': desc})
self.log('\t', title, '\n\t\t', desc) self.log('\t', title, '\n\t\t', desc)
if articles: if articles:
feeds.append((section, articles)) feeds.append((section, articles))
self.from_web = True
return feeds return feeds
def populate_article_metadata(self, article, soup, first):
article.url = soup.find('h1')['title']
def preprocess_html(self, soup): def preprocess_html(self, soup):
width = '600' width = '600'
w = self.recipe_specific_options.get('res') w = self.recipe_specific_options.get('res')
@ -275,16 +305,17 @@ class EconomistNews(BasicNewsRecipe):
def preprocess_raw_html(self, raw, url): def preprocess_raw_html(self, raw, url):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8')) # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
root_ = parse(raw)
body = '<html><body><article></article></body></html>'
root = parse(body)
load_article_from_json(raw, root)
if '/interactive/' in url: if '/interactive/' in url:
return ('<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>' return ('<html><body><article><h1>' + root_.xpath('//h1')[0].text + '</h1><em>'
'This article is supposed to be read in a browser.' 'This article is supposed to be read in a browser'
'</em></article></body></html>') '</em></article></body></html>')
script = root_.xpath('//script[@id="__NEXT_DATA__"]')
html = load_article_from_web_json(script[0].text)
root = parse(html)
for div in root.xpath('//div[@class="lazy-image"]'): for div in root.xpath('//div[@class="lazy-image"]'):
noscript = list(div.iter('noscript')) noscript = list(div.iter('noscript'))
if noscript and noscript[0].text: if noscript and noscript[0].text:

View File

@ -185,7 +185,7 @@ class econ_search(BasicNewsRecipe):
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
kwargs['user_agent'] = ( kwargs['user_agent'] = (
'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
) )
br = BasicNewsRecipe.get_browser(self, *args, **kwargs) br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
return br return br

View File

@ -203,7 +203,7 @@ class EconomistWorld(BasicNewsRecipe):
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
kwargs['user_agent'] = ( kwargs['user_agent'] = (
'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
) )
br = BasicNewsRecipe.get_browser(self, *args, **kwargs) br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
return br return br