From 87391bff5c266af44102b9eca2d63d70ea7a8581 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 19 Apr 2025 20:05:51 +0530 Subject: [PATCH 1/2] Update economist_news.recipe --- recipes/economist_news.recipe | 199 ++++++++++++++++++++-------------- 1 file changed, 115 insertions(+), 84 deletions(-) diff --git a/recipes/economist_news.recipe b/recipes/economist_news.recipe index cf2223dc85..28a927dbc4 100644 --- a/recipes/economist_news.recipe +++ b/recipes/economist_news.recipe @@ -10,38 +10,10 @@ from uuid import uuid4 from html5_parser import parse from lxml import etree -from calibre import replace_entities from calibre.ebooks.BeautifulSoup import NavigableString, Tag -from calibre.ptempfile import PersistentTemporaryFile from calibre.web.feeds.news import BasicNewsRecipe -def E(parent, name, text='', **attrs): - ans = parent.makeelement(name, **attrs) - ans.text = text - parent.append(ans) - return ans - - -def process_node(node, html_parent): - ntype = node.get('type') - if ntype == 'tag': - c = html_parent.makeelement(node['name']) - c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()}) - html_parent.append(c) - for nc in node.get('children', ()): - process_node(nc, c) - elif ntype == 'text': - text = node.get('data') - if text: - text = replace_entities(text) - if len(html_parent): - t = html_parent[-1] - t.tail = (t.tail or '') + text - else: - html_parent.text = (html_parent.text or '') + text - - def safe_dict(data, *names): ans = data for x in names: @@ -49,37 +21,86 @@ def safe_dict(data, *names): return ans -class JSONHasNoContent(ValueError): - pass +def process_web_list(li_node): + li_html = '' + for li in li_node['items']: + if li.get('textHtml'): + li_html += f'
  • {li.get("textHtml")}
  • ' + else: + li_html += f'
  • {li.get("text", "")}
  • ' + return li_html -def load_article_from_json(raw, root): +def process_info_box(bx): + info = '' + for x in safe_dict(bx, 'components'): + info += f'
    {process_web_node(x)}
    ' + return info + + +def process_web_node(node): + ntype = node.get('type', '') + if ntype == 'CROSSHEAD': + if node.get('textHtml'): + return f'

    {node.get("textHtml")}

    ' + return f'

    {node.get("text", "")}

    ' + elif ntype in ['PARAGRAPH', 'BOOK_INFO']: + if node.get('textHtml'): + return f'

    {node.get("textHtml")}

    ' + return f'

    {node.get("text", "")}

    ' + elif ntype == 'IMAGE': + alt = '' if node.get('altText') is None else node.get('altText') + cap = '' + if node.get('caption'): + if node['caption'].get('textHtml') is not None: + cap = node['caption']['textHtml'] + return f'
    {cap}
    ' + elif ntype == 'PULL_QUOTE': + if node.get('textHtml'): + return f'
    {node.get("textHtml")}
    ' + return f'
    {node.get("text", "")}
    ' + elif ntype == 'DIVIDER': + return '
    ' + elif ntype == 'INFOGRAPHIC': + if node.get('fallback'): + return process_web_node(node['fallback']) + elif ntype == 'INFOBOX': + return process_info_box(node) + elif ntype == 'UNORDERED_LIST': + if node.get('items'): + return process_web_list(node) + elif ntype: + print('** ', ntype) + return '' + + +def load_article_from_web_json(raw): # open('/t/raw.json', 'w').write(raw) - data = json.loads(raw) - body = root.xpath('//body')[0] - article = E(body, 'article') - E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;') - E(article, 'h1', data['title'], title=safe_dict(data, 'url', 'canonical') or '') - E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;') + body = '' + try: + data = json.loads(raw)['props']['pageProps']['cp2Content'] + except Exception: + data = json.loads(raw)['props']['pageProps']['content'] + body += f'
    {data.get("flyTitle", "")}
    ' + body += f'

    {data["headline"]}

    ' + if data.get('rubric') and data.get('rubric') is not None: + body += f'
    {data.get("rubric", "")}
    ' try: date = data['dateModified'] except Exception: date = data['datePublished'] dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone) dt = dt.strftime('%b %d, %Y %I:%M %p') - if data['dateline'] is None: - E(article, 'p', dt, style='color: gray; font-size:small;') + if data.get('dateline') is None: + body += f'

    {dt}

    ' else: - E(article, 'p', dt + ' | ' + (data['dateline']), style='color: gray; font-size:small;') - main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') + body += f'

    {dt + " | " + (data["dateline"])}

    ' + main_image_url = safe_dict(data, 'leadComponent') or '' if main_image_url: - div = E(article, 'div') - try: - E(div, 'img', src=main_image_url) - except Exception: - pass - for node in data.get('text') or (): - process_node(node, article) + body += process_web_node(data['leadComponent']) + for node in data.get('body'): + body += process_web_node(node) + return '
    ' + body + '
    ' def cleanup_html_article(root): @@ -172,6 +193,8 @@ class EconomistNews(BasicNewsRecipe): needs_subscription = False + from_web = False + recipe_specific_options = { 'days': { 'short': 'Oldest article to download from this news source. In days ', @@ -192,16 +215,25 @@ class EconomistNews(BasicNewsRecipe): self.oldest_article = float(d) def get_browser(self, *args, **kwargs): - kwargs['user_agent'] = 'TheEconomist-Lamarr-android' - br = BasicNewsRecipe.get_browser(self, *args, **kwargs) - br.addheaders += [ - ('accept', '*/*'), - ('content-type', 'application/json'), - ('apollographql-client-name', 'mobile-app-apollo'), - ('apollographql-client-version', '3.50.0'), - ('x-request-id', str(uuid4())), - ] - return br + if self.from_web: + kwargs['user_agent'] = ( + 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov' + ) + br = BasicNewsRecipe.get_browser(self, *args, **kwargs) + else: + kwargs['user_agent'] = 'TheEconomist-Liskov-android' + br = BasicNewsRecipe.get_browser(self, *args, **kwargs) + br.addheaders += [ + ('accept', 'multipart/mixed; deferSpec=20220824, application/json'), + ('accept-encoding', 'gzip'), + ('content-type', 'application/json'), + ('x-app-trace-id', str(uuid4())), + ('x-economist-consumer', 'TheEconomist-Liskov-android'), + ('x-teg-client-name', 'Economist-Android'), + ('x-teg-client-os', 'Android'), + ('x-teg-client-version', '4.30.0'), + ] + return br def economist_return_index(self, ans): if not ans: @@ -215,9 +247,9 @@ class EconomistNews(BasicNewsRecipe): def parse_index(self): query = { - 'query': 'query HomeQuery($homeId:String!$relatedId:String!){canonical(ref:$homeId){hasPart{parts{id title:headline cta{text url __typename}type hasPart{parts{...ArticleFragment ...VideoFragment hasPart{parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}__typename}__typename}relatedTopStories:canonical(ref:$relatedId){id title:headline hasPart(size:2 sort:"datePublished:desc"){parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment VideoFragment on Content{video{playlist{playlistId __typename}__typename}__typename}', # noqa: E501 - 'operationName': 'HomeQuery', - 'variables': '{"homeId":"/content/mgo2tcc3u3002m4gndvffg3kqv7n5n3g","relatedId":"/content/bp252fp9p9dvkn6pcjog5cks9hhnrf96"}', + 'operationName': 'FindHomepage', + 'variables': '{"homepageType":"MOBILE"}', + 'query': 'query FindHomepage($homepageType: HomepageType!) { findHomepage(homepageType: $homepageType) { __typename ...HomepageFragment } } fragment CtaFragment on Cta { link text } fragment ContentIdentityFragment on ContentIdentity { articleType forceAppWebView leadMediaType } fragment ImageTeaserFragment on ImageComponent { altText height imageType source url width } fragment NarrationFragment on Narration { album bitrate duration filename id provider url } fragment PodcastAudioFragment on PodcastEpisode { id audio { url durationInSeconds } } fragment ArticleTeaserFragment on Article { id tegId url rubric headline flyTitle brand byline dateFirstPublished dateline dateModified datePublished dateRevised estimatedReadTime wordCount printHeadline contentIdentity { __typename ...ContentIdentityFragment } section { tegId name } teaserImage { __typename type ...ImageTeaserFragment } leadComponent { __typename type ...ImageTeaserFragment } narration { __typename ...NarrationFragment } podcast { __typename ...PodcastAudioFragment } } fragment OverridesFragment on Overrides { flyTitle headline rubric teaserImage { __typename ...ImageTeaserFragment } } fragment CollectionItemFragment on CollectionItem { __typename type ... on CollectionArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionRelatedArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionExternalLinkItem { url overrides { __typename ...OverridesFragment } } } fragment HomepageFragment on Homepage { components { __typename id headline type ... on StandardCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on TakeoverCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on DiscoverRailCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on TopStoriesCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on EmbedsCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on CarouselCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } imageLayout variant } ... on VideoCarouselCollection { cta { __typename ...CtaFragment } playlistId source fallbackStoryLink { sourceId } } ... on CoverPackageCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on LatestEditionCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on Newsletter { slug items { __typename ...CollectionItemFragment } } } }', # noqa: E501 } url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote) try: @@ -228,41 +260,39 @@ class EconomistNews(BasicNewsRecipe): return self.economist_return_index(ans) def economist_parse_index(self, raw): - data = json.loads(raw)['data']['canonical']['hasPart']['parts'] + data = json.loads(raw)['data']['findHomepage']['components'] feeds = [] for part in data: - section = part.get('title', 'Articles') + if not part.get('items'): + continue + section = part.get('headline', 'Articles') self.log(section) articles = [] - for art in part['hasPart']['parts']: - title = safe_dict(art, 'title') - desc = safe_dict(art, 'rubric') or '' - sub = safe_dict(art, 'flyTitle') or '' + for art in part['items']: + if not art.get('article'): + continue + arts = art['article'] + title = safe_dict(arts, 'headline') + desc = safe_dict(arts, 'rubric') or '' + sub = safe_dict(arts, 'flyTitle') or '' if sub and section != sub: desc = sub + ' :: ' + desc - if not art.get('text'): - continue - date_ = art['datePublished'] + date_ = arts['datePublished'] dt = datetime.fromisoformat(date_[:-1]) + timedelta(seconds=time.timezone) if (datetime.now() - dt) > timedelta(self.oldest_article): continue - pt = PersistentTemporaryFile('.html') - pt.write(json.dumps(art).encode('utf-8')) - pt.close() - url = 'file:///' + pt.name + url = process_url(arts['url']) articles.append({'title': title, 'url': url, 'description': desc}) self.log('\t', title, '\n\t\t', desc) if articles: feeds.append((section, articles)) + self.from_web = True return feeds - def populate_article_metadata(self, article, soup, first): - article.url = soup.find('h1')['title'] - def preprocess_html(self, soup): width = '600' w = self.recipe_specific_options.get('res') @@ -275,16 +305,17 @@ class EconomistNews(BasicNewsRecipe): def preprocess_raw_html(self, raw, url): # open('/t/raw.html', 'wb').write(raw.encode('utf-8')) - - body = '
    ' - root = parse(body) - load_article_from_json(raw, root) - + root_ = parse(raw) if '/interactive/' in url: - return ('

    ' + root.xpath('//h1')[0].text + '

    ' - 'This article is supposed to be read in a browser.' + return ('

    ' + root_.xpath('//h1')[0].text + '

    ' + 'This article is supposed to be read in a browser' '
    ') + script = root_.xpath('//script[@id="__NEXT_DATA__"]') + + html = load_article_from_web_json(script[0].text) + + root = parse(html) for div in root.xpath('//div[@class="lazy-image"]'): noscript = list(div.iter('noscript')) if noscript and noscript[0].text: From cd862d28d7aa2f7610c449e56c6e2e628c9adb48 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 19 Apr 2025 20:06:56 +0530 Subject: [PATCH 2/2] econ update user --- recipes/economist.recipe | 2 +- recipes/economist_espresso.recipe | 2 +- recipes/economist_free.recipe | 2 +- recipes/economist_search.recipe | 2 +- recipes/economist_world_ahead.recipe | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/recipes/economist.recipe b/recipes/economist.recipe index 2a2c58ec3b..25770f4313 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe): def get_browser(self, *args, **kwargs): if self.from_archive: kwargs['user_agent'] = ( - 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' + 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov' ) br = BasicNewsRecipe.get_browser(self, *args, **kwargs) else: diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe index 70447e844d..4a3c0b3e63 100644 --- a/recipes/economist_espresso.recipe +++ b/recipes/economist_espresso.recipe @@ -77,6 +77,6 @@ class Espresso(BasicNewsRecipe): def get_browser(self, *args, **kwargs): kwargs['user_agent'] = ( - 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' + 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov' ) return BasicNewsRecipe.get_browser(self, *args, **kwargs) diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe index 2a2c58ec3b..25770f4313 100644 --- a/recipes/economist_free.recipe +++ b/recipes/economist_free.recipe @@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe): def get_browser(self, *args, **kwargs): if self.from_archive: kwargs['user_agent'] = ( - 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' + 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov' ) br = BasicNewsRecipe.get_browser(self, *args, **kwargs) else: diff --git a/recipes/economist_search.recipe b/recipes/economist_search.recipe index 386c03f78e..09740561e4 100644 --- a/recipes/economist_search.recipe +++ b/recipes/economist_search.recipe @@ -185,7 +185,7 @@ class econ_search(BasicNewsRecipe): def get_browser(self, *args, **kwargs): kwargs['user_agent'] = ( - 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' + 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov' ) br = BasicNewsRecipe.get_browser(self, *args, **kwargs) return br diff --git a/recipes/economist_world_ahead.recipe b/recipes/economist_world_ahead.recipe index 8e7b59536d..37b9f0e054 100644 --- a/recipes/economist_world_ahead.recipe +++ b/recipes/economist_world_ahead.recipe @@ -203,7 +203,7 @@ class EconomistWorld(BasicNewsRecipe): def get_browser(self, *args, **kwargs): kwargs['user_agent'] = ( - 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr' + 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov' ) br = BasicNewsRecipe.get_browser(self, *args, **kwargs) return br