Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-11-02 10:37:01 -05:00 · 2025-04-19 20:12:31 +05:30 · 2025-04-19 20:12:31 +05:30 · b21ae6f411
commit b21ae6f411
parent 27b6124c70 cd862d28d7
6 changed files with 120 additions and 89 deletions
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe):
    def get_browser(self, *args, **kwargs):
        if self.from_archive:
            kwargs['user_agent'] = (
-                'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+                'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
            )
            br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        else:
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -77,6 +77,6 @@ class Espresso(BasicNewsRecipe):

    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = (
-            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
        )
        return BasicNewsRecipe.get_browser(self, *args, **kwargs)
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -283,7 +283,7 @@ class Economist(BasicNewsRecipe):
    def get_browser(self, *args, **kwargs):
        if self.from_archive:
            kwargs['user_agent'] = (
-                'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+                'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
            )
            br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        else:
--- a/recipes/economist_news.recipe
+++ b/recipes/economist_news.recipe
@@ -10,38 +10,10 @@ from uuid import uuid4
 from html5_parser import parse
 from lxml import etree

-from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe


-def E(parent, name, text='', **attrs):
-    ans = parent.makeelement(name, **attrs)
-    ans.text = text
-    parent.append(ans)
-    return ans
-
-
-def process_node(node, html_parent):
-    ntype = node.get('type')
-    if ntype == 'tag':
-        c = html_parent.makeelement(node['name'])
-        c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
-        html_parent.append(c)
-        for nc in node.get('children', ()):
-            process_node(nc, c)
-    elif ntype == 'text':
-        text = node.get('data')
-        if text:
-            text = replace_entities(text)
-            if len(html_parent):
-                t = html_parent[-1]
-                t.tail = (t.tail or '') + text
-            else:
-                html_parent.text = (html_parent.text or '') + text
-
-
 def safe_dict(data, *names):
    ans = data
    for x in names:
@@ -49,37 +21,86 @@ def safe_dict(data, *names):
    return ans


-class JSONHasNoContent(ValueError):
-    pass
+def process_web_list(li_node):
+    li_html = ''
+    for li in li_node['items']:
+        if li.get('textHtml'):
+            li_html += f'<li>{li.get("textHtml")}</li>'
+        else:
+            li_html += f'<li>{li.get("text", "")}</li>'
+    return li_html


-def load_article_from_json(raw, root):
+def process_info_box(bx):
+    info = ''
+    for x in safe_dict(bx, 'components'):
+        info += f'<blockquote>{process_web_node(x)}</blockquote>'
+    return info
+
+
+def process_web_node(node):
+    ntype = node.get('type', '')
+    if ntype == 'CROSSHEAD':
+        if node.get('textHtml'):
+            return f'<h4>{node.get("textHtml")}</h4>'
+        return f'<h4>{node.get("text", "")}</h4>'
+    elif ntype in ['PARAGRAPH', 'BOOK_INFO']:
+        if node.get('textHtml'):
+            return f'<p>{node.get("textHtml")}</p>'
+        return f'<p>{node.get("text", "")}</p>'
+    elif ntype == 'IMAGE':
+        alt = '' if node.get('altText') is None else node.get('altText')
+        cap = ''
+        if node.get('caption'):
+            if node['caption'].get('textHtml') is not None:
+                cap = node['caption']['textHtml']
+        return f'<div><img src="{node["url"]}" title="{alt}"></div><div style="text-align:center; font-size:small;">{cap}</div>'
+    elif ntype == 'PULL_QUOTE':
+        if node.get('textHtml'):
+            return f'<blockquote>{node.get("textHtml")}</blockquote>'
+        return f'<blockquote>{node.get("text", "")}</blockquote>'
+    elif ntype == 'DIVIDER':
+        return '<hr>'
+    elif ntype == 'INFOGRAPHIC':
+        if node.get('fallback'):
+            return process_web_node(node['fallback'])
+    elif ntype == 'INFOBOX':
+        return process_info_box(node)
+    elif ntype == 'UNORDERED_LIST':
+        if node.get('items'):
+            return process_web_list(node)
+    elif ntype:
+        print('** ', ntype)
+    return ''
+
+
+def load_article_from_web_json(raw):
    # open('/t/raw.json', 'w').write(raw)
-    data = json.loads(raw)
-    body = root.xpath('//body')[0]
-    article = E(body, 'article')
-    E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;')
-    E(article, 'h1', data['title'], title=safe_dict(data, 'url', 'canonical') or '')
-    E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;')
+    body = ''
+    try:
+        data = json.loads(raw)['props']['pageProps']['cp2Content']
+    except Exception:
+        data = json.loads(raw)['props']['pageProps']['content']
+    body += f'<div style="color: red; font-size:small; font-weight:bold;">{data.get("flyTitle", "")}</div>'
+    body += f'<h1>{data["headline"]}</h1>'
+    if data.get('rubric') and data.get('rubric') is not None:
+        body += f'<div style="font-style: italic; color:#202020;">{data.get("rubric", "")}</div>'
    try:
        date = data['dateModified']
    except Exception:
        date = data['datePublished']
    dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
    dt = dt.strftime('%b %d, %Y %I:%M %p')
-    if data['dateline'] is None:
-        E(article, 'p', dt, style='color: gray; font-size:small;')
+    if data.get('dateline') is None:
+        body += f'<p style="color: gray; font-size: small;">{dt}</p>'
    else:
-        E(article, 'p', dt + ' | ' + (data['dateline']), style='color: gray; font-size:small;')
-    main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
+        body += f'<p style="color: gray; font-size: small;">{dt + " | " + (data["dateline"])}</p>'
+    main_image_url = safe_dict(data, 'leadComponent') or ''
    if main_image_url:
-        div = E(article, 'div')
-        try:
-            E(div, 'img', src=main_image_url)
-        except Exception:
-            pass
-    for node in data.get('text') or ():
-        process_node(node, article)
+        body += process_web_node(data['leadComponent'])
+    for node in data.get('body'):
+        body += process_web_node(node)
+    return '<html><body><article>' + body + '</article></body></html>'


 def cleanup_html_article(root):
@@ -172,6 +193,8 @@ class EconomistNews(BasicNewsRecipe):

    needs_subscription = False

+    from_web = False
+
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
@@ -192,16 +215,25 @@ class EconomistNews(BasicNewsRecipe):
            self.oldest_article = float(d)

    def get_browser(self, *args, **kwargs):
-        kwargs['user_agent'] = 'TheEconomist-Lamarr-android'
-        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
-        br.addheaders += [
-            ('accept', '*/*'),
-            ('content-type', 'application/json'),
-            ('apollographql-client-name', 'mobile-app-apollo'),
-            ('apollographql-client-version', '3.50.0'),
-            ('x-request-id', str(uuid4())),
-        ]
-        return br
+        if self.from_web:
+            kwargs['user_agent'] = (
+                'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
+            )
+            br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        else:
+            kwargs['user_agent'] = 'TheEconomist-Liskov-android'
+            br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+            br.addheaders += [
+                ('accept', 'multipart/mixed; deferSpec=20220824, application/json'),
+                ('accept-encoding', 'gzip'),
+                ('content-type', 'application/json'),
+                ('x-app-trace-id', str(uuid4())),
+                ('x-economist-consumer', 'TheEconomist-Liskov-android'),
+                ('x-teg-client-name', 'Economist-Android'),
+                ('x-teg-client-os', 'Android'),
+                ('x-teg-client-version', '4.30.0'),
+            ]
+            return br

    def economist_return_index(self, ans):
        if not ans:
@@ -215,9 +247,9 @@ class EconomistNews(BasicNewsRecipe):

    def parse_index(self):
        query = {
-            'query': 'query HomeQuery($homeId:String!$relatedId:String!){canonical(ref:$homeId){hasPart{parts{id title:headline cta{text url __typename}type hasPart{parts{...ArticleFragment ...VideoFragment hasPart{parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}__typename}__typename}relatedTopStories:canonical(ref:$relatedId){id title:headline hasPart(size:2 sort:"datePublished:desc"){parts{...ArticleFragment isPartOf{id context{title:headline flyTitle:subheadline rubric:description dateline image{...ImageMainFragment ...ImagePromoFragment __typename}__typename}__typename}__typename}__typename}__typename}}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on 
Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment VideoFragment on Content{video{playlist{playlistId __typename}__typename}__typename}',  # noqa: E501
-            'operationName': 'HomeQuery',
-            'variables': '{"homeId":"/content/mgo2tcc3u3002m4gndvffg3kqv7n5n3g","relatedId":"/content/bp252fp9p9dvkn6pcjog5cks9hhnrf96"}',
+            'operationName': 'FindHomepage',
+            'variables': '{"homepageType":"MOBILE"}',
+            'query': 'query FindHomepage($homepageType: HomepageType!) { findHomepage(homepageType: $homepageType) { __typename ...HomepageFragment } }  fragment CtaFragment on Cta { link text }  fragment ContentIdentityFragment on ContentIdentity { articleType forceAppWebView leadMediaType }  fragment ImageTeaserFragment on ImageComponent { altText height imageType source url width }  fragment NarrationFragment on Narration { album bitrate duration filename id provider url }  fragment PodcastAudioFragment on PodcastEpisode { id audio { url durationInSeconds } }  fragment ArticleTeaserFragment on Article { id tegId url rubric headline flyTitle brand byline dateFirstPublished dateline dateModified datePublished dateRevised estimatedReadTime wordCount printHeadline contentIdentity { __typename ...ContentIdentityFragment } section { tegId name } teaserImage { __typename type ...ImageTeaserFragment } leadComponent { __typename type ...ImageTeaserFragment } narration { __typename ...NarrationFragment } podcast { __typename ...PodcastAudioFragment } }  fragment OverridesFragment on Overrides { flyTitle headline rubric teaserImage { __typename ...ImageTeaserFragment } }  fragment CollectionItemFragment on CollectionItem { __typename type ... on CollectionArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionRelatedArticleItem { article { __typename ...ArticleTeaserFragment } overrides { __typename ...OverridesFragment } } ... on CollectionExternalLinkItem { url overrides { __typename ...OverridesFragment } } }  fragment HomepageFragment on Homepage { components { __typename id headline type ... on StandardCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on TakeoverCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... 
on DiscoverRailCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on TopStoriesCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on EmbedsCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on CarouselCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } imageLayout variant } ... on VideoCarouselCollection { cta { __typename ...CtaFragment } playlistId source fallbackStoryLink { sourceId } } ... on CoverPackageCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on LatestEditionCollection { cta { __typename ...CtaFragment } items { __typename ...CollectionItemFragment } } ... on Newsletter { slug items { __typename ...CollectionItemFragment } } } }',  # noqa: E501
        }
        url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
        try:
@@ -228,41 +260,39 @@ class EconomistNews(BasicNewsRecipe):
        return self.economist_return_index(ans)

    def economist_parse_index(self, raw):
-        data = json.loads(raw)['data']['canonical']['hasPart']['parts']
+        data = json.loads(raw)['data']['findHomepage']['components']

        feeds = []

        for part in data:
-            section = part.get('title', 'Articles')
+            if not part.get('items'):
+                continue
+            section = part.get('headline', 'Articles')
            self.log(section)

            articles = []

-            for art in part['hasPart']['parts']:
-                title = safe_dict(art, 'title')
-                desc = safe_dict(art, 'rubric') or ''
-                sub = safe_dict(art, 'flyTitle') or ''
+            for art in part['items']:
+                if not art.get('article'):
+                    continue
+                arts = art['article']
+                title = safe_dict(arts, 'headline')
+                desc = safe_dict(arts, 'rubric') or ''
+                sub = safe_dict(arts, 'flyTitle') or ''
                if sub and section != sub:
                    desc = sub + ' :: ' + desc
-                if not art.get('text'):
-                    continue
-                date_ = art['datePublished']
+                date_ = arts['datePublished']
                dt = datetime.fromisoformat(date_[:-1]) + timedelta(seconds=time.timezone)
                if (datetime.now() - dt) > timedelta(self.oldest_article):
                    continue
-                pt = PersistentTemporaryFile('.html')
-                pt.write(json.dumps(art).encode('utf-8'))
-                pt.close()
-                url = 'file:///' + pt.name
+                url = process_url(arts['url'])
                articles.append({'title': title, 'url': url, 'description': desc})
                self.log('\t', title, '\n\t\t', desc)
            if articles:
                feeds.append((section, articles))
+        self.from_web = True
        return feeds

-    def populate_article_metadata(self, article, soup, first):
-        article.url = soup.find('h1')['title']
-
    def preprocess_html(self, soup):
        width = '600'
        w = self.recipe_specific_options.get('res')
@@ -275,16 +305,17 @@ class EconomistNews(BasicNewsRecipe):

    def preprocess_raw_html(self, raw, url):
        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
-
-        body = '<html><body><article></article></body></html>'
-        root = parse(body)
-        load_article_from_json(raw, root)
-
+        root_ = parse(raw)
        if '/interactive/' in url:
-            return ('<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>'
-                    'This article is supposed to be read in a browser.'
+            return ('<html><body><article><h1>' + root_.xpath('//h1')[0].text + '</h1><em>'
+                    'This article is supposed to be read in a browser'
                    '</em></article></body></html>')

+        script = root_.xpath('//script[@id="__NEXT_DATA__"]')
+
+        html = load_article_from_web_json(script[0].text)
+
+        root = parse(html)
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
--- a/recipes/economist_search.recipe
+++ b/recipes/economist_search.recipe
@@ -185,7 +185,7 @@ class econ_search(BasicNewsRecipe):

    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = (
-            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
        )
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        return br
--- a/recipes/economist_world_ahead.recipe
+++ b/recipes/economist_world_ahead.recipe
@@ -203,7 +203,7 @@ class EconomistWorld(BasicNewsRecipe):

    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = (
-            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Liskov'
        )
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        return br