From 58670f2fd183149ce0736b18313827ca4ecc6133 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 18 Apr 2025 16:48:25 +0530
Subject: [PATCH 1/3] Update economist_espresso.recipe

---
 recipes/economist_espresso.recipe | 295 +++++-------------------------
 1 file changed, 49 insertions(+), 246 deletions(-)

diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe
index 18dba4922d..abbc934b32 100644
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -3,94 +3,10 @@
 https://www.economist.com/the-world-in-brief
 '''
 
-import json
-from urllib.parse import quote, urlencode
-from uuid import uuid4
+import re
 
-from html5_parser import parse
-from lxml import etree
-
-from calibre import replace_entities
-from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-def E(parent, name, text='', **attrs):
-    ans = parent.makeelement(name, **attrs)
-    ans.text = text
-    parent.append(ans)
-    return ans
-
-
-def process_node(node, html_parent):
-    ntype = node.get('type')
-    if ntype == 'tag':
-        c = html_parent.makeelement(node['name'])
-        c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
-        html_parent.append(c)
-        for nc in node.get('children', ()):
-            process_node(nc, c)
-    elif ntype == 'text':
-        text = node.get('data')
-        if text:
-            text = replace_entities(text)
-            if len(html_parent):
-                t = html_parent[-1]
-                t.tail = (t.tail or '') + text
-            else:
-                html_parent.text = (html_parent.text or '') + text
-
-
-def safe_dict(data, *names):
-    ans = data
-    for x in names:
-        ans = ans.get(x) or {}
-    return ans
-
-
-class JSONHasNoContent(ValueError):
-    pass
-
-
-def load_article_from_json(raw, root):
-    # open('/t/raw.json', 'w').write(raw)
-    data = json.loads(raw)
-    body = root.xpath('//body')[0]
-    article = E(body, 'article')
-    E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;')
-    E(article, 'h1', data['title'], title=safe_dict(data, 'url', 'canonical') or '')
-    E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;')
-    E(article, 'div', data['byline'], style='font-style: italic; color:#202020;')
-    main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
-    if main_image_url:
-        div = E(article, 'div')
-        try:
-            E(div, 'img', src=main_image_url)
-        except Exception:
-            pass
-    for node in data.get('text') or ():
-        process_node(node, article)
-
-
-def cleanup_html_article(root):
-    main = root.xpath('//main')[0]
-    body = root.xpath('//body')[0]
-    for child in tuple(body):
-        body.remove(child)
-    body.append(main)
-    main.set('id', '')
-    main.tag = 'article'
-    for x in root.xpath('//*[@style]'):
-        x.set('style', '')
-    for x in root.xpath('//button'):
-        x.getparent().remove(x)
-
-
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
 def new_tag(soup, name, attrs=()):
@@ -100,178 +16,65 @@ def new_tag(soup, name, attrs=()):
     return Tag(soup, name, attrs=attrs or None)
 
 
-class NoArticles(Exception):
-    pass
-
-
-def process_url(url):
-    if url.startswith('/'):
-        url = 'https://www.economist.com' + url
-    return url
-
-
 class Espresso(BasicNewsRecipe):
     title = 'The Economist Espresso'
-    language = 'en'
+    language = 'en_GB'
     __author__ = 'unkn0wn'
-    encoding = 'utf-8'
-    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'
-    cover_url = 'https://downloadr2.apkmirror.com/wp-content/uploads/2021/10/75/615777cc6611b.png'
     description = (
-        'Espresso is a rich, full-flavoured shot of daily global analysis'
-        ' from the editors of The Economist to get you up to speed, fast.'
-        ' Maximise your understanding of the most significant business, '
-        'economic, political and cultural developments globally.'
+        'Espresso is a rich, full-flavoured shot of daily global analysis '
+        'from the editors of The Economist to get you up to speed, fast. '
+        'Maximise your understanding of the most significant business, '
+        'economic, political and cultural developments globally.'
     )
+    cover_url = (
+        'https://downloadr2.apkmirror.com/wp-content/uploads/2021/10/75/615777cc6611b.png'
+    )
+    no_stylesheets = True
+    remove_attributes = ['height', 'width', 'style']
+    use_embedded_content = False
+    masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'
 
-    extra_css = '''
-        em { color:#202020; }
-        img {display:block; margin:0 auto;}
-    '''
+    extra_css = """
+        h1 { text-align:center; }
+        ._main-image, ._description, .sub { text-align:center; font-size:small; }
+        ._quote-container { font-size:x-large; font-style:italic; color:#202020; }
+    """
+
+    keep_only_tags = [dict(name='main', attrs={'id': 'content'})]
 
     remove_tags = [
-        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
-        dict(attrs={'aria-label': 'Article Teaser'}),
-        dict(attrs={
-                'class': [
-                    'dblClkTrk', 'ec-article-info', 'share_inline_header',
-                    'related-items', 'main-content-container', 'ec-topic-widget',
-                    'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
-                    'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
-                    'newsletter-form','share-links-header','teaser--wrapped', 'latest-updates-panel__container',
-                    'latest-updates-panel__article-link','blog-post__section'
-                ]
-            }
-        ),
-        dict(attrs={
-            'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
-        classes(
-            'share-links-header teaser--wrapped latest-updates-panel__container'
-            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
-        )
+        classes('_podcast-promo _newsletter-promo-container _time-last-updated'),
+        dict(attrs={'data-test-id': 'twib-audio-player'}),
     ]
-    keep_only_tags = [dict(name='article', id=lambda x: not x)]
-    no_stylesheets = True
-    remove_attributes = ['data-reactid', 'width', 'height']
-
-    def get_browser(self, *args, **kwargs):
-        kwargs['user_agent'] = 'TheEconomist-Lamarr-android'
-        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
-        br.addheaders += [
-            ('accept', '*/*'),
-            ('content-type', 'application/json'),
-            ('apollographql-client-name', 'mobile-app-apollo'),
-            ('apollographql-client-version', '3.50.0'),
-            ('x-request-id', str(uuid4())),
-        ]
-        return br
-
-    def economist_return_index(self, ans):
-        if not ans:
-            raise NoArticles(
-                'Could not find any articles, either the '
-                'economist.com server is having trouble and you should '
-                'try later or the website format has changed and the '
-                'recipe needs to be updated.'
-            )
-        return ans
 
     def parse_index(self):
-        query = {
-            'query': 'query EspressoQuery($ref:String!){espresso:canonical(ref:$ref){...EspressoFragment __typename}}fragment EspressoFragment on Content{id type hasPart(size:1 sort:"datePublished:desc"){parts{id type rubric:description hasPart(sort:"publication.context.position:asc,datePublished:desc"){parts{...ArticleFragment __typename}__typename}__typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}',  # noqa: E501
-            'operationName': 'EspressoQuery',
-            'variables': '{"ref":"/content/ai0db6q5mftflg1irq7hiiofp15t7nlv"}',
-        }
-        url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
-        try:
-            raw = self.index_to_soup(url, raw=True)
-        except Exception:
-            raise ValueError('Server is not reachable, try again after some time.')
-        ans = self.economist_parse_index(raw)
-        return self.economist_return_index(ans)
-
-    def economist_parse_index(self, raw):
-        data = json.loads(raw)['data']['espresso']['hasPart']['parts'][0]
-        self.description = data['rubric']
-
-        ans = []
-        for part in safe_dict(data, 'hasPart', 'parts'):
-            title = safe_dict(part, 'title')
-            pt = PersistentTemporaryFile('.html')
-            pt.write(json.dumps(part).encode('utf-8'))
-            pt.close()
-            url = 'file:///' + pt.name
-            ans.append({'title': title, 'url': url})
-        return [('Espresso', ans)]
+        return [
+            (
+                'Espresso',
+                [
+                    {
+                        'title': 'The World in Brief',
+                        'url': 'https://www.economist.com/the-world-in-brief',
+                        'description': 'Catch up quickly on the global stories that matter',
+                    },
+                ],
+            ),
+        ]
 
     def preprocess_html(self, soup):
+        if h1 := soup.find('h1'):
+            if p := h1.find_next_sibling('p'):
+                p['class'] = 'sub'
+        for hr in soup.findAll(attrs={'class': ['_gobbet', '_article']}):
+            nt = new_tag(soup, 'hr')
+            hr.append(nt)
         for img in soup.findAll('img', src=True):
-            img['src'] = img['src'].replace('economist.com/',
-                'economist.com/cdn-cgi/image/width=600,quality=80,format=auto/')
+            img['src'] = re.sub(r'width=\d+', 'width=600', img['src'])
+
         return soup
 
-    def populate_article_metadata(self, article, soup, first):
-        article.url = soup.find('h1')['title']
-
-    def preprocess_raw_html(self, raw, url):
-        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
-        body = '<html><body><article></article></body></html>'
-        root = parse(body)
-        load_article_from_json(raw, root)
-
-        for div in root.xpath('//div[@class="lazy-image"]'):
-            noscript = list(div.iter('noscript'))
-            if noscript and noscript[0].text:
-                img = list(parse(noscript[0].text).iter('img'))
-                if img:
-                    p = noscript[0].getparent()
-                    idx = p.index(noscript[0])
-                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
-                    p.remove(noscript[0])
-        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
-            x.getparent().remove(x)
-        # the economist uses <small> for small caps with a custom font
-        for init in root.xpath('//span[@data-caps="initial"]'):
-            init.set('style', 'font-weight:bold;')
-        for x in root.xpath('//small'):
-            if x.text and len(x) == 0:
-                x.text = x.text.upper()
-                x.tag = 'span'
-                x.set('style', 'font-variant: small-caps')
-        for h2 in root.xpath('//h2'):
-            h2.tag = 'h4'
-        for x in root.xpath('//figcaption'):
-            x.set('style', 'text-align:center; font-size:small;')
-        for x in root.xpath('//cite'):
-            x.tag = 'blockquote'
-            x.set('style', 'color:#404040;')
-        raw = etree.tostring(root, encoding='unicode')
-        return raw
-
-    def eco_find_image_tables(self, soup):
-        for x in soup.findAll('table', align=['right', 'center']):
-            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
-                yield x
-
-    def postprocess_html(self, soup, first):
-        for img in soup.findAll('img', srcset=True):
-            del img['srcset']
-        for table in list(self.eco_find_image_tables(soup)):
-            caption = table.find('font')
-            img = table.find('img')
-            div = new_tag(soup, 'div')
-            div['style'] = 'text-align:left;font-size:70%'
-            ns = NavigableString(self.tag_to_string(caption))
-            div.insert(0, ns)
-            div.insert(1, new_tag(soup, 'br'))
-            del img['width']
-            del img['height']
-            img.extract()
-            div.insert(2, img)
-            table.replaceWith(div)
-        return soup
-
-    def canonicalize_internal_url(self, url, is_link=True):
-        if url.endswith('/print'):
-            url = url.rpartition('/')[0]
-        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = (
+            'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36 Lamarr'
+        )
+        return BasicNewsRecipe.get_browser(self, *args, **kwargs)

From 5e9759125574b1cc9f5793173156c547d1cb8a33 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 18 Apr 2025 17:00:46 +0530
Subject: [PATCH 2/3] ...

---
 recipes/economist_espresso.recipe | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe
index abbc934b32..7ff511ef26 100644
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -44,7 +44,6 @@ class Espresso(BasicNewsRecipe):
 
     remove_tags = [
         classes('_podcast-promo _newsletter-promo-container _time-last-updated'),
-        dict(attrs={'data-test-id': 'twib-audio-player'}),
     ]
 
     def parse_index(self):
@@ -70,7 +69,10 @@ class Espresso(BasicNewsRecipe):
             hr.append(nt)
         for img in soup.findAll('img', src=True):
             img['src'] = re.sub(r'width=\d+', 'width=600', img['src'])
-
+        if aud := soup.find(attrs={'data-test-id': 'twib-audio-player'}):
+            if div := aud.find_next('div'):
+                div.extract()
+            aud.extract()
         return soup
 
     def get_browser(self, *args, **kwargs):

From 35cc158054a6efa54157b21ec2a50e82c6fca58c Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Fri, 18 Apr 2025 17:03:57 +0530
Subject: [PATCH 3/3] ...

---
 recipes/economist_espresso.recipe | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe
index 7ff511ef26..bb0cae2bf0 100644
--- a/recipes/economist_espresso.recipe
+++ b/recipes/economist_espresso.recipe
@@ -36,7 +36,7 @@ class Espresso(BasicNewsRecipe):
 
     extra_css = """
         h1 { text-align:center; }
-        ._main-image, ._description, .sub { text-align:center; font-size:small; }
+        ._main-image, ._description, .sub, .calibre-nuked-tag-figcaption { text-align:center; font-size:small; }
        ._quote-container { font-size:x-large; font-style:italic; color:#202020; }
     """
 
@@ -70,7 +70,7 @@ class Espresso(BasicNewsRecipe):
         for img in soup.findAll('img', src=True):
             img['src'] = re.sub(r'width=\d+', 'width=600', img['src'])
         if aud := soup.find(attrs={'data-test-id': 'twib-audio-player'}):
-            if div := aud.find_next('div'):
+            if div := aud.find_next_sibling('div'):
                 div.extract()
             aud.extract()
         return soup