From 23e52f2cba0a714ce8a4a4baca9b21ae8250fa2a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 11 Jun 2022 12:34:23 +0530
Subject: [PATCH] Don't fail to download interactive articles from the Economist

They don't work well and look pretty awful, but that is better than nothing
---
 recipes/economist.recipe      | 33 +++++++++++++++++++++++++++++----
 recipes/economist_free.recipe | 33 +++++++++++++++++++++++++++++----
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index bc5fa0d8df..f2af44c38a 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -53,9 +53,16 @@ def safe_dict(data, *names):
     return ans
 
 
+class JSONHasNoContent(ValueError):
+    pass
+
+
 def load_article_from_json(raw, root):
-    data = json.loads(raw)['props']['pageProps']['content']
-    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
+    # open('/t/raw.json', 'w').write(raw)
+    try:
+        data = json.loads(raw)['props']['pageProps']['content']
+    except KeyError as e:
+        raise JSONHasNoContent(e)
     if isinstance(data, list):
         data = data[0]
     body = root.xpath('//body')[0]
@@ -77,6 +84,20 @@ def load_article_from_json(raw, root):
         process_node(node, article)
 
 
+def cleanup_html_article(root):
+    main = root.xpath('//main')[0]
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    body.append(main)
+    main.set('id', '')
+    main.tag = 'article'
+    for x in root.xpath('//*[@style]'):
+        x.set('style', '')
+    for x in root.xpath('//button'):
+        x.getparent().remove(x)
+
+
 def classes(classes):
     q = frozenset(classes.split(' '))
     return dict(attrs={
@@ -104,6 +125,7 @@ class Economist(BasicNewsRecipe):
 
     title = 'The Economist'
     language = 'en'
+    encoding = 'utf-8'
 
     __author__ = "Kovid Goyal"
     description = (
@@ -208,7 +230,10 @@ class Economist(BasicNewsRecipe):
         root = parse(raw)
         script = root.xpath('//script[@id="__NEXT_DATA__"]')
         if script:
-            load_article_from_json(script[0].text, root)
+            try:
+                load_article_from_json(script[0].text, root)
+            except JSONHasNoContent:
+                cleanup_html_article(root)
         for div in root.xpath('//div[@class="lazy-image"]'):
             noscript = list(div.iter('noscript'))
             if noscript and noscript[0].text:
@@ -248,7 +273,7 @@ class Economist(BasicNewsRecipe):
 
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        #     'url':'https://www.economist.com/economic-and-financial-indicators/2022/04/23/economic-data-commodities-and-markets'
+        #     'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress'
         # }])]
         if edition_date:
             url = 'https://www.economist.com/weeklyedition/' + edition_date
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index bc5fa0d8df..f2af44c38a 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -53,9 +53,16 @@ def safe_dict(data, *names):
     return ans
 
 
+class JSONHasNoContent(ValueError):
+    pass
+
+
 def load_article_from_json(raw, root):
-    data = json.loads(raw)['props']['pageProps']['content']
-    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
+    # open('/t/raw.json', 'w').write(raw)
+    try:
+        data = json.loads(raw)['props']['pageProps']['content']
+    except KeyError as e:
+        raise JSONHasNoContent(e)
     if isinstance(data, list):
         data = data[0]
     body = root.xpath('//body')[0]
@@ -77,6 +84,20 @@ def load_article_from_json(raw, root):
         process_node(node, article)
 
 
+def cleanup_html_article(root):
+    main = root.xpath('//main')[0]
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    body.append(main)
+    main.set('id', '')
+    main.tag = 'article'
+    for x in root.xpath('//*[@style]'):
+        x.set('style', '')
+    for x in root.xpath('//button'):
+        x.getparent().remove(x)
+
+
 def classes(classes):
     q = frozenset(classes.split(' '))
     return dict(attrs={
@@ -104,6 +125,7 @@ class Economist(BasicNewsRecipe):
 
     title = 'The Economist'
     language = 'en'
+    encoding = 'utf-8'
 
     __author__ = "Kovid Goyal"
     description = (
@@ -208,7 +230,10 @@ class Economist(BasicNewsRecipe):
         root = parse(raw)
         script = root.xpath('//script[@id="__NEXT_DATA__"]')
         if script:
-            load_article_from_json(script[0].text, root)
+            try:
+                load_article_from_json(script[0].text, root)
+            except JSONHasNoContent:
+                cleanup_html_article(root)
         for div in root.xpath('//div[@class="lazy-image"]'):
             noscript = list(div.iter('noscript'))
             if noscript and noscript[0].text:
@@ -248,7 +273,7 @@ class Economist(BasicNewsRecipe):
 
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        #     'url':'https://www.economist.com/economic-and-financial-indicators/2022/04/23/economic-data-commodities-and-markets'
+        #     'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress'
         # }])]
         if edition_date:
             url = 'https://www.economist.com/weeklyedition/' + edition_date