From 36fbbb6ae12dc1faf4c926c535426a14e14ebd5e Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 13 Dec 2019 14:48:45 +0530
Subject: [PATCH] Update The Economist

---
 recipes/economist.recipe      | 78 +++++++++++++++++++++++++++++------
 recipes/economist_free.recipe | 78 +++++++++++++++++++++++++++++------
 2 files changed, 132 insertions(+), 24 deletions(-)
diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 7fc026de77..cc2a4c65d8 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -1,19 +1,67 @@
 #!/usr/bin/env  python2
+# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
 
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-economist.com
-'''
 try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
+import json
 from collections import OrderedDict
 
+from html5_parser import parse
+from lxml import etree
+
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+def E(parent, name, text='', **attrs):
+    """Append a new `name` element to `parent`, set its text/attributes and return it."""
+    child = etree.SubElement(parent, name, **attrs)
+    child.text = text
+    return child
+
+
+def process_node(node, html_parent):
+    # Recursively convert one JSON node ('tag' or 'text') into lxml content under html_parent
+    kind = node.get('type')
+    if kind == 'text':
+        data = node.get('data')
+        if data and len(html_parent):  # text after a child element goes in that child's tail
+            last = html_parent[-1]
+            last.tail = (last.tail or '') + data
+        elif data:  # no children yet: the text belongs to the parent itself
+            html_parent.text = (html_parent.text or '') + data
+    elif kind == 'tag':
+        elem = html_parent.makeelement(node['name'])
+        elem.attrib.update(node.get('attribs', {}))
+        html_parent.append(elem)
+        for child in node.get('children', ()):
+            process_node(child, elem)
+
+
+def load_article_from_json(raw, root):
+    # Replace the children of <body> with an <article> built from the JSON at props.pageProps.content
+    data = json.loads(raw)['props']['pageProps']['content']
+    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    article = E(body, 'article')
+    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
+    E(article, 'h1', data['headline'], style='font-size: x-large')
+    E(article, 'div', data['description'], style='font-style: italic')
+    E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em')
+    div = E(article, 'div')
+    try:
+        E(div, 'img', src=data['image']['main']['url']['canonical'])
+    except (KeyError, TypeError, ValueError):
+        # 'image' is null, has no main URL, or the URL is unusable: drop the empty wrapper
+        article.remove(div)
+    text = data['text']
+    for node in text:
+        process_node(node, article)
 
 
 def classes(classes):
@@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
     resolve_internal_links = True
     remove_tags = [
         dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
+        dict(attrs={'aria-label': "Article Teaser"}),
         dict(attrs={
                 'class': [
                     'dblClkTrk', 'ec-article-info', 'share_inline_header',
@@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
         ),
         dict(attrs={
                 'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
-        classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
+        classes(
+            'share-links-header teaser--wrapped latest-updates-panel__container'
+            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
+        )
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
@@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
         return br
 
     def preprocess_raw_html(self, raw, url):
-        import html5lib
-        root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
-        from lxml import etree
+        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
+        root = parse(raw)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')
+        if script:
+            load_article_from_json(script[0].text, root)
         for div in root.xpath('//div[@class="lazy-image"]'):
             noscript = list(div.iter('noscript'))
             if noscript and noscript[0].text:
-                img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
+                img = list(parse(noscript[0].text).iter('img'))
                 if img:
                     p = noscript[0].getparent()
                     idx = p.index(noscript[0])
@@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
 
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
+        #     'url':'file:///t/raw.html'
         # }])]
         raw = self.index_to_soup(self.INDEX, raw=True)
         # with open('/t/raw.html', 'wb') as f:
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index 7fc026de77..cc2a4c65d8 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -1,19 +1,67 @@
 #!/usr/bin/env  python2
+# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
 
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-economist.com
-'''
 try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
+import json
 from collections import OrderedDict
 
+from html5_parser import parse
+from lxml import etree
+
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+def E(parent, name, text='', **attrs):
+    """Append a new `name` element to `parent`, set its text/attributes and return it."""
+    child = etree.SubElement(parent, name, **attrs)
+    child.text = text
+    return child
+
+
+def process_node(node, html_parent):
+    # Recursively convert one JSON node ('tag' or 'text') into lxml content under html_parent
+    kind = node.get('type')
+    if kind == 'text':
+        data = node.get('data')
+        if data and len(html_parent):  # text after a child element goes in that child's tail
+            last = html_parent[-1]
+            last.tail = (last.tail or '') + data
+        elif data:  # no children yet: the text belongs to the parent itself
+            html_parent.text = (html_parent.text or '') + data
+    elif kind == 'tag':
+        elem = html_parent.makeelement(node['name'])
+        elem.attrib.update(node.get('attribs', {}))
+        html_parent.append(elem)
+        for child in node.get('children', ()):
+            process_node(child, elem)
+
+
+def load_article_from_json(raw, root):
+    # Replace the children of <body> with an <article> built from the JSON at props.pageProps.content
+    data = json.loads(raw)['props']['pageProps']['content']
+    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    article = E(body, 'article')
+    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
+    E(article, 'h1', data['headline'], style='font-size: x-large')
+    E(article, 'div', data['description'], style='font-style: italic')
+    E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em')
+    div = E(article, 'div')
+    try:
+        E(div, 'img', src=data['image']['main']['url']['canonical'])
+    except (KeyError, TypeError, ValueError):
+        # 'image' is null, has no main URL, or the URL is unusable: drop the empty wrapper
+        article.remove(div)
+    text = data['text']
+    for node in text:
+        process_node(node, article)
 
 
 def classes(classes):
@@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
     resolve_internal_links = True
     remove_tags = [
         dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
+        dict(attrs={'aria-label': "Article Teaser"}),
         dict(attrs={
                 'class': [
                     'dblClkTrk', 'ec-article-info', 'share_inline_header',
@@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
         ),
         dict(attrs={
                 'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
-        classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
+        classes(
+            'share-links-header teaser--wrapped latest-updates-panel__container'
+            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
+        )
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
@@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
         return br
 
     def preprocess_raw_html(self, raw, url):
-        import html5lib
-        root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
-        from lxml import etree
+        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
+        root = parse(raw)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')
+        if script:
+            load_article_from_json(script[0].text, root)
         for div in root.xpath('//div[@class="lazy-image"]'):
             noscript = list(div.iter('noscript'))
             if noscript and noscript[0].text:
-                img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
+                img = list(parse(noscript[0].text).iter('img'))
                 if img:
                     p = noscript[0].getparent()
                     idx = p.index(noscript[0])
@@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
 
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
+        #     'url':'file:///t/raw.html'
         # }])]
         raw = self.index_to_soup(self.INDEX, raw=True)
         # with open('/t/raw.html', 'wb') as f: