Merge branch 'master' of github.com:kovidgoyal/calibre

This commit is contained in:
Kovid Goyal 2019-12-13 18:09:42 +05:30
commit 15e247f88a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 132 additions and 24 deletions

View File

@ -1,19 +1,67 @@
#!/usr/bin/env python2
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
try:
from http.cookiejar import Cookie
except ImportError:
from cookielib import Cookie
import json
from collections import OrderedDict
from html5_parser import parse
from lxml import etree
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.web.feeds.news import BasicNewsRecipe
def E(parent, name, text='', **attrs):
    """Create a child element called *name* under *parent*.

    The new element's text is set to *text*, any extra keyword
    arguments become its attributes, and the element is appended to
    *parent* before being returned.
    """
    child = parent.makeelement(name, **attrs)
    child.text = text
    parent.append(child)
    return child
def process_node(node, html_parent):
    """Recursively convert one JSON DOM node into child elements of *html_parent*.

    A ``'tag'`` node becomes a new child element carrying the node's
    attributes, with its children processed recursively.  A ``'text'``
    node is merged into the parent's text (or the tail of the parent's
    last child, if any).  Any other node type is ignored.
    """
    kind = node.get('type')
    if kind == 'tag':
        element = html_parent.makeelement(node['name'])
        element.attrib.update(node.get('attribs', {}))
        html_parent.append(element)
        for child in node.get('children', ()):
            process_node(child, element)
        return
    if kind != 'text':
        return
    data = node.get('data')
    if not data:
        return
    if len(html_parent):
        last = html_parent[-1]
        last.tail = (last.tail or '') + data
    else:
        html_parent.text = (html_parent.text or '') + data
def load_article_from_json(raw, root):
    """Rebuild the <body> of *root* from the page's JSON payload.

    *raw* is the JSON text of the page's ``__NEXT_DATA__`` script.  The
    original body markup is discarded and replaced with a new <article>
    element containing the subheadline, headline, description, date
    line, the main image (when present) and the article text nodes.
    """
    content = json.loads(raw)['props']['pageProps']['content']
    body = root.xpath('//body')[0]
    # Discard the original page markup; everything is rebuilt from JSON.
    for existing in tuple(body):
        body.remove(existing)
    article = E(body, 'article')
    E(article, 'h4', content['subheadline'], style='color: red; margin: 0')
    E(article, 'h1', content['headline'], style='font-size: x-large')
    E(article, 'div', content['description'], style='font-style: italic')
    dateline = (content['datePublishedString'] or '') + ' | ' + (content['dateline'] or '')
    E(article, 'div', dateline, style='color: gray; margin: 1em')
    if 'main' in content['image']:
        img_div = E(article, 'div')
        try:
            E(img_div, 'img', src=content['image']['main']['url']['canonical'])
        except Exception:
            pass  # best effort: the image metadata may be incomplete
    for node in content['text']:
        process_node(node, article)
def classes(classes):
@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
resolve_internal_links = True
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
dict(attrs={'aria-label': "Article Teaser"}),
dict(attrs={
'class': [
'dblClkTrk', 'ec-article-info', 'share_inline_header',
@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
),
dict(attrs={
'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
classes(
'share-links-header teaser--wrapped latest-updates-panel__container'
' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
)
]
keep_only_tags = [dict(name='article', id=lambda x: not x)]
no_stylesheets = True
@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
return br
def preprocess_raw_html(self, raw, url):
import html5lib
root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
from lxml import etree
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
root = parse(raw)
script = root.xpath('//script[@id="__NEXT_DATA__"]')
if script:
load_article_from_json(script[0].text, root)
for div in root.xpath('//div[@class="lazy-image"]'):
noscript = list(div.iter('noscript'))
if noscript and noscript[0].text:
img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
img = list(parse(noscript[0].text).iter('img'))
if img:
p = noscript[0].getparent()
idx = p.index(noscript[0])
@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
def parse_index(self):
# return [('Articles', [{'title':'test',
# 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
# 'url':'file:///t/raw.html'
# }])]
raw = self.index_to_soup(self.INDEX, raw=True)
# with open('/t/raw.html', 'wb') as f:

View File

@ -1,19 +1,67 @@
#!/usr/bin/env python2
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
try:
from http.cookiejar import Cookie
except ImportError:
from cookielib import Cookie
import json
from collections import OrderedDict
from html5_parser import parse
from lxml import etree
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.web.feeds.news import BasicNewsRecipe
def E(parent, name, text='', **attrs):
    """Create a child element called *name* under *parent*.

    The new element's text is set to *text*, any extra keyword
    arguments become its attributes, and the element is appended to
    *parent* before being returned.
    """
    child = parent.makeelement(name, **attrs)
    child.text = text
    parent.append(child)
    return child
def process_node(node, html_parent):
    """Recursively convert one JSON DOM node into child elements of *html_parent*.

    A ``'tag'`` node becomes a new child element carrying the node's
    attributes, with its children processed recursively.  A ``'text'``
    node is merged into the parent's text (or the tail of the parent's
    last child, if any).  Any other node type is ignored.
    """
    kind = node.get('type')
    if kind == 'tag':
        element = html_parent.makeelement(node['name'])
        element.attrib.update(node.get('attribs', {}))
        html_parent.append(element)
        for child in node.get('children', ()):
            process_node(child, element)
        return
    if kind != 'text':
        return
    data = node.get('data')
    if not data:
        return
    if len(html_parent):
        last = html_parent[-1]
        last.tail = (last.tail or '') + data
    else:
        html_parent.text = (html_parent.text or '') + data
def load_article_from_json(raw, root):
    """Rebuild the <body> of *root* from the page's JSON payload.

    *raw* is the JSON text of the page's ``__NEXT_DATA__`` script.  The
    original body markup is discarded and replaced with a new <article>
    element containing the subheadline, headline, description, date
    line, the main image (when present) and the article text nodes.
    """
    content = json.loads(raw)['props']['pageProps']['content']
    body = root.xpath('//body')[0]
    # Discard the original page markup; everything is rebuilt from JSON.
    for existing in tuple(body):
        body.remove(existing)
    article = E(body, 'article')
    E(article, 'h4', content['subheadline'], style='color: red; margin: 0')
    E(article, 'h1', content['headline'], style='font-size: x-large')
    E(article, 'div', content['description'], style='font-style: italic')
    dateline = (content['datePublishedString'] or '') + ' | ' + (content['dateline'] or '')
    E(article, 'div', dateline, style='color: gray; margin: 1em')
    if 'main' in content['image']:
        img_div = E(article, 'div')
        try:
            E(img_div, 'img', src=content['image']['main']['url']['canonical'])
        except Exception:
            pass  # best effort: the image metadata may be incomplete
    for node in content['text']:
        process_node(node, article)
def classes(classes):
@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
resolve_internal_links = True
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
dict(attrs={'aria-label': "Article Teaser"}),
dict(attrs={
'class': [
'dblClkTrk', 'ec-article-info', 'share_inline_header',
@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
),
dict(attrs={
'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
classes(
'share-links-header teaser--wrapped latest-updates-panel__container'
' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
)
]
keep_only_tags = [dict(name='article', id=lambda x: not x)]
no_stylesheets = True
@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
return br
def preprocess_raw_html(self, raw, url):
import html5lib
root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
from lxml import etree
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
root = parse(raw)
script = root.xpath('//script[@id="__NEXT_DATA__"]')
if script:
load_article_from_json(script[0].text, root)
for div in root.xpath('//div[@class="lazy-image"]'):
noscript = list(div.iter('noscript'))
if noscript and noscript[0].text:
img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
img = list(parse(noscript[0].text).iter('img'))
if img:
p = noscript[0].getparent()
idx = p.index(noscript[0])
@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
def parse_index(self):
# return [('Articles', [{'title':'test',
# 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
# 'url':'file:///t/raw.html'
# }])]
raw = self.index_to_soup(self.INDEX, raw=True)
# with open('/t/raw.html', 'wb') as f: