mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00

commit 15e247f88a
Merge branch 'master' of github.com:kovidgoyal/calibre
@@ -1,19 +1,67 @@
 #!/usr/bin/env python2
+# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
 
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-economist.com
-'''
 try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
+import json
 from collections import OrderedDict
 
+from html5_parser import parse
+from lxml import etree
+
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
+def E(parent, name, text='', **attrs):
+    ans = parent.makeelement(name, **attrs)
+    ans.text = text
+    parent.append(ans)
+    return ans
+
+
+def process_node(node, html_parent):
+    ntype = node.get('type')
+    if ntype == 'tag':
+        c = html_parent.makeelement(node['name'])
+        c.attrib.update(node.get('attribs', {}))
+        html_parent.append(c)
+        for nc in node.get('children', ()):
+            process_node(nc, c)
+    elif ntype == 'text':
+        text = node.get('data')
+        if text:
+            if len(html_parent):
+                t = html_parent[-1]
+                t.tail = (t.tail or '') + text
+            else:
+                html_parent.text = (html_parent.text or '') + text
+
+
+def load_article_from_json(raw, root):
+    data = json.loads(raw)['props']['pageProps']['content']
+    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    article = E(body, 'article')
+    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
+    E(article, 'h1', data['headline'], style='font-size: x-large')
+    E(article, 'div', data['description'], style='font-style: italic')
+    E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em')
+    images = data['image']
+    if 'main' in images:
+        div = E(article, 'div')
+        try:
+            E(div, 'img', src=images['main']['url']['canonical'])
+        except Exception:
+            pass
+    text = data['text']
+    for node in text:
+        process_node(node, article)
+
+
 def classes(classes):
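Note: the new process_node() walks the Economist's JSON serialization of the article DOM, where each node is a dict with 'type', 'name', 'attribs', 'children' and 'data' keys. A minimal sketch of how it rebuilds markup; the sample node below is invented for illustration, not taken from a real article payload:

    from lxml import etree

    body = etree.fromstring('<body/>')
    sample = {
        'type': 'tag', 'name': 'p', 'attribs': {'class': 'lead'},
        'children': [{'type': 'text', 'data': 'Hello, world'}],
    }
    process_node(sample, body)
    print(etree.tostring(body))  # b'<body><p class="lead">Hello, world</p></body>'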
@@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
     resolve_internal_links = True
     remove_tags = [
         dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
+        dict(attrs={'aria-label': "Article Teaser"}),
         dict(attrs={
             'class': [
                 'dblClkTrk', 'ec-article-info', 'share_inline_header',
@@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
         ),
         dict(attrs={
             'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
-        classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
+        classes(
+            'share-links-header teaser--wrapped latest-updates-panel__container'
+            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
+        )
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
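Note: the reflowed classes(...) call relies on Python's implicit concatenation of adjacent string literals, so the two fragments still form one space-separated class list:

    s = ('share-links-header teaser--wrapped'
         ' newsletter-form')
    assert s == 'share-links-header teaser--wrapped newsletter-form'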
@@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
         return br
 
     def preprocess_raw_html(self, raw, url):
-        import html5lib
-        root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
-        from lxml import etree
+        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
+        root = parse(raw)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')
+        if script:
+            load_article_from_json(script[0].text, root)
         for div in root.xpath('//div[@class="lazy-image"]'):
             noscript = list(div.iter('noscript'))
             if noscript and noscript[0].text:
-                img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
+                img = list(parse(noscript[0].text).iter('img'))
                 if img:
                     p = noscript[0].getparent()
                     idx = p.index(noscript[0])
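Note: html5_parser.parse() returns an lxml root element directly, replacing the html5lib treebuilder incantation, and the rewritten preprocess_raw_html reads article data out of the Next.js __NEXT_DATA__ script tag when the page carries one. A rough sketch of that extraction step, using an invented page snippet in place of a real Economist page:

    from html5_parser import parse

    raw = '<html><body><script id="__NEXT_DATA__">{"props": {}}</script></body></html>'
    root = parse(raw)
    script = root.xpath('//script[@id="__NEXT_DATA__"]')
    if script:
        print(script[0].text)  # the JSON payload handed to load_article_from_json()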
@@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
 
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
+        # 'url':'file:///t/raw.html'
         # }])]
         raw = self.index_to_soup(self.INDEX, raw=True)
         # with open('/t/raw.html', 'wb') as f:
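For reference, a minimal payload in the shape load_article_from_json() expects; the field names come from the code above, the values are invented:

    import json

    sample = {'props': {'pageProps': {'content': {
        'headline': 'An example headline',
        'subheadline': 'Example',
        'description': 'A one-line description',
        'datePublishedString': 'Jan 1st 2021',
        'dateline': 'LONDON',
        'image': {},
        'text': [{'type': 'text', 'data': 'Body text.'}],
    }}}}
    # load_article_from_json(json.dumps(sample), root) rebuilds the
    # article inside root's <body> from this structure.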