def E(parent, name, text='', **attrs):
    """Create a child element, append it to ``parent`` and return it.

    :param parent: element that supplies ``makeelement()`` and receives the
        new child via ``append()``.
    :param name: tag name of the new element.
    :param text: text content assigned to the new element.
    :param attrs: attributes of the new element, given as keyword arguments.
    :return: the newly appended element.
    """
    # The attributes are handed over as one mapping instead of being expanded
    # into keywords, so every entry is treated as a plain attribute.
    ans = parent.makeelement(name, attrs)
    ans.text = text
    parent.append(ans)
    return ans


def process_node(node, html_parent):
    """Recursively convert one JSON DOM node into content of ``html_parent``.

    ``node`` is a dict with a ``type`` key:

    * ``tag``: a new child element named ``node['name']`` is appended, with the
      attributes from ``attribs``; every entry of ``children`` is processed
      recursively into it.
    * ``text``: ``node['data']`` is appended as text, in document order.

    Nodes of any other type are ignored.

    :param node: dict describing the node.
    :param html_parent: element that receives the converted content.
    """
    ntype = node.get('type')
    if ntype == 'tag':
        # ``attribs`` and ``children`` may be absent *or* JSON null; both are
        # treated as empty. Attributes with a null value are dropped.
        attribs = {
            k: v for k, v in (node.get('attribs') or {}).items()
            if v is not None
        }
        c = html_parent.makeelement(node['name'], attribs)
        html_parent.append(c)
        for nc in node.get('children') or ():
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            if len(html_parent):
                # Text that follows child elements belongs to the tail of the
                # last child, not to the parent's own text.
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text


def _main_image_url(images):
    """Return ``images['main']['url']['canonical']`` or None if unavailable."""
    try:
        return images['main']['url']['canonical'] or None
    except (KeyError, IndexError, TypeError):
        # Missing keys, or a level that is null / not a mapping.
        return None


def load_article_from_json(raw, root):
    """Replace the contents of ``<body>`` with an article built from JSON.

    The article data is read from ``props.pageProps.content`` of the decoded
    JSON. All existing children of the first ``<body>`` found via
    ``root.xpath('//body')`` are removed and a single ``<article>`` is
    appended containing, in order: sub-headline, headline, description, the
    date line, the main image and the converted article text. Parts whose
    data is missing or empty are left out instead of aborting the conversion
    or producing empty elements.

    :param raw: JSON text to decode.
    :param root: root element of the parsed page; must provide ``xpath()``.
    :raises ValueError: if ``raw`` is not valid JSON.
    :raises KeyError: if the ``props.pageProps.content`` path is absent.
    """
    data = json.loads(raw)['props']['pageProps']['content']
    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
    body = root.xpath('//body')[0]
    # Iterate over a snapshot because children are removed while looping.
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')

    header_parts = (
        ('h4', 'subheadline', 'color: red; margin: 0'),
        ('h1', 'headline', 'font-size: x-large'),
        ('div', 'description', 'font-style: italic'),
    )
    for tag, key, style in header_parts:
        value = data.get(key)
        if value:
            E(article, tag, value, style=style)

    # Only put the separator between parts that actually exist.
    date_parts = [
        part for part in (data.get('datePublishedString'), data.get('dateline'))
        if part
    ]
    if date_parts:
        E(article, 'div', ' | '.join(date_parts), style='color: gray; margin: 1em')

    # Resolve the URL first so that no empty wrapper <div> is left behind when
    # the image information is incomplete.
    src = _main_image_url(data.get('image'))
    if src:
        E(E(article, 'div'), 'img', src=src)

    for node in data.get('text') or ():
        process_node(node, article)
def E(parent, name, text='', **attrs):
    """Create a child element, append it to ``parent`` and return it.

    :param parent: element that supplies ``makeelement()`` and receives the
        new child via ``append()``.
    :param name: tag name of the new element.
    :param text: text content assigned to the new element.
    :param attrs: attributes of the new element, given as keyword arguments.
    :return: the newly appended element.
    """
    # The attributes are handed over as one mapping instead of being expanded
    # into keywords, so every entry is treated as a plain attribute.
    ans = parent.makeelement(name, attrs)
    ans.text = text
    parent.append(ans)
    return ans


def process_node(node, html_parent):
    """Recursively convert one JSON DOM node into content of ``html_parent``.

    ``node`` is a dict with a ``type`` key:

    * ``tag``: a new child element named ``node['name']`` is appended, with the
      attributes from ``attribs``; every entry of ``children`` is processed
      recursively into it.
    * ``text``: ``node['data']`` is appended as text, in document order.

    Nodes of any other type are ignored.

    :param node: dict describing the node.
    :param html_parent: element that receives the converted content.
    """
    ntype = node.get('type')
    if ntype == 'tag':
        # ``attribs`` and ``children`` may be absent *or* JSON null; both are
        # treated as empty. Attributes with a null value are dropped.
        attribs = {
            k: v for k, v in (node.get('attribs') or {}).items()
            if v is not None
        }
        c = html_parent.makeelement(node['name'], attribs)
        html_parent.append(c)
        for nc in node.get('children') or ():
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            if len(html_parent):
                # Text that follows child elements belongs to the tail of the
                # last child, not to the parent's own text.
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text


def _main_image_url(images):
    """Return ``images['main']['url']['canonical']`` or None if unavailable."""
    try:
        return images['main']['url']['canonical'] or None
    except (KeyError, IndexError, TypeError):
        # Missing keys, or a level that is null / not a mapping.
        return None


def load_article_from_json(raw, root):
    """Replace the contents of ``<body>`` with an article built from JSON.

    The article data is read from ``props.pageProps.content`` of the decoded
    JSON. All existing children of the first ``<body>`` found via
    ``root.xpath('//body')`` are removed and a single ``<article>`` is
    appended containing, in order: sub-headline, headline, description, the
    date line, the main image and the converted article text. Parts whose
    data is missing or empty are left out instead of aborting the conversion
    or producing empty elements.

    :param raw: JSON text to decode.
    :param root: root element of the parsed page; must provide ``xpath()``.
    :raises ValueError: if ``raw`` is not valid JSON.
    :raises KeyError: if the ``props.pageProps.content`` path is absent.
    """
    data = json.loads(raw)['props']['pageProps']['content']
    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
    body = root.xpath('//body')[0]
    # Iterate over a snapshot because children are removed while looping.
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')

    header_parts = (
        ('h4', 'subheadline', 'color: red; margin: 0'),
        ('h1', 'headline', 'font-size: x-large'),
        ('div', 'description', 'font-style: italic'),
    )
    for tag, key, style in header_parts:
        value = data.get(key)
        if value:
            E(article, tag, value, style=style)

    # Only put the separator between parts that actually exist.
    date_parts = [
        part for part in (data.get('datePublishedString'), data.get('dateline'))
        if part
    ]
    if date_parts:
        E(article, 'div', ' | '.join(date_parts), style='color: gray; margin: 1em')

    # Resolve the URL first so that no empty wrapper <div> is left behind when
    # the image information is incomplete.
    src = _main_image_url(data.get('image'))
    if src:
        E(E(article, 'div'), 'img', src=src)

    for node in data.get('text') or ():
        process_node(node, article)
+ # open('/t/raw.html', 'wb').write(raw.encode('utf-8')) + root = parse(raw) + script = root.xpath('//script[@id="__NEXT_DATA__"]') + if script: + load_article_from_json(script[0].text, root) for div in root.xpath('//div[@class="lazy-image"]'): noscript = list(div.iter('noscript')) if noscript and noscript[0].text: - img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img')) + img = list(parse(noscript[0].text).iter('img')) if img: p = noscript[0].getparent() idx = p.index(noscript[0]) @@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe): def parse_index(self): # return [('Articles', [{'title':'test', - # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud' + # 'url':'file:///t/raw.html' # }])] raw = self.index_to_soup(self.INDEX, raw=True) # with open('/t/raw.html', 'wb') as f: