From 1c6387c51b9339cfb53b36e2bb79dd32ad7305d0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 17 Mar 2017 12:39:36 +0530
Subject: [PATCH] Update The Economist

---
 recipes/economist.recipe      | 14 ++++++++++++--
 recipes/economist_free.recipe | 14 ++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index 84644b2e69..23dbc12fab 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -80,13 +80,14 @@ class Economist(BasicNewsRecipe):
                 ]
             }
         ),
-        classes('share-links-header teaser--wrapped'),
+        classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section'),
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
     preprocess_regexps = [
         (re.compile('.*', re.DOTALL), lambda x: ''),
     ]
+    remove_attributes = ['data-reactid']
     # economist.com has started throttling after about 60% of the total has
     # downloaded with connection reset by peer (104) errors.
     delay = 1
@@ -120,9 +121,18 @@ class Economist(BasicNewsRecipe):
         br.set_handle_gzip(True)
         return br
 
+    def preprocess_raw_html(self, raw, url):
+        soup = self.index_to_soup(raw)
+        for div in soup.findAll(**classes('lazy-image')):
+            noscript = div.find('noscript')
+            img = noscript.find('img')
+            noscript.replaceWith(img)
+        return type(u'')(soup)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        # 'url':'https://www.economist.com/news/americas/21699494-guide-cutting-corners-way-jos'}])]
+        # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
+        # }])]
         raw = self.index_to_soup(self.INDEX, raw=True)
         # with open('/t/raw.html', 'wb') as f:
         # f.write(raw)
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index 84644b2e69..23dbc12fab 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -80,13 +80,14 @@ class Economist(BasicNewsRecipe):
                 ]
             }
         ),
-        classes('share-links-header teaser--wrapped'),
+        classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section'),
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
     preprocess_regexps = [
         (re.compile('.*', re.DOTALL), lambda x: ''),
     ]
+    remove_attributes = ['data-reactid']
     # economist.com has started throttling after about 60% of the total has
     # downloaded with connection reset by peer (104) errors.
     delay = 1
@@ -120,9 +121,18 @@ class Economist(BasicNewsRecipe):
         br.set_handle_gzip(True)
         return br
 
+    def preprocess_raw_html(self, raw, url):
+        soup = self.index_to_soup(raw)
+        for div in soup.findAll(**classes('lazy-image')):
+            noscript = div.find('noscript')
+            img = noscript.find('img')
+            noscript.replaceWith(img)
+        return type(u'')(soup)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        # 'url':'https://www.economist.com/news/americas/21699494-guide-cutting-corners-way-jos'}])]
+        # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
+        # }])]
         raw = self.index_to_soup(self.INDEX, raw=True)
         # with open('/t/raw.html', 'wb') as f:
         # f.write(raw)