From c82f04c3b10b4edf1cc7d73207e9b4f0d8a06d71 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 10 May 2016 08:28:58 +0530 Subject: [PATCH] Update The Guardian --- recipes/guardian.recipe | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe index 0d68c99f07..d8d18c5764 100644 --- a/recipes/guardian.recipe +++ b/recipes/guardian.recipe @@ -40,15 +40,22 @@ class Guardian(BasicNewsRecipe): ] remove_tags = [ dict(attrs={'class': lambda x:x and '--twitter' in x}), + dict(attrs={'class': lambda x:x and 'submeta' in x.split()}), dict(attrs={'data-component': ['share', 'social']}), dict(attrs={'data-link-name': 'block share'}), dict(attrs={'class': lambda x:x and 'inline-expand-image' in x}), dict(attrs={'class': lambda x:x and 'modern-visible' in x.split()}), + dict(name=['link', 'meta', 'style']), ] remove_tags_after = [ dict(attrs={'class': lambda x: x and 'content__article-body' in x.split()}), ] + def preprocess_raw_html(self, raw, url): + import html5lib + from lxml import html + return html.tostring(html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml'), encoding=unicode) + def preprocess_html(self, soup): for img in soup.findAll('img', srcset=True): img['src'] = img['srcset'].partition(' ')[0]