From c82f04c3b10b4edf1cc7d73207e9b4f0d8a06d71 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 10 May 2016 08:28:58 +0530
Subject: [PATCH] Update The Guardian

---
 recipes/guardian.recipe | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index 0d68c99f07..d8d18c5764 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -40,15 +40,22 @@ class Guardian(BasicNewsRecipe):
     ]
     remove_tags = [
         dict(attrs={'class': lambda x:x and '--twitter' in x}),
+        dict(attrs={'class': lambda x:x and 'submeta' in x.split()}),  # whole-token match on the class list, not a substring test
         dict(attrs={'data-component': ['share', 'social']}),
         dict(attrs={'data-link-name': 'block share'}),
         dict(attrs={'class': lambda x:x and 'inline-expand-image' in x}),
         dict(attrs={'class': lambda x:x and 'modern-visible' in x.split()}),
+        dict(name=['link', 'meta', 'style']),  # matches by tag name: <link>, <meta> and <style> elements
     ]
     remove_tags_after = [
         dict(attrs={'class': lambda x: x and 'content__article-body' in x.split()}),
     ]
 
+    def preprocess_raw_html(self, raw, url):  # re-parse raw markup with html5lib and return it re-serialised as text; url is unused
+        import html5lib  # imported locally, so the dependency is only loaded when this hook runs
+        from lxml import html  # encoding='unicode' below yields a text string and avoids the Python-2-only `unicode` builtin
+        return html.tostring(html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml'), encoding='unicode')
+
     def preprocess_html(self, soup):
         for img in soup.findAll('img', srcset=True):
             img['src'] = img['srcset'].partition(' ')[0]