diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 6f41b95763..50b626fcbf 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + requires_version = (0, 8, 39) + + def preprocess_raw_html(self, raw, url): + try: + from html5lib import parse + root = parse(raw, namespaceHTMLElements=False, + treebuilder='lxml').getroot() + from lxml import etree + for tag in root.xpath( + '//script|//style|//noscript|//meta|//link|//object'): + tag.getparent().remove(tag) + for elem in list(root.iterdescendants(tag=etree.Comment)): + elem.getparent().remove(elem) + ans = etree.tostring(root, encoding=unicode) + ans = re.sub('.*', lambda match : ''), - (r'