From a94539c32b4f2d3a359f733ee0a277af72c8c8ff Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 9 Jul 2013 14:44:46 +0530 Subject: [PATCH] jsbrowser: Fix handling of html with non lxml safe chars --- src/calibre/web/fetch/javascript.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/web/fetch/javascript.py b/src/calibre/web/fetch/javascript.py index 6e9ef86ff1..d7dfcf0a6a 100644 --- a/src/calibre/web/fetch/javascript.py +++ b/src/calibre/web/fetch/javascript.py @@ -145,8 +145,11 @@ def download_resources(browser, resource_cache, output_dir): elem.removeFromDocument() def save_html(browser, output_dir, postprocess_html, url, recursion_level): - html = strip_encoding_declarations(browser.html) import html5lib + from calibre.utils.cleantext import clean_xml_chars + html = strip_encoding_declarations(browser.html) + if isinstance(html, unicode): + html = clean_xml_chars(html) root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot() root = postprocess_html(root, url, recursion_level) if root is None: