From a94539c32b4f2d3a359f733ee0a277af72c8c8ff Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 9 Jul 2013 14:44:46 +0530
Subject: [PATCH] jsbrowser: Fix handling of HTML containing characters that are not lxml-safe

---
 src/calibre/web/fetch/javascript.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/web/fetch/javascript.py b/src/calibre/web/fetch/javascript.py
index 6e9ef86ff1..d7dfcf0a6a 100644
--- a/src/calibre/web/fetch/javascript.py
+++ b/src/calibre/web/fetch/javascript.py
@@ -145,8 +145,11 @@ def download_resources(browser, resource_cache, output_dir):
             elem.removeFromDocument()
 
 def save_html(browser, output_dir, postprocess_html, url, recursion_level):
-    html = strip_encoding_declarations(browser.html)
     import html5lib
+    from calibre.utils.cleantext import clean_xml_chars
+    html = strip_encoding_declarations(browser.html)
+    if isinstance(html, unicode):
+        html = clean_xml_chars(html)
     root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
     root = postprocess_html(root, url, recursion_level)
     if root is None: