jsbrowser: Fix handling of HTML containing characters that are not lxml-safe

2025-11-18 20:43:04 -05:00 · 2013-07-09 14:44:46 +05:30 · 2013-07-09 14:44:46 +05:30 · a94539c32b
commit a94539c32b
parent c4cb0e445e
1 changed file with 4 additions and 1 deletion
--- a/src/calibre/web/fetch/javascript.py
+++ b/src/calibre/web/fetch/javascript.py
@@ -145,8 +145,11 @@ def download_resources(browser, resource_cache, output_dir):
            elem.removeFromDocument()

 def save_html(browser, output_dir, postprocess_html, url, recursion_level):
-    html = strip_encoding_declarations(browser.html)
    import html5lib
+    from calibre.utils.cleantext import clean_xml_chars
+    html = strip_encoding_declarations(browser.html)
+    if isinstance(html, unicode):
+        html = clean_xml_chars(html)
    root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
    root = postprocess_html(root, url, recursion_level)
    if root is None: