mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
jsbrowser: Fix handling of html with non lxml safe chars
This commit is contained in:
parent
c4cb0e445e
commit
a94539c32b
@ -145,8 +145,11 @@ def download_resources(browser, resource_cache, output_dir):
|
|||||||
elem.removeFromDocument()
|
elem.removeFromDocument()
|
||||||
|
|
||||||
def save_html(browser, output_dir, postprocess_html, url, recursion_level):
|
def save_html(browser, output_dir, postprocess_html, url, recursion_level):
|
||||||
html = strip_encoding_declarations(browser.html)
|
|
||||||
import html5lib
|
import html5lib
|
||||||
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
|
html = strip_encoding_declarations(browser.html)
|
||||||
|
if isinstance(html, unicode):
|
||||||
|
html = clean_xml_chars(html)
|
||||||
root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
|
root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
|
||||||
root = postprocess_html(root, url, recursion_level)
|
root = postprocess_html(root, url, recursion_level)
|
||||||
if root is None:
|
if root is None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user