Remove all disallowed XML chars, not just ASCII control codes

This commit is contained in:
Kovid Goyal 2015-12-19 10:55:19 +05:30
parent 237e3ff23d
commit 61064892b0

View File

@@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile
from calibre.constants import (isosx, iswindows, islinux, isbsd,
filesystem_encoding)
from calibre import CurrentDir
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.cleantext import clean_xml_chars
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen
@@ -125,7 +125,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
def parse_outline(raw, output_dir):
from lxml import etree
raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
outline = etree.fromstring(raw).xpath('(//outline)[1]')
if outline:
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx