From 61064892b0576aa8bf782708830bbe6e9b7b3685 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Dec 2015 10:55:19 +0530 Subject: [PATCH] Remove all disallowed XML chars, not just ascii control codes --- src/calibre/ebooks/pdf/pdftohtml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 7aa9f4bcf2..96fb28e923 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -14,7 +14,7 @@ from calibre.ptempfile import PersistentTemporaryFile from calibre.constants import (isosx, iswindows, islinux, isbsd, filesystem_encoding) from calibre import CurrentDir -from calibre.utils.cleantext import clean_ascii_chars +from calibre.utils.cleantext import clean_xml_chars PDFTOHTML = 'pdftohtml' popen = subprocess.Popen @@ -125,7 +125,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): def parse_outline(raw, output_dir): from lxml import etree - raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]) + raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]) outline = etree.fromstring(raw).xpath('(//outline)[1]') if outline: from calibre.ebooks.oeb.polish.toc import TOC, create_ncx