From 2326e936ff70294ab74ce22908adaab208e47a65 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Dec 2015 18:54:43 +0530 Subject: [PATCH] Use a more robust parser when parsing the outline generated by pdftohtml --- src/calibre/ebooks/pdf/pdftohtml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 96fb28e923..3599c4a477 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -125,8 +125,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): def parse_outline(raw, output_dir): from lxml import etree + from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]) - outline = etree.fromstring(raw).xpath('(//outline)[1]') + outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]') if outline: from calibre.ebooks.oeb.polish.toc import TOC, create_ncx outline = outline[0]