Use a more robust parser when parsing the outline generated by pdftohtml

2025-08-30 23:00:21 -04:00 · 2015-12-19 18:54:43 +05:30 · 2015-12-19 18:54:43 +05:30 · 2326e936ff
commit 2326e936ff
parent 5a7b251025
1 changed file with 2 additions and 1 deletion
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -125,8 +125,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):

 def parse_outline(raw, output_dir):
    from lxml import etree
+    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
-    outline = etree.fromstring(raw).xpath('(//outline)[1]')
+    outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]