mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use a more robust parser when parsing the outline generated by pdftohtml
This commit is contained in:
parent
5a7b251025
commit
2326e936ff
@@ -125,8 +125,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
|||||||
|
|
||||||
def parse_outline(raw, output_dir):
|
def parse_outline(raw, output_dir):
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
|
||||||
raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
|
raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
|
||||||
outline = etree.fromstring(raw).xpath('(//outline)[1]')
|
outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
|
||||||
if outline:
|
if outline:
|
||||||
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
|
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
|
||||||
outline = outline[0]
|
outline = outline[0]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user