mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
PDF Input: Fix conversion failing for PDF documents that contain ASCII control codes in their outlines. Fixes #1527697 [failure to convert book](https://bugs.launchpad.net/calibre/+bug/1527697)
This commit is contained in:
parent
f0962dd696
commit
237e3ff23d
@@ -9,10 +9,12 @@ import errno, os, sys, subprocess, shutil, re
|
|||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from calibre.ebooks import ConversionError, DRMError
|
from calibre.ebooks import ConversionError, DRMError
|
||||||
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
from calibre.constants import (isosx, iswindows, islinux, isbsd,
|
from calibre.constants import (isosx, iswindows, islinux, isbsd,
|
||||||
filesystem_encoding)
|
filesystem_encoding)
|
||||||
from calibre import CurrentDir
|
from calibre import CurrentDir
|
||||||
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
|
|
||||||
PDFTOHTML = 'pdftohtml'
|
PDFTOHTML = 'pdftohtml'
|
||||||
popen = subprocess.Popen
|
popen = subprocess.Popen
|
||||||
@@ -123,6 +125,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
|||||||
|
|
||||||
def parse_outline(raw, output_dir):
|
def parse_outline(raw, output_dir):
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
|
||||||
outline = etree.fromstring(raw).xpath('(//outline)[1]')
|
outline = etree.fromstring(raw).xpath('(//outline)[1]')
|
||||||
if outline:
|
if outline:
|
||||||
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
|
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
|
||||||
|
Loading…
x
Reference in New Issue
Block a user