From 237e3ff23d39e6b474f96bbf9b5909e49cc7422b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Dec 2015 10:50:30 +0530 Subject: [PATCH] PDF Input: Fix conversion of PDF documents that contain ASCII control codes in their outlines not working. Fixes #1527697 [failure to convert book](https://bugs.launchpad.net/calibre/+bug/1527697) --- src/calibre/ebooks/pdf/pdftohtml.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 83fece70a3..7aa9f4bcf2 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -9,10 +9,12 @@ import errno, os, sys, subprocess, shutil, re from functools import partial from calibre.ebooks import ConversionError, DRMError +from calibre.ebooks.chardet import xml_to_unicode from calibre.ptempfile import PersistentTemporaryFile from calibre.constants import (isosx, iswindows, islinux, isbsd, filesystem_encoding) from calibre import CurrentDir +from calibre.utils.cleantext import clean_ascii_chars PDFTOHTML = 'pdftohtml' popen = subprocess.Popen @@ -123,6 +125,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): def parse_outline(raw, output_dir): from lxml import etree + raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]) outline = etree.fromstring(raw).xpath('(//outline)[1]') if outline: from calibre.ebooks.oeb.polish.toc import TOC, create_ncx