diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py
index 7d5685a096..4916ecc6c3 100644
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -5,15 +5,13 @@ __copyright__ = '2008, Kovid Goyal , ' \
'2009, John Schember '
__docformat__ = 'restructuredtext en'
-import errno
-import os
-import sys
-import subprocess
+import errno, os, sys, subprocess, shutil
from functools import partial
from calibre.ebooks import ConversionError, DRMError
from calibre.ptempfile import PersistentTemporaryFile
-from calibre.constants import isosx, iswindows, islinux, isbsd
+from calibre.constants import (isosx, iswindows, islinux, isbsd,
+ filesystem_encoding)
from calibre import CurrentDir
PDFTOHTML = 'pdftohtml'
@@ -30,31 +28,45 @@ def pdftohtml(output_dir, pdf_path, no_images):
'''
Convert the pdf into html using the pdftohtml app.
This will write the html as index.html into output_dir.
- It will also wirte all extracted images to the output_dir
+ It will also write all extracted images to the output_dir
'''
- if isinstance(pdf_path, unicode):
- pdf_path = pdf_path.encode(sys.getfilesystemencoding())
- if not os.access(pdf_path, os.R_OK):
- raise ConversionError('Cannot read from ' + pdf_path)
+ pdfsrc = os.path.join(output_dir, u'src.pdf')
+ index = os.path.join(output_dir, u'index.html')
+
+ with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
+ shutil.copyfileobj(src, dest)
with CurrentDir(output_dir):
- index = os.path.join(os.getcwdu(), 'index.html')
- # This is neccessary as pdftohtml doesn't always (linux) respect absolute paths
- pdf_path = os.path.abspath(pdf_path)
- cmd = [PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q', pdf_path, os.path.basename(index)]
- if isbsd:
- cmd.remove('-nodrm')
- if no_images:
- cmd.append('-i')
+ # This is necessary as pdftohtml doesn't always (linux) respect
+ # absolute paths. Also, it allows us to safely pass only bytestring
+ # arguments to subprocess on windows
- logf = PersistentTemporaryFile('pdftohtml_log')
+ # subprocess in python 2 cannot handle unicode arguments on windows
+ # that cannot be encoded with mbcs. Ensure all args are
+ # bytestrings.
+ def a(x):
+ return os.path.basename(x).encode('ascii')
+
+ exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
+ unicode) else PDFTOHTML
+
+ cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
+ b'-nodrm', b'-q', a(pdfsrc), a(index)]
+
+ if isbsd:
+ cmd.remove(b'-nodrm')
+ if no_images:
+ cmd.append(b'-i')
+
+ logf = PersistentTemporaryFile(u'pdftohtml_log')
try:
p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
stdin=subprocess.PIPE)
except OSError as err:
if err.errno == errno.ENOENT:
- raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
+ raise ConversionError(
+ _('Could not find pdftohtml, check it is in your PATH'))
else:
raise
@@ -70,6 +82,10 @@ def pdftohtml(output_dir, pdf_path, no_images):
logf.flush()
logf.close()
out = open(logf.name, 'rb').read().strip()
+ try:
+ os.remove(pdfsrc)
+ except:
+ pass
if ret != 0:
raise ConversionError(out)
if out:
@@ -84,3 +100,4 @@ def pdftohtml(output_dir, pdf_path, no_images):
i.seek(0)
i.truncate()
i.write(raw)
+