Fix pdftohtml on Windows with unicode paths

This commit is contained in:
Kovid Goyal 2012-02-22 18:56:58 +05:30
parent fc0bbaf796
commit 3ce881ce34

View File

@ -5,15 +5,13 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
'2009, John Schember <john@nachtimwald.com>' '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import errno import errno, os, sys, subprocess, shutil
import os
import sys
import subprocess
from functools import partial from functools import partial
from calibre.ebooks import ConversionError, DRMError from calibre.ebooks import ConversionError, DRMError
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.constants import isosx, iswindows, islinux, isbsd from calibre.constants import (isosx, iswindows, islinux, isbsd,
filesystem_encoding)
from calibre import CurrentDir from calibre import CurrentDir
PDFTOHTML = 'pdftohtml' PDFTOHTML = 'pdftohtml'
@ -30,31 +28,45 @@ def pdftohtml(output_dir, pdf_path, no_images):
''' '''
Convert the pdf into html using the pdftohtml app. Convert the pdf into html using the pdftohtml app.
This will write the html as index.html into output_dir. This will write the html as index.html into output_dir.
It will also wirte all extracted images to the output_dir It will also write all extracted images to the output_dir
''' '''
if isinstance(pdf_path, unicode): pdfsrc = os.path.join(output_dir, u'src.pdf')
pdf_path = pdf_path.encode(sys.getfilesystemencoding()) index = os.path.join(output_dir, u'index.html')
if not os.access(pdf_path, os.R_OK):
raise ConversionError('Cannot read from ' + pdf_path) with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
shutil.copyfileobj(src, dest)
with CurrentDir(output_dir): with CurrentDir(output_dir):
index = os.path.join(os.getcwdu(), 'index.html') # This is necessary as pdftohtml doesn't always (linux) respect
# This is neccessary as pdftohtml doesn't always (linux) respect absolute paths # absolute paths. Also, it allows us to safely pass only bytestring
pdf_path = os.path.abspath(pdf_path) # arguments to subprocess on windows
cmd = [PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q', pdf_path, os.path.basename(index)]
if isbsd:
cmd.remove('-nodrm')
if no_images:
cmd.append('-i')
logf = PersistentTemporaryFile('pdftohtml_log') # subprocess in python 2 cannot handle unicode arguments on windows
# that cannot be encoded with mbcs. Ensure all args are
# bytestrings.
def a(x):
return os.path.basename(x).encode('ascii')
exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
unicode) else PDFTOHTML
cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
b'-nodrm', b'-q', a(pdfsrc), a(index)]
if isbsd:
cmd.remove(b'-nodrm')
if no_images:
cmd.append(b'-i')
logf = PersistentTemporaryFile(u'pdftohtml_log')
try: try:
p = popen(cmd, stderr=logf._fd, stdout=logf._fd, p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
stdin=subprocess.PIPE) stdin=subprocess.PIPE)
except OSError as err: except OSError as err:
if err.errno == errno.ENOENT: if err.errno == errno.ENOENT:
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH')) raise ConversionError(
_('Could not find pdftohtml, check it is in your PATH'))
else: else:
raise raise
@ -70,6 +82,10 @@ def pdftohtml(output_dir, pdf_path, no_images):
logf.flush() logf.flush()
logf.close() logf.close()
out = open(logf.name, 'rb').read().strip() out = open(logf.name, 'rb').read().strip()
try:
os.remove(pdfsrc)
except:
pass
if ret != 0: if ret != 0:
raise ConversionError(out) raise ConversionError(out)
if out: if out:
@ -84,3 +100,4 @@ def pdftohtml(output_dir, pdf_path, no_images):
i.seek(0) i.seek(0)
i.truncate() i.truncate()
i.write(raw) i.write(raw)