mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix pdftohtml on windows with unicode paths
This commit is contained in:
parent
fc0bbaf796
commit
3ce881ce34
@ -5,15 +5,13 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
|
|||||||
'2009, John Schember <john@nachtimwald.com>'
|
'2009, John Schember <john@nachtimwald.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import errno
|
import errno, os, sys, subprocess, shutil
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import subprocess
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from calibre.ebooks import ConversionError, DRMError
|
from calibre.ebooks import ConversionError, DRMError
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
from calibre.constants import isosx, iswindows, islinux, isbsd
|
from calibre.constants import (isosx, iswindows, islinux, isbsd,
|
||||||
|
filesystem_encoding)
|
||||||
from calibre import CurrentDir
|
from calibre import CurrentDir
|
||||||
|
|
||||||
PDFTOHTML = 'pdftohtml'
|
PDFTOHTML = 'pdftohtml'
|
||||||
@ -30,31 +28,45 @@ def pdftohtml(output_dir, pdf_path, no_images):
|
|||||||
'''
|
'''
|
||||||
Convert the pdf into html using the pdftohtml app.
|
Convert the pdf into html using the pdftohtml app.
|
||||||
This will write the html as index.html into output_dir.
|
This will write the html as index.html into output_dir.
|
||||||
It will also wirte all extracted images to the output_dir
|
It will also write all extracted images to the output_dir
|
||||||
'''
|
'''
|
||||||
|
|
||||||
if isinstance(pdf_path, unicode):
|
pdfsrc = os.path.join(output_dir, u'src.pdf')
|
||||||
pdf_path = pdf_path.encode(sys.getfilesystemencoding())
|
index = os.path.join(output_dir, u'index.html')
|
||||||
if not os.access(pdf_path, os.R_OK):
|
|
||||||
raise ConversionError('Cannot read from ' + pdf_path)
|
with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
|
||||||
|
shutil.copyfileobj(src, dest)
|
||||||
|
|
||||||
with CurrentDir(output_dir):
|
with CurrentDir(output_dir):
|
||||||
index = os.path.join(os.getcwdu(), 'index.html')
|
# This is necessary as pdftohtml doesn't always (linux) respect
|
||||||
# This is neccessary as pdftohtml doesn't always (linux) respect absolute paths
|
# absolute paths. Also, it allows us to safely pass only bytestring
|
||||||
pdf_path = os.path.abspath(pdf_path)
|
# arguments to subprocess on windows
|
||||||
cmd = [PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q', pdf_path, os.path.basename(index)]
|
|
||||||
if isbsd:
|
|
||||||
cmd.remove('-nodrm')
|
|
||||||
if no_images:
|
|
||||||
cmd.append('-i')
|
|
||||||
|
|
||||||
logf = PersistentTemporaryFile('pdftohtml_log')
|
# subprocess in python 2 cannot handle unicode arguments on windows
|
||||||
|
# that cannot be encoded with mbcs. Ensure all args are
|
||||||
|
# bytestrings.
|
||||||
|
def a(x):
|
||||||
|
return os.path.basename(x).encode('ascii')
|
||||||
|
|
||||||
|
exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
|
||||||
|
unicode) else PDFTOHTML
|
||||||
|
|
||||||
|
cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
|
||||||
|
b'-nodrm', b'-q', a(pdfsrc), a(index)]
|
||||||
|
|
||||||
|
if isbsd:
|
||||||
|
cmd.remove(b'-nodrm')
|
||||||
|
if no_images:
|
||||||
|
cmd.append(b'-i')
|
||||||
|
|
||||||
|
logf = PersistentTemporaryFile(u'pdftohtml_log')
|
||||||
try:
|
try:
|
||||||
p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
|
p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
|
||||||
stdin=subprocess.PIPE)
|
stdin=subprocess.PIPE)
|
||||||
except OSError as err:
|
except OSError as err:
|
||||||
if err.errno == errno.ENOENT:
|
if err.errno == errno.ENOENT:
|
||||||
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
|
raise ConversionError(
|
||||||
|
_('Could not find pdftohtml, check it is in your PATH'))
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
@ -70,6 +82,10 @@ def pdftohtml(output_dir, pdf_path, no_images):
|
|||||||
logf.flush()
|
logf.flush()
|
||||||
logf.close()
|
logf.close()
|
||||||
out = open(logf.name, 'rb').read().strip()
|
out = open(logf.name, 'rb').read().strip()
|
||||||
|
try:
|
||||||
|
os.remove(pdfsrc)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if ret != 0:
|
if ret != 0:
|
||||||
raise ConversionError(out)
|
raise ConversionError(out)
|
||||||
if out:
|
if out:
|
||||||
@ -84,3 +100,4 @@ def pdftohtml(output_dir, pdf_path, no_images):
|
|||||||
i.seek(0)
|
i.seek(0)
|
||||||
i.truncate()
|
i.truncate()
|
||||||
i.write(raw)
|
i.write(raw)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user