Fix pdftohtml on windows with unicode paths

2026-04-03 15:51:58 -04:00 · 2012-02-22 18:56:58 +05:30 · 2012-02-22 18:56:58 +05:30 · 3ce881ce34
commit 3ce881ce34
parent fc0bbaf796
1 changed files with 37 additions and 20 deletions
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@ -5,15 +5,13 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
                '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-import errno
-import os
-import sys
-import subprocess
+import errno, os, sys, subprocess, shutil
 from functools import partial

 from calibre.ebooks import ConversionError, DRMError
 from calibre.ptempfile import PersistentTemporaryFile
-from calibre.constants import isosx, iswindows, islinux, isbsd
+from calibre.constants import (isosx, iswindows, islinux, isbsd,
+            filesystem_encoding)
 from calibre import CurrentDir

 PDFTOHTML = 'pdftohtml'
@ -30,31 +28,45 @@ def pdftohtml(output_dir, pdf_path, no_images):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
-    It will also wirte all extracted images to the output_dir
+    It will also write all extracted images to the output_dir
    '''

-    if isinstance(pdf_path, unicode):
-        pdf_path = pdf_path.encode(sys.getfilesystemencoding())
-    if not os.access(pdf_path, os.R_OK):
-        raise ConversionError('Cannot read from ' + pdf_path)
+    pdfsrc = os.path.join(output_dir, u'src.pdf')
+    index = os.path.join(output_dir, u'index.html')
+
+    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
+        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
-        index = os.path.join(os.getcwdu(), 'index.html')
-        # This is neccessary as pdftohtml doesn't always (linux) respect absolute paths
-        pdf_path = os.path.abspath(pdf_path)
-        cmd = [PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q', pdf_path, os.path.basename(index)]
-        if isbsd:
-            cmd.remove('-nodrm')
-        if no_images:
-            cmd.append('-i')
+        # This is necessary as pdftohtml doesn't always (linux) respect
+        # absolute paths. Also, it allows us to safely pass only bytestring
+        # arguments to subprocess on windows

-        logf = PersistentTemporaryFile('pdftohtml_log')
+        # subprocess in python 2 cannot handle unicode arguments on windows
+        # that cannot be encoded with mbcs. Ensure all args are
+        # bytestrings.
+        def a(x):
+            return os.path.basename(x).encode('ascii')
+
+        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
+                unicode) else PDFTOHTML
+
+        cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
+                b'-nodrm', b'-q', a(pdfsrc), a(index)]
+
+        if isbsd:
+            cmd.remove(b'-nodrm')
+        if no_images:
+            cmd.append(b'-i')
+
+        logf = PersistentTemporaryFile(u'pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                    stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
-                raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
+                raise ConversionError(
+                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise

@ -70,6 +82,10 @@ def pdftohtml(output_dir, pdf_path, no_images):
        logf.flush()
        logf.close()
        out = open(logf.name, 'rb').read().strip()
+        try:
+            os.remove(pdfsrc)
+        except:
+            pass
        if ret != 0:
            raise ConversionError(out)
        if out:
@ -84,3 +100,4 @@ def pdftohtml(output_dir, pdf_path, no_images):
            i.seek(0)
            i.truncate()
            i.write(raw)
+