diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py
index 1ccc4209be..44da4e8095 100644
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@@ -1,30 +1,43 @@
-# -*- coding: utf-8 -*-
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2008, Kovid Goyal
-from __future__ import print_function
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal , ' \
- '2009, John Schember '
-__docformat__ = 'restructuredtext en'
+from __future__ import print_function, unicode_literals
-import errno, os, sys, subprocess, shutil, re
-from functools import partial
+import errno
+import os
+import re
+import shutil
+import subprocess
+import sys
+from calibre import CurrentDir, replace_entities, prints
+from calibre.constants import (
+ filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows
+)
from calibre.ebooks import ConversionError, DRMError
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ptempfile import PersistentTemporaryFile
-from calibre.constants import (isosx, iswindows, islinux, isbsd,
- filesystem_encoding)
-from calibre import CurrentDir
from calibre.utils.cleantext import clean_xml_chars
+from calibre.utils.ipc import eintr_retry_call
+
PDFTOHTML = 'pdftohtml'
-popen = subprocess.Popen
+
+
+def popen(cmd, **kw):
+ if not ispy3:
+ cmd = [x.encode(filesystem_encoding) if not isinstance(x, bytes) else x for x in cmd]
+ if iswindows:
+ kw['creationflags'] = 0x08
+ return subprocess.Popen(cmd, **kw)
+
+
if isosx and hasattr(sys, 'frameworks_dir'):
PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML)
if iswindows and hasattr(sys, 'frozen'):
base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable)
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
- popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up
if (islinux or isbsd) and getattr(sys, 'frozen', False):
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
@@ -36,37 +49,29 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
It will also write all extracted images to the output_dir
'''
- pdfsrc = os.path.join(output_dir, u'src.pdf')
- index = os.path.join(output_dir, u'index.'+('xml' if as_xml else 'html'))
+ pdfsrc = os.path.join(output_dir, 'src.pdf')
+ index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))
- with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
+ with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
shutil.copyfileobj(src, dest)
with CurrentDir(output_dir):
- # This is necessary as pdftohtml doesn't always (linux) respect
- # absolute paths. Also, it allows us to safely pass only bytestring
- # arguments to subprocess on widows
- # subprocess in python 2 cannot handle unicode arguments on windows
- # that cannot be encoded with mbcs. Ensure all args are
- # bytestrings.
def a(x):
- return os.path.basename(x).encode('ascii')
+ return os.path.basename(x)
- exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
- unicode) else PDFTOHTML
-
- cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
- b'-nodrm', a(pdfsrc), a(index)]
+ exe = PDFTOHTML
+ cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
+ '-nodrm', a(pdfsrc), a(index)]
if isbsd:
- cmd.remove(b'-nodrm')
+ cmd.remove('-nodrm')
if no_images:
- cmd.append(b'-i')
+ cmd.append('-i')
if as_xml:
cmd.append('-xml')
- logf = PersistentTemporaryFile(u'pdftohtml_log')
+ logf = PersistentTemporaryFile('pdftohtml_log')
try:
p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
stdin=subprocess.PIPE)
@@ -76,53 +81,44 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
_('Could not find pdftohtml, check it is in your PATH'))
else:
raise
-
- while True:
- try:
- ret = p.wait()
- break
- except OSError as e:
- if e.errno == errno.EINTR:
- continue
- else:
- raise
+ ret = eintr_retry_call(p.wait)
logf.flush()
logf.close()
- out = open(logf.name, 'rb').read().strip()
+ out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
if ret != 0:
- raise ConversionError(b'pdftohtml failed with return code: %d\n%s' % (ret, out))
+ raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
if out:
- print("pdftohtml log:")
- print(out)
+ prints("pdftohtml log:")
+ prints(out)
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise DRMError()
if not as_xml:
with lopen(index, 'r+b') as i:
- raw = i.read()
+ raw = i.read().decode('utf-8')
raw = flip_images(raw)
raw = raw.replace('\n = 0.20 output self closing
tags, this
# breaks the pdf heuristics regexps, so replace them
- raw = raw.replace(b'
', b'
')
- raw = re.sub(br'', '
')
+ raw = re.sub(r']+/?>', raw, flags=re.I):
+ for match in re.finditer('
]+/?>', raw, flags=re.I):
img = match.group()
- m = re.search(br'class="(x|y|xy)flip"', img)
+ m = re.search(r'class="(x|y|xy)flip"', img)
if m is None:
continue
flip = m.group(1)
- src = re.search(br'src="([^"]+)"', img)
+ src = re.search(r'src="([^"]+)"', img)
if src is None:
continue
img = src.group(1)
if not os.path.exists(img):
continue
flip_image(img, flip)
- raw = re.sub(br'\s*', b'', raw, flags=re.I|re.DOTALL)
+ raw = re.sub(r'\s*', '', raw, flags=re.I|re.DOTALL)
return raw