diff --git a/src/calibre/ebooks/pdf/pdftohtml.py b/src/calibre/ebooks/pdf/pdftohtml.py index 1ccc4209be..44da4e8095 100644 --- a/src/calibre/ebooks/pdf/pdftohtml.py +++ b/src/calibre/ebooks/pdf/pdftohtml.py @@ -1,30 +1,43 @@ -# -*- coding: utf-8 -*- +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2008, Kovid Goyal -from __future__ import print_function -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal , ' \ - '2009, John Schember ' -__docformat__ = 'restructuredtext en' +from __future__ import print_function, unicode_literals -import errno, os, sys, subprocess, shutil, re -from functools import partial +import errno +import os +import re +import shutil +import subprocess +import sys +from calibre import CurrentDir, replace_entities, prints +from calibre.constants import ( + filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows +) from calibre.ebooks import ConversionError, DRMError from calibre.ebooks.chardet import xml_to_unicode from calibre.ptempfile import PersistentTemporaryFile -from calibre.constants import (isosx, iswindows, islinux, isbsd, - filesystem_encoding) -from calibre import CurrentDir from calibre.utils.cleantext import clean_xml_chars +from calibre.utils.ipc import eintr_retry_call + PDFTOHTML = 'pdftohtml' -popen = subprocess.Popen + + +def popen(cmd, **kw): + if not ispy3: + cmd = [x.encode(filesystem_encoding) if not isinstance(x, bytes) else x for x in cmd] + if iswindows: + kw['creationflags'] = 0x08 + return subprocess.Popen(cmd, **kw) + + if isosx and hasattr(sys, 'frameworks_dir'): PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML) if iswindows and hasattr(sys, 'frozen'): base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable) PDFTOHTML = os.path.join(base, 'pdftohtml.exe') - popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up if (islinux or isbsd) and getattr(sys, 'frozen', False): PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml') @@ -36,37 +49,29 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): It will also write all extracted images to the output_dir ''' - pdfsrc = os.path.join(output_dir, u'src.pdf') - index = os.path.join(output_dir, u'index.'+('xml' if as_xml else 'html')) + pdfsrc = os.path.join(output_dir, 'src.pdf') + index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html')) - with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest: + with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest: shutil.copyfileobj(src, dest) with CurrentDir(output_dir): - # This is necessary as pdftohtml doesn't always (linux) respect - # absolute paths. Also, it allows us to safely pass only bytestring - # arguments to subprocess on widows - # subprocess in python 2 cannot handle unicode arguments on windows - # that cannot be encoded with mbcs. Ensure all args are - # bytestrings. def a(x): - return os.path.basename(x).encode('ascii') + return os.path.basename(x) - exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML, - unicode) else PDFTOHTML - - cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge', - b'-nodrm', a(pdfsrc), a(index)] + exe = PDFTOHTML + cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', + '-nodrm', a(pdfsrc), a(index)] if isbsd: - cmd.remove(b'-nodrm') + cmd.remove('-nodrm') if no_images: - cmd.append(b'-i') + cmd.append('-i') if as_xml: cmd.append('-xml') - logf = PersistentTemporaryFile(u'pdftohtml_log') + logf = PersistentTemporaryFile('pdftohtml_log') try: p = popen(cmd, stderr=logf._fd, stdout=logf._fd, stdin=subprocess.PIPE) @@ -76,53 +81,44 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): _('Could not find pdftohtml, check it is in your PATH')) else: raise - - while True: - try: - ret = p.wait() - break - except OSError as e: - if e.errno == errno.EINTR: - continue - else: - raise + ret = eintr_retry_call(p.wait) logf.flush() logf.close() - out = open(logf.name, 'rb').read().strip() + out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip() if ret != 0: - raise ConversionError(b'pdftohtml failed with return code: %d\n%s' % (ret, out)) + raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out)) if out: - print("pdftohtml log:") - print(out) + prints("pdftohtml log:") + prints(out) if not os.path.exists(index) or os.stat(index).st_size < 100: raise DRMError() if not as_xml: with lopen(index, 'r+b') as i: - raw = i.read() + raw = i.read().decode('utf-8') raw = flip_images(raw) raw = raw.replace('\n = 0.20 output self closing
tags, this # breaks the pdf heuristics regexps, so replace them - raw = raw.replace(b'
', b'
') - raw = re.sub(br'', '
') + raw = re.sub(r']+/?>', raw, flags=re.I): + for match in re.finditer(']+/?>', raw, flags=re.I): img = match.group() - m = re.search(br'class="(x|y|xy)flip"', img) + m = re.search(r'class="(x|y|xy)flip"', img) if m is None: continue flip = m.group(1) - src = re.search(br'src="([^"]+)"', img) + src = re.search(r'src="([^"]+)"', img) if src is None: continue img = src.group(1) if not os.path.exists(img): continue flip_image(img, flip) - raw = re.sub(br'\s*', b'', raw, flags=re.I|re.DOTALL) + raw = re.sub(r'\s*', '', raw, flags=re.I|re.DOTALL) return raw