PDF Input: Fix non-breaking spaces represented as entities in the output of pdftohtml, which breaks some search/replace expressions

Also modernize the pdftohtml module to make it more likely to work under py3, since I was there already.
2025-07-07 18:24:30 -04:00 · 2018-12-29 13:16:42 +05:30 · 2018-12-29 13:16:42 +05:30 · 19b02bd063
commit 19b02bd063
parent 2fb46fb3dd
1 changed files with 57 additions and 61 deletions
--- a/src/calibre/ebooks/pdf/pdftohtml.py
+++ b/src/calibre/ebooks/pdf/pdftohtml.py
@ -1,30 +1,43 @@
-# -*- coding: utf-8 -*-
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>

-from __future__ import print_function
-__license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
-                '2009, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
+from __future__ import print_function, unicode_literals

-import errno, os, sys, subprocess, shutil, re
-from functools import partial
+import errno
+import os
+import re
+import shutil
+import subprocess
+import sys

+from calibre import CurrentDir, replace_entities, prints
+from calibre.constants import (
+    filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows
+)
 from calibre.ebooks import ConversionError, DRMError
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ptempfile import PersistentTemporaryFile
-from calibre.constants import (isosx, iswindows, islinux, isbsd,
-            filesystem_encoding)
-from calibre import CurrentDir
 from calibre.utils.cleantext import clean_xml_chars
+from calibre.utils.ipc import eintr_retry_call
+

 PDFTOHTML = 'pdftohtml'
-popen = subprocess.Popen
+
+
+def popen(cmd, **kw):
+    if not ispy3:
+        cmd = [x.encode(filesystem_encoding) if not isinstance(x, bytes) else x for x in cmd]
+    if iswindows:
+        kw['creationflags'] = 0x08
+    return subprocess.Popen(cmd, **kw)
+
+
 if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML)
 if iswindows and hasattr(sys, 'frozen'):
    base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable)
    PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
-    popen = partial(subprocess.Popen, creationflags=0x08)  # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up
 if (islinux or isbsd) and getattr(sys, 'frozen', False):
    PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')

@ -36,37 +49,29 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    It will also write all extracted images to the output_dir
    '''

-    pdfsrc = os.path.join(output_dir, u'src.pdf')
-    index = os.path.join(output_dir, u'index.'+('xml' if as_xml else 'html'))
+    pdfsrc = os.path.join(output_dir, 'src.pdf')
+    index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))

-    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
+    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):
-        # This is necessary as pdftohtml doesn't always (linux) respect
-        # absolute paths. Also, it allows us to safely pass only bytestring
-        # arguments to subprocess on widows

-        # subprocess in python 2 cannot handle unicode arguments on windows
-        # that cannot be encoded with mbcs. Ensure all args are
-        # bytestrings.
        def a(x):
-            return os.path.basename(x).encode('ascii')
+            return os.path.basename(x)

-        exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
-                unicode) else PDFTOHTML
-
-        cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
-                b'-nodrm', a(pdfsrc), a(index)]
+        exe = PDFTOHTML
+        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
+                '-nodrm', a(pdfsrc), a(index)]

        if isbsd:
-            cmd.remove(b'-nodrm')
+            cmd.remove('-nodrm')
        if no_images:
-            cmd.append(b'-i')
+            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

-        logf = PersistentTemporaryFile(u'pdftohtml_log')
+        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                    stdin=subprocess.PIPE)
@ -76,53 +81,44 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
-
-        while True:
-            try:
-                ret = p.wait()
-                break
-            except OSError as e:
-                if e.errno == errno.EINTR:
-                    continue
-                else:
-                    raise
+        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
-        out = open(logf.name, 'rb').read().strip()
+        out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
        if ret != 0:
-            raise ConversionError(b'pdftohtml failed with return code: %d\n%s' % (ret, out))
+            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
-            print("pdftohtml log:")
-            print(out)
+            prints("pdftohtml log:")
+            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
-                raw = i.read()
+                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n  <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
-                raw = raw.replace(b'<br/>', b'<br>')
-                raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
-                raw = re.sub(br'<a id="(\d+)"', br'<a id="p\1"', raw, flags=re.I)
-                raw = re.sub(br'<a href="index.html#(\d+)"', br'<a href="#p\1"', raw, flags=re.I)
+                raw = raw.replace('<br/>', '<br>')
+                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
+                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
+                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
+                raw = replace_entities(raw)

-                i.write(raw)
+                i.write(raw.encode('utf-8'))

-            cmd = [exe, b'-f', b'1', '-l', '1', b'-xml', b'-i', b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
-                    b'-nodrm', b'-q', b'-stdout', a(pdfsrc)]
+            cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
+                    '-nodrm', '-q', '-stdout', a(pdfsrc)]
+            if isbsd:
+                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

-            if isbsd:
-                cmd.remove(b'-nodrm')
-
        try:
            os.remove(pdfsrc)
        except:
@ -161,24 +157,24 @@ def flip_image(img, flip):
    from calibre.utils.img import flip_image, image_and_format_from_data, image_to_data
    with lopen(img, 'r+b') as f:
        img, fmt = image_and_format_from_data(f.read())
-        img = flip_image(img, horizontal=b'x' in flip, vertical=b'y' in flip)
+        img = flip_image(img, horizontal='x' in flip, vertical='y' in flip)
        f.seek(0), f.truncate()
        f.write(image_to_data(img, fmt=fmt))


 def flip_images(raw):
-    for match in re.finditer(b'<IMG[^>]+/?>', raw, flags=re.I):
+    for match in re.finditer('<IMG[^>]+/?>', raw, flags=re.I):
        img = match.group()
-        m = re.search(br'class="(x|y|xy)flip"', img)
+        m = re.search(r'class="(x|y|xy)flip"', img)
        if m is None:
            continue
        flip = m.group(1)
-        src = re.search(br'src="([^"]+)"', img)
+        src = re.search(r'src="([^"]+)"', img)
        if src is None:
            continue
        img = src.group(1)
        if not os.path.exists(img):
            continue
        flip_image(img, flip)
-    raw = re.sub(br'<STYLE.+?</STYLE>\s*', b'', raw, flags=re.I|re.DOTALL)
+    raw = re.sub(r'<STYLE.+?</STYLE>\s*', '', raw, flags=re.I|re.DOTALL)
    return raw