PDF Input: Fix non-breaking spaces represented as entities in the output of pdftohtml, which breaks some search/replace expressions

Also modernize the pdftohtml module to make it more likely to work under
py3, since I was there already.
This commit is contained in:
Kovid Goyal 2018-12-29 13:16:42 +05:30
parent 2fb46fb3dd
commit 19b02bd063
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,30 +1,43 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, ' \
'2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from __future__ import print_function, unicode_literals
import errno, os, sys, subprocess, shutil, re
from functools import partial
import errno
import os
import re
import shutil
import subprocess
import sys
from calibre import CurrentDir, replace_entities, prints
from calibre.constants import (
filesystem_encoding, isbsd, islinux, isosx, ispy3, iswindows
)
from calibre.ebooks import ConversionError, DRMError
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ptempfile import PersistentTemporaryFile
from calibre.constants import (isosx, iswindows, islinux, isbsd,
filesystem_encoding)
from calibre import CurrentDir
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.ipc import eintr_retry_call
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen
def popen(cmd, **kw):
if not ispy3:
cmd = [x.encode(filesystem_encoding) if not isinstance(x, bytes) else x for x in cmd]
if iswindows:
kw['creationflags'] = 0x08
return subprocess.Popen(cmd, **kw)
if isosx and hasattr(sys, 'frameworks_dir'):
PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML)
if iswindows and hasattr(sys, 'frozen'):
base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable)
PDFTOHTML = os.path.join(base, 'pdftohtml.exe')
popen = partial(subprocess.Popen, creationflags=0x08) # CREATE_NO_WINDOW=0x08 so that no ugly console is popped up
if (islinux or isbsd) and getattr(sys, 'frozen', False):
PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
@ -36,37 +49,29 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
It will also write all extracted images to the output_dir
'''
pdfsrc = os.path.join(output_dir, u'src.pdf')
index = os.path.join(output_dir, u'index.'+('xml' if as_xml else 'html'))
pdfsrc = os.path.join(output_dir, 'src.pdf')
index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html'))
with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
shutil.copyfileobj(src, dest)
with CurrentDir(output_dir):
# This is necessary as pdftohtml doesn't always (linux) respect
# absolute paths. Also, it allows us to safely pass only bytestring
# arguments to subprocess on widows
# subprocess in python 2 cannot handle unicode arguments on windows
# that cannot be encoded with mbcs. Ensure all args are
# bytestrings.
def a(x):
return os.path.basename(x).encode('ascii')
return os.path.basename(x)
exe = PDFTOHTML.encode(filesystem_encoding) if isinstance(PDFTOHTML,
unicode) else PDFTOHTML
cmd = [exe, b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
b'-nodrm', a(pdfsrc), a(index)]
exe = PDFTOHTML
cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
'-nodrm', a(pdfsrc), a(index)]
if isbsd:
cmd.remove(b'-nodrm')
cmd.remove('-nodrm')
if no_images:
cmd.append(b'-i')
cmd.append('-i')
if as_xml:
cmd.append('-xml')
logf = PersistentTemporaryFile(u'pdftohtml_log')
logf = PersistentTemporaryFile('pdftohtml_log')
try:
p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
stdin=subprocess.PIPE)
@ -76,53 +81,44 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
_('Could not find pdftohtml, check it is in your PATH'))
else:
raise
while True:
try:
ret = p.wait()
break
except OSError as e:
if e.errno == errno.EINTR:
continue
else:
raise
ret = eintr_retry_call(p.wait)
logf.flush()
logf.close()
out = open(logf.name, 'rb').read().strip()
out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
if ret != 0:
raise ConversionError(b'pdftohtml failed with return code: %d\n%s' % (ret, out))
raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
if out:
print("pdftohtml log:")
print(out)
prints("pdftohtml log:")
prints(out)
if not os.path.exists(index) or os.stat(index).st_size < 100:
raise DRMError()
if not as_xml:
with lopen(index, 'r+b') as i:
raw = i.read()
raw = i.read().decode('utf-8')
raw = flip_images(raw)
raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
i.seek(0)
i.truncate()
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
# breaks the pdf heuristics regexps, so replace them
raw = raw.replace(b'<br/>', b'<br>')
raw = re.sub(br'<a\s+name=(\d+)', br'<a id="\1"', raw, flags=re.I)
raw = re.sub(br'<a id="(\d+)"', br'<a id="p\1"', raw, flags=re.I)
raw = re.sub(br'<a href="index.html#(\d+)"', br'<a href="#p\1"', raw, flags=re.I)
raw = raw.replace('<br/>', '<br>')
raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
raw = replace_entities(raw)
i.write(raw)
i.write(raw.encode('utf-8'))
cmd = [exe, b'-f', b'1', '-l', '1', b'-xml', b'-i', b'-enc', b'UTF-8', b'-noframes', b'-p', b'-nomerge',
b'-nodrm', b'-q', b'-stdout', a(pdfsrc)]
cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
'-nodrm', '-q', '-stdout', a(pdfsrc)]
if isbsd:
cmd.remove('-nodrm')
p = popen(cmd, stdout=subprocess.PIPE)
raw = p.stdout.read().strip()
if p.wait() == 0 and raw:
parse_outline(raw, output_dir)
if isbsd:
cmd.remove(b'-nodrm')
try:
os.remove(pdfsrc)
except:
@ -161,24 +157,24 @@ def flip_image(img, flip):
from calibre.utils.img import flip_image, image_and_format_from_data, image_to_data
with lopen(img, 'r+b') as f:
img, fmt = image_and_format_from_data(f.read())
img = flip_image(img, horizontal=b'x' in flip, vertical=b'y' in flip)
img = flip_image(img, horizontal='x' in flip, vertical='y' in flip)
f.seek(0), f.truncate()
f.write(image_to_data(img, fmt=fmt))
def flip_images(raw):
for match in re.finditer(b'<IMG[^>]+/?>', raw, flags=re.I):
for match in re.finditer('<IMG[^>]+/?>', raw, flags=re.I):
img = match.group()
m = re.search(br'class="(x|y|xy)flip"', img)
m = re.search(r'class="(x|y|xy)flip"', img)
if m is None:
continue
flip = m.group(1)
src = re.search(br'src="([^"]+)"', img)
src = re.search(r'src="([^"]+)"', img)
if src is None:
continue
img = src.group(1)
if not os.path.exists(img):
continue
flip_image(img, flip)
raw = re.sub(br'<STYLE.+?</STYLE>\s*', b'', raw, flags=re.I|re.DOTALL)
raw = re.sub(r'<STYLE.+?</STYLE>\s*', '', raw, flags=re.I|re.DOTALL)
return raw