DOCX Input: Add support for EMF images

DOCX Input: Add support for embedded EMF images that are just wrappers
around an actual raster image. See #1224849 (DOCX to EPUB conversion failed with divide by zero)
This commit is contained in:
Kovid Goyal 2013-09-13 17:07:40 +05:30
parent c6c17ef976
commit beea0a48b5
5 changed files with 178 additions and 69 deletions

View File

@ -91,12 +91,13 @@ def get_hpos(anchor, page_width):
class Images(object): class Images(object):
def __init__(self): def __init__(self, log):
self.rid_map = {} self.rid_map = {}
self.used = {} self.used = {}
self.names = set() self.names = set()
self.all_images = set() self.all_images = set()
self.links = [] self.links = []
self.log = log
def __call__(self, relationships_by_id): def __call__(self, relationships_by_id):
self.rid_map = relationships_by_id self.rid_map = relationships_by_id
@ -109,6 +110,17 @@ class Images(object):
raw = self.docx.read(fname) raw = self.docx.read(fname)
base = base or ascii_filename(rid_map[rid].rpartition('/')[-1]).replace(' ', '_') or 'image' base = base or ascii_filename(rid_map[rid].rpartition('/')[-1]).replace(' ', '_') or 'image'
ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg' ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
if ext == 'emf':
# For an example, see: https://bugs.launchpad.net/bugs/1224849
self.log('Found an EMF image: %s, trying to extract embedded raster image' % base)
from calibre.utils.wmf.emf import emf_unwrap
try:
raw = emf_unwrap(raw)
except Exception as e:
self.log.exception('Failed to extract embedded raster image from EMF')
else:
ext = 'png'
base = base.rpartition('.')[0] base = base.rpartition('.')[0]
if not base: if not base:
base = 'image' base = 'image'

View File

@ -57,7 +57,7 @@ class Convert(object):
self.tables = Tables() self.tables = Tables()
self.fields = Fields() self.fields = Fields()
self.styles = Styles(self.tables) self.styles = Styles(self.tables)
self.images = Images() self.images = Images(self.log)
self.object_map = OrderedDict() self.object_map = OrderedDict()
self.html = HTML( self.html = HTML(
HEAD( HEAD(

View File

@ -5,6 +5,8 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import struct
class Unavailable(Exception): class Unavailable(Exception):
pass pass
@ -12,3 +14,66 @@ class NoRaster(Exception):
pass pass
class DIBHeader(object):

    '''
    Parse the header of a device independent bitmap (DIB).

    See http://en.wikipedia.org/wiki/BMP_file_format

    Two header layouts are understood, selected by the leading 32-bit
    little-endian size field: the 40 byte info header and the 12 byte core
    header. Every header field becomes an attribute of the instance. Two
    derived attributes are added:

        bitmasks_size    - size in bytes of the bit masks that follow the
                           header (12 for compression method 3, i.e.
                           BI_BITFIELDS, otherwise 0)
        color_table_size - size in bytes of the color table

    :param raw: The DIB data, starting at the header
    :raises ValueError: If the header size is neither 40 nor 12
    '''

    INFO_FIELDS = (
        'header_size', 'width', 'height', 'color_planes',
        'bits_per_pixel', 'compression', 'image_size',
        'hres', 'vres', 'ncols', 'nimpcols'
    )
    CORE_FIELDS = (
        'header_size', 'width', 'height', 'color_planes',
        'bits_per_pixel'
    )

    def __init__(self, raw):
        hsize = struct.unpack(b'<I', raw[:4])[0]
        if hsize == 40:
            fmt, names = b'<IiiHHIIIIII', self.INFO_FIELDS
        elif hsize == 12:
            fmt, names = b'<IHHHH', self.CORE_FIELDS
        else:
            raise ValueError('Unsupported DIB header type of size: %d'%hsize)
        for attr, val in zip(names, struct.unpack(fmt, raw[:hsize])):
            setattr(self, attr, val)

        self.bitmasks_size = 12 if getattr(self, 'compression', 0) == 3 else 0
        self.color_table_size = 0
        if self.bits_per_pixel != 24:
            # See http://support.microsoft.com/kb/q81498/
            # for all the gory Micro and soft details
            ncols = getattr(self, 'ncols', 0)
            if not ncols and 0 < self.bits_per_pixel <= 8:
                # A color count of zero (or a core header, which has no
                # color count field at all) means the full palette of
                # 2**bits_per_pixel entries is present
                ncols = 1 << self.bits_per_pixel
            # Core headers use 3 byte palette entries, info headers 4 byte
            entry_size = 3 if hsize == 12 else 4
            self.color_table_size = ncols * entry_size
def create_bmp_from_dib(raw):
    '''
    Turn a bare DIB (header, optional bit masks/color table and pixel data)
    into a complete BMP file by prepending the 14 byte BMP file header.

    :param raw: The DIB data, starting at the DIB header
    :return: The data of the BMP file as a bytestring
    :raises ValueError: If the DIB header is of an unsupported type (raised
                        by DIBHeader)
    '''
    file_header_size = 14
    dh = DIBHeader(raw)
    size = len(raw) + file_header_size
    # The offset stored in the file header is measured from the start of the
    # file, so it has to include the size of the file header itself
    pixel_array_offset = (file_header_size + dh.header_size +
                          dh.bitmasks_size + dh.color_table_size)
    parts = [
        b'BM',                                 # magic number
        struct.pack(b'<I', size),              # total file size
        b'\0'*4,                               # two reserved 16-bit fields
        struct.pack(b'<I', pixel_array_offset)
    ]
    return b''.join(parts) + raw
def to_png(bmp):
    '''
    Convert image data in BMP format into PNG format.

    Qt is tried first. If Qt cannot load the data, or fails to encode it as
    PNG, ImageMagick is used instead.

    :param bmp: The BMP file data as a bytestring
    :return: The PNG data
    '''
    # ImageMagick does not convert some bmp files correctly, while Qt does,
    # so try Qt first. See for instance:
    # https://bugs.launchpad.net/calibre/+bug/934167
    # ImageMagick bug report:
    # http://www.imagemagick.org/discourse-server/viewtopic.php?f=3&t=20350
    from PyQt4.Qt import QImage, QByteArray, QBuffer
    i = QImage()
    if i.loadFromData(bmp):
        ba = QByteArray()
        buf = QBuffer(ba)
        buf.open(QBuffer.WriteOnly)
        # save() reports failure via its return value; only trust the
        # buffer contents when it succeeded, otherwise fall through to
        # ImageMagick instead of returning empty/partial data
        if i.save(buf, 'png'):
            return bytes(ba.data())

    from calibre.utils.magick import Image
    img = Image()
    img.load(bmp)
    return img.export('png')

View File

@ -0,0 +1,93 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys
from struct import unpack_from
from collections import namedtuple
from calibre.utils.wmf import create_bmp_from_dib, to_png
# Record types {{{
# Symbolic name -> numeric record type. The EMF class looks for a method
# named handle_<name> (lower case, without the EMR_ prefix) for every entry.
RECORD_TYPES = {
    'EMR_BITBLT'            : 0x4C,
    'EMR_STRETCHBLT'        : 0x4D,
    'EMR_MASKBLT'           : 0x4E,
    'EMR_PLGBLT'            : 0x4F,
    'EMR_SETDIBITSTODEVICE' : 0x50,
    'EMR_STRETCHDIBITS'     : 0x51,
    'EMR_ALPHABLEND'        : 0x72,
    'EMR_TRANSPARENTBLT'    : 0x74,

    'EOF'                   : 0xe,

    'HEADER'                : 0x1,
}
# Reverse map, numeric record type -> symbolic name, used for reporting.
# items() rather than iteritems() so this also works on Python 3
RECORD_RMAP = {v:k for k, v in RECORD_TYPES.items()}

# The eighteen 32-bit fields that follow the 8 byte record header in a
# stretchdibits record, in the order in which they are unpacked
StretchDiBits = namedtuple(
    'StretchDiBits', 'left top right bottom x_dest y_dest x_src y_src cx_src'
    ' cy_src bmp_hdr_offset bmp_header_size bmp_bits_offset'
    ' bmp_bits_size usage op dest_width dest_height')
# }}}
class EMF(object):

    '''
    Minimal reader for EMF data that walks over the records and collects
    the bitmaps embedded in stretchdibits records.

    After construction:

        bitmaps          - list of BMP files (bytestrings), one per
                           stretchdibits record, in file order
        has_raster_image - True iff at least one bitmap was found
        found_eof        - True iff an EOF record was encountered

    :param raw: The EMF data as a bytestring
    :param verbose: If non-zero, report ignored/malformed records on stdout
    '''

    # Every record starts with two 32-bit fields: its type and its size
    RECORD_HEADER_SIZE = 8

    def __init__(self, raw, verbose=0):
        self.pos = 0
        self.found_eof = False
        self.verbose = verbose
        # Record type -> handler. Record types that have no handle_* method
        # on this class are routed to handle_unknown
        self.func_map = {
            v:getattr(self, 'handle_%s' % (k.replace('EMR_', '').lower()), self.handle_unknown)
            for k, v in RECORD_TYPES.items()}
        self.bitmaps = []
        while self.pos < len(raw) and not self.found_eof:
            self.read_record(raw)
        self.has_raster_image = bool(self.bitmaps)

    def handle_unknown(self, rtype, size, raw):
        ' Called for every record that has no dedicated handler '
        if self.verbose:
            print('Ignoring unknown record:', RECORD_RMAP.get(rtype, hex(rtype).upper()))

    def handle_header(self, rtype, size, raw):
        ' The header record carries nothing that is needed here '
        pass

    def handle_stretchdibits(self, rtype, size, raw):
        ' Extract the DIB stored in the record and save it as a BMP file '
        data = StretchDiBits(*unpack_from(b'<18I', raw, 8))
        # The offsets are relative to the start of the record
        hdr = raw[data.bmp_hdr_offset:data.bmp_hdr_offset + data.bmp_header_size]
        bits = raw[data.bmp_bits_offset:data.bmp_bits_offset + data.bmp_bits_size]
        bmp = create_bmp_from_dib(hdr + bits)
        self.bitmaps.append(bmp)

    def handle_eof(self, rtype, size, raw):
        ' Stop the record loop in __init__ '
        self.found_eof = True

    def read_record(self, raw):
        '''
        Read the record starting at self.pos, advance self.pos past it and
        dispatch the record to its handler.

        If the data at self.pos cannot be a record (fewer than 8 bytes are
        left, or the size field is smaller than the record header) parsing
        is abandoned by moving self.pos to the end of the data. Without
        this a size field of zero would leave self.pos unchanged and the
        loop in __init__ would never terminate.
        '''
        if len(raw) - self.pos < self.RECORD_HEADER_SIZE:
            if self.verbose:
                print('Truncated record header at offset %d, stopping' % self.pos)
            self.pos = len(raw)
            return
        rtype, size = unpack_from(b'<II', raw, self.pos)
        if size < self.RECORD_HEADER_SIZE:
            if self.verbose:
                print('Invalid record size %d at offset %d, stopping' % (size, self.pos))
            self.pos = len(raw)
            return
        record = raw[self.pos:self.pos+size]
        self.pos += size
        self.func_map.get(rtype, self.handle_unknown)(rtype, size, record)

    def to_png(self):
        ' Return the largest (by data size) embedded bitmap, as PNG data '
        # The sort is stable, so of several equally large bitmaps the last
        # one in the file is chosen
        bmp = sorted(self.bitmaps, key=len)[-1]
        return to_png(bmp)
def emf_unwrap(raw, verbose=0):
    '''
    Extract the largest raster image embedded in the EMF data ``raw`` and
    return it as PNG data.

    :param raw: The EMF data as a bytestring
    :param verbose: Passed through to the EMF parser
    :raises ValueError: If the EMF contains no raster image
    '''
    emf = EMF(raw, verbose=verbose)
    if emf.has_raster_image:
        return emf.to_png()
    raise ValueError('No raster image found in the EMF')
if __name__ == '__main__':
    # Debugging aid: parse the EMF file named by the last command line
    # argument and dump the first embedded bitmap, as well as the PNG
    # conversion of the largest one, into /t
    with open(sys.argv[-1], 'rb') as f:
        raw = f.read()
    emf = EMF(raw, verbose=4)
    # Use context managers so that the output files are flushed and closed
    # deterministically
    with open('/t/test.bmp', 'wb') as f:
        f.write(emf.bitmaps[0])
    with open('/t/test.png', 'wb') as f:
        f.write(emf.to_png())

View File

@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
import sys, struct import sys, struct
from calibre.utils.wmf import create_bmp_from_dib, to_png
class WMFHeader(object): class WMFHeader(object):
@ -34,39 +34,6 @@ class WMFHeader(object):
self.records_start_at = header_size * 2 self.records_start_at = header_size * 2
class DIBHeader(object):
'''
See http://en.wikipedia.org/wiki/BMP_file_format
'''
def __init__(self, raw):
hsize = struct.unpack('<I', raw[:4])[0]
if hsize == 40:
parts = struct.unpack('<IiiHHIIIIII', raw[:hsize])
for i, attr in enumerate((
'header_size', 'width', 'height', 'color_planes',
'bits_per_pixel', 'compression', 'image_size',
'hres', 'vres', 'ncols', 'nimpcols'
)):
setattr(self, attr, parts[i])
elif hsize == 12:
parts = struct.unpack('<IHHHH', raw[:hsize])
for i, attr in enumerate((
'header_size', 'width', 'height', 'color_planes',
'bits_per_pixel')):
setattr(self, attr, parts[i])
else:
raise ValueError('Unsupported DIB header type of size: %d'%hsize)
self.bitmasks_size = 12 if getattr(self, 'compression', 0) == 3 else 0
self.color_table_size = 0
if self.bits_per_pixel != 24:
# See http://support.microsoft.com/kb/q81498/
# for all the gory Micro and soft details
self.color_table_size = getattr(self, 'ncols', 0) * 4
class WMF(object): class WMF(object):
def __init__(self, log=None, verbose=0): def __init__(self, log=None, verbose=0):
@ -80,7 +47,7 @@ class WMF(object):
self.window_extent = None self.window_extent = None
self.bitmaps = [] self.bitmaps = []
self.function_map = { # {{{ self.function_map = { # {{{
30: 'SaveDC', 30: 'SaveDC',
53: 'RealizePalette', 53: 'RealizePalette',
55: 'SetPalEntries', 55: 'SetPalEntries',
@ -160,7 +127,7 @@ class WMF(object):
2851: 'StretchBlt', 2851: 'StretchBlt',
2881: 'DibStretchBlt', 2881: 'DibStretchBlt',
3907: 'StretchDIBits' 3907: 'StretchDIBits'
} # }}} } # }}}
def __call__(self, stream_or_data): def __call__(self, stream_or_data):
data = stream_or_data data = stream_or_data
@ -174,7 +141,7 @@ class WMF(object):
self.records = [] self.records = []
while offset < len(data)-6: while offset < len(data)-6:
size, func = struct.unpack_from('<IH', data, offset) size, func = struct.unpack_from('<IH', data, offset)
size *= 2 # Convert to bytes size *= 2 # Convert to bytes
offset += hsize offset += hsize
params = '' params = ''
delta = size - hsize delta = size - hsize
@ -197,7 +164,6 @@ class WMF(object):
self.has_raster_image = len(self.bitmaps) > 0 self.has_raster_image = len(self.bitmaps) > 0
def SetMapMode(self, params): def SetMapMode(self, params):
if len(params) == 2: if len(params) == 2:
self.map_mode = struct.unpack('<H', params)[0] self.map_mode = struct.unpack('<H', params)[0]
@ -231,40 +197,13 @@ class WMF(object):
dest_width, y_dest, x_dest = struct.unpack_from('<IHHHHHHHH', raw, offset) dest_width, y_dest, x_dest = struct.unpack_from('<IHHHHHHHH', raw, offset)
offset += struct.calcsize(fmt) offset += struct.calcsize(fmt)
bmp_data = raw[offset:] bmp_data = raw[offset:]
bmp = self.create_bmp_from_dib(bmp_data) bmp = create_bmp_from_dib(bmp_data)
self.bitmaps.append(bmp) self.bitmaps.append(bmp)
def create_bmp_from_dib(self, raw):
size = len(raw) + 14
dh = DIBHeader(raw)
pixel_array_offset = dh.header_size + dh.bitmasks_size + \
dh.color_table_size
parts = ['BM', struct.pack('<I', size), '\0'*4, struct.pack('<I',
pixel_array_offset)]
return ''.join(parts) + raw
def to_png(self): def to_png(self):
bmps = list(sorted(self.bitmaps, key=lambda x: len(x))) bmps = list(sorted(self.bitmaps, key=lambda x: len(x)))
bmp = bmps[-1] bmp = bmps[-1]
return to_png(bmp)
# ImageMagick does not convert some bmp files correctly, while Qt does,
# so try Qt first. See for instance:
# https://bugs.launchpad.net/calibre/+bug/934167
# ImageMagick bug report:
# http://www.imagemagick.org/discourse-server/viewtopic.php?f=3&t=20350
from PyQt4.Qt import QImage, QByteArray, QBuffer
i = QImage()
if i.loadFromData(bmp):
ba = QByteArray()
buf = QBuffer(ba)
buf.open(QBuffer.WriteOnly)
i.save(buf, 'png')
return bytes(ba.data())
from calibre.utils.magick import Image
img = Image()
img.load(bmp)
return img.export('png')
def wmf_unwrap(wmf_data, verbose=0): def wmf_unwrap(wmf_data, verbose=0):
''' '''