From beea0a48b51aca369154d46e54c49c5b9edfa5df Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 13 Sep 2013 17:07:40 +0530 Subject: [PATCH] DOCX Input: Add support for EMF images DOCX Input: Add support for embedded EMF images that are just wrappers around an actual raster image. See #1224849 (DOCX to EPUB conversion failed with devide by zero) --- src/calibre/ebooks/docx/images.py | 14 ++++- src/calibre/ebooks/docx/to_html.py | 2 +- src/calibre/utils/wmf/__init__.py | 65 +++++++++++++++++++++ src/calibre/utils/wmf/emf.py | 93 ++++++++++++++++++++++++++++++ src/calibre/utils/wmf/parse.py | 73 ++--------------------- 5 files changed, 178 insertions(+), 69 deletions(-) create mode 100644 src/calibre/utils/wmf/emf.py diff --git a/src/calibre/ebooks/docx/images.py b/src/calibre/ebooks/docx/images.py index 3be3d51c05..4c0c8c8b6d 100644 --- a/src/calibre/ebooks/docx/images.py +++ b/src/calibre/ebooks/docx/images.py @@ -91,12 +91,13 @@ def get_hpos(anchor, page_width): class Images(object): - def __init__(self): + def __init__(self, log): self.rid_map = {} self.used = {} self.names = set() self.all_images = set() self.links = [] + self.log = log def __call__(self, relationships_by_id): self.rid_map = relationships_by_id @@ -109,6 +110,17 @@ class Images(object): raw = self.docx.read(fname) base = base or ascii_filename(rid_map[rid].rpartition('/')[-1]).replace(' ', '_') or 'image' ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg' + if ext == 'emf': + # For an example, see: https://bugs.launchpad.net/bugs/1224849 + self.log('Found an EMF image: %s, trying to extract embedded raster image' % base) + from calibre.utils.wmf.emf import emf_unwrap + try: + raw = emf_unwrap(raw) + except Exception as e: + self.log.exception('Failed to extract embedded raster image from EMF') + else: + ext = 'png' + base = base.rpartition('.')[0] if not base: base = 'image' diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index cdb50fcffd..b878714a76 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -57,7 +57,7 @@ class Convert(object): self.tables = Tables() self.fields = Fields() self.styles = Styles(self.tables) - self.images = Images() + self.images = Images(self.log) self.object_map = OrderedDict() self.html = HTML( HEAD( diff --git a/src/calibre/utils/wmf/__init__.py b/src/calibre/utils/wmf/__init__.py index d3e2ba2bb3..c1304c50be 100644 --- a/src/calibre/utils/wmf/__init__.py +++ b/src/calibre/utils/wmf/__init__.py @@ -5,6 +5,8 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import struct + class Unavailable(Exception): pass @@ -12,3 +14,66 @@ class NoRaster(Exception): pass +class DIBHeader(object): + + ''' + See http://en.wikipedia.org/wiki/BMP_file_format + ''' + + def __init__(self, raw): + hsize = struct.unpack(b' 0 - def SetMapMode(self, params): if len(params) == 2: self.map_mode = struct.unpack('