The plumbing for images in DOCX Output

2025-07-09 03:04:10 -04:00 · 2015-03-24 15:44:03 +05:30 · 2015-03-24 15:44:03 +05:30 · 8c827eefc6
commit 8c827eefc6
parent 2d768e9f4e
3 changed files with 129 additions and 29 deletions
--- a/src/calibre/ebooks/docx/writer/container.py
+++ b/src/calibre/ebooks/docx/writer/container.py
@@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

-import textwrap
+import textwrap, os
 from io import BytesIO

 from lxml import etree
@@ -14,7 +14,7 @@ from lxml.builder import ElementMaker

 from calibre import guess_type
 from calibre.constants import numeric_version, __appname__
-from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS
+from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS, IMAGES
 from calibre.ebooks.metadata import authors_to_string
 from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
 from calibre.ebooks.oeb.base import OPF, OPF2_NS
@@ -51,7 +51,6 @@ class DocumentRelationships(object):

    def __init__(self):
        self.rmap = {}
-        self.counter = 0
        for typ, target in {
                STYLES: 'styles.xml',
                WEB_SETTINGS: 'webSettings.xml',
@@ -64,11 +63,13 @@ class DocumentRelationships(object):
    def add_relationship(self, target, rtype, target_mode=None):
        ans = self.get_relationship_id(target, rtype, target_mode)
        if ans is None:
-            self.counter += 1
-            ans = 'rId%d' % self.counter
+            ans = 'rId%d' % (len(self.rmap) + 1)
            self.rmap[(target, rtype, target_mode)] = ans
        return ans

+    def add_image(self, target):
+        return self.add_relationship(target, IMAGES)
+
    def serialize(self):
        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
        relationships = E.Relationships()
@@ -113,8 +114,13 @@ class DOCX(object):
        }.iteritems():
            added.add(ext)
            types.append(E.Default(Extension=ext, ContentType=mt))
-        # TODO: Iterate over all resources and add mimetypes for any that are
-        # not already added
+        for fname in self.images:
+            ext = fname.rpartition(os.extsep)[-1]
+            if ext not in added:
+                added.add(ext)
+                mt = guess_type('a.' + ext)[0]
+                if mt:
+                    types.append(E.Default(Extension=ext, ContentType=mt))
        return xml2str(types)

    @property
@@ -176,6 +182,8 @@ class DOCX(object):
            zf.writestr('word/document.xml', xml2str(self.document))
            zf.writestr('word/styles.xml', xml2str(self.styles))
            zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
+            for fname, data_getter in self.images.iteritems():
+                zf.writestr(fname, data_getter())

 if __name__ == '__main__':
    d = DOCX(None, None)
--- a/src/calibre/ebooks/docx/writer/from_html.py
+++ b/src/calibre/ebooks/docx/writer/from_html.py
@@ -13,6 +13,7 @@ from lxml.builder import ElementMaker

 from calibre.ebooks.docx.names import namespaces
 from calibre.ebooks.docx.writer.styles import w, StylesManager
+from calibre.ebooks.docx.writer.images import ImagesManager
 from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
 from calibre.ebooks.oeb.base import XPath, barename
 from calibre.ebooks.pdf.render.common import PAPER_SIZES
@@ -151,18 +152,26 @@ class Convert(object):

    def __call__(self):
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
-        SVGRasterizer()(self.oeb, self.opts)
+        self.svg_rasterizer = SVGRasterizer()
+        self.svg_rasterizer(self.oeb, self.opts)

        self.styles_manager = StylesManager()
+        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)

-        for item in self.oeb.spine:
-            self.process_item(item)
+        try:
+            for item in self.oeb.spine:
+                self.process_item(item)

-        self.styles_manager.finalize(self.blocks)
-        self.write()
+            self.styles_manager.finalize(self.blocks)
+            self.write()
+        finally:
+            self.images_manager.cleanup()

    def process_item(self, item):
-        stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
+        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
+        if stylizer is None:
+            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
+        self.abshref = self.images_manager.abshref = item.abshref

        is_first_block = True
        for body in XPath('//h:body')(item.data):
@@ -177,21 +186,24 @@ class Convert(object):
        block_style = stylizer.style(html_block)
        if block_style.is_hidden:
            return
-        if html_block.text:
-            docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)
+        if html_block.tag.endswith('}img'):
+            b = Block(self.styles_manager, html_block, stylizer.style(html_block))
+            self.blocks.append(b)
+            self.images_manager.add_image(html_block, b, stylizer)
+        else:
+            if html_block.text:
+                docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)

-        for child in html_block.iterchildren(etree.Element):
-            tag = barename(child.tag)
-            style = stylizer.style(child)
-            display = style._get('display')
-            if tag == 'img':
-                pass  # TODO: Handle images
-            if display == 'block' and tag != 'br':
-                b = Block(self.styles_manager, child, style)
-                self.blocks.append(b)
-                self.process_block(child, b, stylizer)
-            else:
-                self.process_inline(child, self.blocks[-1], stylizer)
+            for child in html_block.iterchildren(etree.Element):
+                tag = barename(child.tag)
+                style = stylizer.style(child)
+                display = style._get('display')
+                if display == 'block' and tag != 'br':
+                    b = Block(self.styles_manager, child, style)
+                    self.blocks.append(b)
+                    self.process_block(child, b, stylizer)
+                else:
+                    self.process_inline(child, self.blocks[-1], stylizer)

        if ignore_tail is False and html_block.tail and html_block.tail.strip():
            b = docx_block
@@ -211,7 +223,7 @@ class Convert(object):
            if html_child.tail or html_child is not html_child.getparent()[-1]:
                docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none'))
        elif tag == 'img':
-            return  # TODO: Handle images
+            self.images_manager.add_image(html_child, docx_block, stylizer)
        else:
            if html_child.text:
                docx_block.add_text(html_child.text, style, html_parent=html_child)
@@ -249,7 +261,7 @@ class Convert(object):
            E.docGrid(**{w('linePitch'):"360"}),
        ))

-        dn = {k:v for k, v in namespaces.iteritems() if k in 'wr'}
+        dn = {k:v for k, v in namespaces.iteritems() if k in tuple('wra') + ('wp',)}
        E = ElementMaker(namespace=dn['w'], nsmap=dn)
        self.docx.styles = E.styles(
            E.docDefaults(
@@ -268,4 +280,6 @@ class Convert(object):
                )
            )
        )
+        self.docx.images = {}
        self.styles_manager.serialize(self.docx.styles)
+        self.images_manager.serialize(self.docx.images)
--- a/src/calibre/ebooks/docx/writer/images.py
+++ b/src/calibre/ebooks/docx/writer/images.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import os
+import shutil, posixpath
+from collections import namedtuple
+from functools import partial
+
+from calibre.ebooks.oeb.base import urlunquote
+from calibre.ptempfile import PersistentTemporaryDirectory
+from calibre.utils.filenames import ascii_filename
+from calibre.utils.magick.draw import identify_data
+
+Image = namedtuple('Image', 'rid fname width height fmt item')
+
+class ImagesManager(object):
+
+    def __init__(self, oeb, document_relationships):
+        self.oeb, self.log = oeb, oeb.log
+        self.images = {}
+        self.seen_filenames = set()
+        self.document_relationships = document_relationships
+        self._tdir = None
+
+    @property
+    def tdir(self):
+        if self._tdir is None:
+            self._tdir = PersistentTemporaryDirectory(suffix='_docx_output_images')
+        return self._tdir
+
+    def cleanup(self):
+        if self._tdir is not None:
+            shutil.rmtree(self._tdir)
+            self._tdir = None
+
+    def add_image(self, img, block, stylizer):
+        src = img.get('src')
+        if not src:
+            return
+        href = self.abshref(src)
+        if href not in self.images:
+            item = self.oeb.manifest.hrefs.get(href)
+            if item is None or not isinstance(item.data, bytes):
+                return
+            width, height, fmt = identify_data(item.data)
+            image_fname = 'media/' + self.create_filename(href, fmt)
+            image_rid = self.document_relationships.add_image(image_fname)
+            self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
+            item.unload_data_from_memory()
+        return self.images[href].rid
+
+    def create_filename(self, href, fmt):
+        fname = ascii_filename(urlunquote(posixpath.basename(href)))
+        fname = posixpath.splitext(fname)[0]
+        fname = fname[:75].rstrip('.') or 'image'
+        num = 0
+        base = fname
+        while fname.lower() in self.seen_filenames:
+            num += 1
+            fname = base + str(num)
+        self.seen_filenames.add(fname.lower())
+        fname += os.extsep + fmt.lower()
+        return fname
+
+    def serialize(self, images_map):
+        for img in self.images.itervalues():
+            images_map['word/' + img.fname] = partial(self.get_data, img.item)
+
+    def get_data(self, item):
+        try:
+            return item.data
+        finally:
+            item.unload_data_from_memory(False)