The plumbing for images in DOCX Output

2025-07-09 03:04:10 -04:00 · 2015-03-24 15:44:03 +05:30 · 2015-03-24 15:44:03 +05:30 · 8c827eefc6
commit 8c827eefc6
parent 2d768e9f4e
3 changed files with 129 additions and 29 deletions
--- a/src/calibre/ebooks/docx/writer/container.py
+++ b/src/calibre/ebooks/docx/writer/container.py
@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
-import textwrap
+import textwrap, os
 from io import BytesIO
 from lxml import etree
@ -14,7 +14,7 @@ from lxml.builder import ElementMaker
 from calibre import guess_type
 from calibre.constants import numeric_version, __appname__
-from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS
+from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS, IMAGES
 from calibre.ebooks.metadata import authors_to_string
 from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
 from calibre.ebooks.oeb.base import OPF, OPF2_NS
@ -51,7 +51,6 @@ class DocumentRelationships(object):
    def __init__(self):
        self.rmap = {}
-        self.counter = 0
        for typ, target in {
                STYLES: 'styles.xml',
                WEB_SETTINGS: 'webSettings.xml',
@ -64,11 +63,13 @@ class DocumentRelationships(object):
    def add_relationship(self, target, rtype, target_mode=None):
        ans = self.get_relationship_id(target, rtype, target_mode)
        if ans is None:
-            self.counter += 1
+            ans = 'rId%d' % (len(self.rmap) + 1)
-            ans = 'rId%d' % self.counter
            self.rmap[(target, rtype, target_mode)] = ans
        return ans
    def add_image(self, target):
        """Register ``target`` as a relationship of type ``IMAGES``.

        Thin wrapper around :meth:`add_relationship`; returns whatever that
        method returns for the ``(target, IMAGES, None)`` key.
        """
        return self.add_relationship(target, IMAGES)
    def serialize(self):
        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
        relationships = E.Relationships()
@ -113,8 +114,13 @@ class DOCX(object):
        }.iteritems():
            added.add(ext)
            types.append(E.Default(Extension=ext, ContentType=mt))
-        # TODO: Iterate over all resources and add mimetypes for any that are
+        for fname in self.images:
-        # not already added
+            ext = fname.rpartition(os.extsep)[-1]
            if ext not in added:
                added.add(ext)
                mt = guess_type('a.' + ext)[0]
                if mt:
                    types.append(E.Default(Extension=ext, ContentType=mt))
        return xml2str(types)
    @property
@ -176,6 +182,8 @@ class DOCX(object):
            zf.writestr('word/document.xml', xml2str(self.document))
            zf.writestr('word/styles.xml', xml2str(self.styles))
            zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
            for fname, data_getter in self.images.iteritems():
                zf.writestr(fname, data_getter())
 if __name__ == '__main__':
    d = DOCX(None, None)
--- a/src/calibre/ebooks/docx/writer/from_html.py
+++ b/src/calibre/ebooks/docx/writer/from_html.py
@ -13,6 +13,7 @@ from lxml.builder import ElementMaker
 from calibre.ebooks.docx.names import namespaces
 from calibre.ebooks.docx.writer.styles import w, StylesManager
 from calibre.ebooks.docx.writer.images import ImagesManager
 from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
 from calibre.ebooks.oeb.base import XPath, barename
 from calibre.ebooks.pdf.render.common import PAPER_SIZES
@ -151,18 +152,26 @@ class Convert(object):
    def __call__(self):
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
-        SVGRasterizer()(self.oeb, self.opts)
+        self.svg_rasterizer = SVGRasterizer()
        self.svg_rasterizer(self.oeb, self.opts)
        self.styles_manager = StylesManager()
        self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
-        for item in self.oeb.spine:
+        try:
-            self.process_item(item)
+            for item in self.oeb.spine:
                self.process_item(item)
-        self.styles_manager.finalize(self.blocks)
+            self.styles_manager.finalize(self.blocks)
-        self.write()
+            self.write()
        finally:
            self.images_manager.cleanup()
    def process_item(self, item):
-        stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
+        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
        self.abshref = self.images_manager.abshref = item.abshref
        is_first_block = True
        for body in XPath('//h:body')(item.data):
@ -177,21 +186,24 @@ class Convert(object):
        block_style = stylizer.style(html_block)
        if block_style.is_hidden:
            return
-        if html_block.text:
+        if html_block.tag.endswith('}img'):
-            docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)
+            b = Block(self.styles_manager, html_block, stylizer.style(html_block))
            self.blocks.append(b)
            self.images_manager.add_image(html_block, b, stylizer)
        else:
            if html_block.text:
                docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)
-        for child in html_block.iterchildren(etree.Element):
+            for child in html_block.iterchildren(etree.Element):
-            tag = barename(child.tag)
+                tag = barename(child.tag)
-            style = stylizer.style(child)
+                style = stylizer.style(child)
-            display = style._get('display')
+                display = style._get('display')
-            if tag == 'img':
+                if display == 'block' and tag != 'br':
-                pass  # TODO: Handle images
+                    b = Block(self.styles_manager, child, style)
-            if display == 'block' and tag != 'br':
+                    self.blocks.append(b)
-                b = Block(self.styles_manager, child, style)
+                    self.process_block(child, b, stylizer)
-                self.blocks.append(b)
+                else:
-                self.process_block(child, b, stylizer)
+                    self.process_inline(child, self.blocks[-1], stylizer)
-            else:
-                self.process_inline(child, self.blocks[-1], stylizer)
        if ignore_tail is False and html_block.tail and html_block.tail.strip():
            b = docx_block
@ -211,7 +223,7 @@ class Convert(object):
            if html_child.tail or html_child is not html_child.getparent()[-1]:
                docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none'))
        elif tag == 'img':
-            return  # TODO: Handle images
+            self.images_manager.add_image(html_child, docx_block, stylizer)
        else:
            if html_child.text:
                docx_block.add_text(html_child.text, style, html_parent=html_child)
@ -249,7 +261,7 @@ class Convert(object):
            E.docGrid(**{w('linePitch'):"360"}),
        ))
-        dn = {k:v for k, v in namespaces.iteritems() if k in 'wr'}
+        dn = {k:v for k, v in namespaces.iteritems() if k in tuple('wra') + ('wp',)}
        E = ElementMaker(namespace=dn['w'], nsmap=dn)
        self.docx.styles = E.styles(
            E.docDefaults(
@ -268,4 +280,6 @@ class Convert(object):
                )
            )
        )
        self.docx.images = {}
        self.styles_manager.serialize(self.docx.styles)
        self.images_manager.serialize(self.docx.images)
--- a/src/calibre/ebooks/docx/writer/images.py
+++ b/src/calibre/ebooks/docx/writer/images.py
@ -0,0 +1,78 @@
 #!/usr/bin/env python2
 # vim:fileencoding=utf-8
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__ = 'GPL v3'
 __copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
 import os
 import shutil, posixpath
 from collections import namedtuple
 from functools import partial
 from calibre.ebooks.oeb.base import urlunquote
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.filenames import ascii_filename
 from calibre.utils.magick.draw import identify_data
 # Record stored by ImagesManager for each distinct image href:
 #   rid    - value returned by document_relationships.add_image()
 #   fname  - name of the image inside the package, prefixed with 'media/'
 #   width, height, fmt - values reported by identify_data() for the image data
 #   item   - the OEB manifest item the image data is read from
 Image = namedtuple('Image', 'rid fname width height fmt item')
 class ImagesManager(object):
    """Collect the images referenced while converting a book to DOCX.

    Each distinct image href is registered once with the document
    relationships object and remembered as an ``Image`` record, so that the
    image data can later be handed to the container via :meth:`serialize`.

    ``abshref`` is not set in ``__init__``; it must be assigned on the
    instance before :meth:`add_image` is called.
    """

    def __init__(self, oeb, document_relationships):
        """Store the book and relationships objects and start with no images."""
        log = oeb.log
        self.oeb = oeb
        self.log = log
        self.images = {}  # href -> Image
        self.seen_filenames = set()  # lower-cased base names already handed out
        self.document_relationships = document_relationships
        self._tdir = None  # created lazily by the ``tdir`` property

    @property
    def tdir(self):
        """Temporary directory for this manager, created on first access."""
        path = self._tdir
        if path is None:
            path = self._tdir = PersistentTemporaryDirectory(suffix='_docx_output_images')
        return path

    def cleanup(self):
        """Delete the temporary directory, if one was ever created."""
        if self._tdir is None:
            return
        shutil.rmtree(self._tdir)
        self._tdir = None

    def add_image(self, img, block, stylizer):
        """Register the image referenced by ``img`` and return its relationship id.

        ``img`` must provide ``get('src')``. ``block`` and ``stylizer`` are
        accepted but not used by this method. Returns ``None`` when there is
        no ``src``, when the href is not in the manifest, or when the manifest
        item's data is not a byte string.
        """
        src = img.get('src')
        if not src:
            return
        href = self.abshref(src)
        if href in self.images:
            # Already registered: reuse the existing relationship.
            return self.images[href].rid
        item = self.oeb.manifest.hrefs.get(href)
        if item is None or not isinstance(item.data, bytes):
            return
        width, height, fmt = identify_data(item.data)
        image_fname = 'media/' + self.create_filename(href, fmt)
        image_rid = self.document_relationships.add_image(image_fname)
        self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
        # The data is fetched again through get_data() when it is needed.
        item.unload_data_from_memory()
        return image_rid

    def create_filename(self, href, fmt):
        """Return a file name for ``href`` with the extension ``fmt.lower()``.

        The base name is derived from the last path component of ``href``,
        limited to 75 characters, and made unique (case-insensitively) among
        the names already produced by appending an increasing number.
        """
        stem = ascii_filename(urlunquote(posixpath.basename(href)))
        stem = posixpath.splitext(stem)[0]
        stem = stem[:75].rstrip('.') or 'image'
        candidate, suffix = stem, 0
        while candidate.lower() in self.seen_filenames:
            suffix += 1
            candidate = stem + str(suffix)
        self.seen_filenames.add(candidate.lower())
        return candidate + os.extsep + fmt.lower()

    def serialize(self, images_map):
        """Fill ``images_map`` with ``'word/' + fname`` -> callable returning the data."""
        for image in self.images.itervalues():
            getter = partial(self.get_data, image.item)
            images_map['word/' + image.fname] = getter

    def get_data(self, item):
        """Return ``item.data``, then ask the item to unload its data."""
        try:
            data = item.data
        finally:
            item.unload_data_from_memory(False)
        return data