The plumbing for images in DOCX Output

This commit is contained in:
Kovid Goyal 2015-03-24 15:44:03 +05:30
parent 2d768e9f4e
commit 8c827eefc6
3 changed files with 129 additions and 29 deletions

View File

@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap
import textwrap, os
from io import BytesIO
from lxml import etree
@ -14,7 +14,7 @@ from lxml.builder import ElementMaker
from calibre import guess_type
from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS
from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS, IMAGES
from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
from calibre.ebooks.oeb.base import OPF, OPF2_NS
@ -51,7 +51,6 @@ class DocumentRelationships(object):
def __init__(self):
self.rmap = {}
self.counter = 0
for typ, target in {
STYLES: 'styles.xml',
WEB_SETTINGS: 'webSettings.xml',
@ -64,11 +63,13 @@ class DocumentRelationships(object):
def add_relationship(self, target, rtype, target_mode=None):
ans = self.get_relationship_id(target, rtype, target_mode)
if ans is None:
self.counter += 1
ans = 'rId%d' % self.counter
ans = 'rId%d' % (len(self.rmap) + 1)
self.rmap[(target, rtype, target_mode)] = ans
return ans
def add_image(self, target):
    """Register ``target`` as an image relationship and return its id.

    Thin wrapper around ``add_relationship`` that always uses the
    ``IMAGES`` relationship type.
    """
    return self.add_relationship(target, IMAGES)
def serialize(self):
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
relationships = E.Relationships()
@ -113,8 +114,13 @@ class DOCX(object):
}.iteritems():
added.add(ext)
types.append(E.Default(Extension=ext, ContentType=mt))
# TODO: Iterate over all resources and add mimetypes for any that are
# not already added
for fname in self.images:
ext = fname.rpartition(os.extsep)[-1]
if ext not in added:
added.add(ext)
mt = guess_type('a.' + ext)[0]
if mt:
types.append(E.Default(Extension=ext, ContentType=mt))
return xml2str(types)
@property
@ -176,6 +182,8 @@ class DOCX(object):
zf.writestr('word/document.xml', xml2str(self.document))
zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
for fname, data_getter in self.images.iteritems():
zf.writestr(fname, data_getter())
if __name__ == '__main__':
d = DOCX(None, None)

View File

@ -13,6 +13,7 @@ from lxml.builder import ElementMaker
from calibre.ebooks.docx.names import namespaces
from calibre.ebooks.docx.writer.styles import w, StylesManager
from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename
from calibre.ebooks.pdf.render.common import PAPER_SIZES
@ -151,18 +152,26 @@ class Convert(object):
def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
SVGRasterizer()(self.oeb, self.opts)
self.svg_rasterizer = SVGRasterizer()
self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager()
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
for item in self.oeb.spine:
self.process_item(item)
try:
for item in self.oeb.spine:
self.process_item(item)
self.styles_manager.finalize(self.blocks)
self.write()
self.styles_manager.finalize(self.blocks)
self.write()
finally:
self.images_manager.cleanup()
def process_item(self, item):
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
stylizer = self.svg_rasterizer.stylizer_cache.get(item)
if stylizer is None:
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
self.abshref = self.images_manager.abshref = item.abshref
is_first_block = True
for body in XPath('//h:body')(item.data):
@ -177,21 +186,24 @@ class Convert(object):
block_style = stylizer.style(html_block)
if block_style.is_hidden:
return
if html_block.text:
docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)
if html_block.tag.endswith('}img'):
b = Block(self.styles_manager, html_block, stylizer.style(html_block))
self.blocks.append(b)
self.images_manager.add_image(html_block, b, stylizer)
else:
if html_block.text:
docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)
for child in html_block.iterchildren(etree.Element):
tag = barename(child.tag)
style = stylizer.style(child)
display = style._get('display')
if tag == 'img':
pass # TODO: Handle images
if display == 'block' and tag != 'br':
b = Block(self.styles_manager, child, style)
self.blocks.append(b)
self.process_block(child, b, stylizer)
else:
self.process_inline(child, self.blocks[-1], stylizer)
for child in html_block.iterchildren(etree.Element):
tag = barename(child.tag)
style = stylizer.style(child)
display = style._get('display')
if display == 'block' and tag != 'br':
b = Block(self.styles_manager, child, style)
self.blocks.append(b)
self.process_block(child, b, stylizer)
else:
self.process_inline(child, self.blocks[-1], stylizer)
if ignore_tail is False and html_block.tail and html_block.tail.strip():
b = docx_block
@ -211,7 +223,7 @@ class Convert(object):
if html_child.tail or html_child is not html_child.getparent()[-1]:
docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none'))
elif tag == 'img':
return # TODO: Handle images
self.images_manager.add_image(html_child, docx_block, stylizer)
else:
if html_child.text:
docx_block.add_text(html_child.text, style, html_parent=html_child)
@ -249,7 +261,7 @@ class Convert(object):
E.docGrid(**{w('linePitch'):"360"}),
))
dn = {k:v for k, v in namespaces.iteritems() if k in 'wr'}
dn = {k:v for k, v in namespaces.iteritems() if k in tuple('wra') + ('wp',)}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
self.docx.styles = E.styles(
E.docDefaults(
@ -268,4 +280,6 @@ class Convert(object):
)
)
)
self.docx.images = {}
self.styles_manager.serialize(self.docx.styles)
self.images_manager.serialize(self.docx.images)

View File

@ -0,0 +1,78 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import os
import shutil, posixpath
from collections import namedtuple
from functools import partial
from calibre.ebooks.oeb.base import urlunquote
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.filenames import ascii_filename
from calibre.utils.magick.draw import identify_data
# Record stored for every distinct image added by ImagesManager.add_image:
#   rid    - relationship id returned by document_relationships.add_image()
#   fname  - name of the image inside the package ('media/<name>.<fmt>')
#   width, height, fmt - values returned by identify_data() for the image data
#   item   - the OEB manifest item the image data is read from
Image = namedtuple('Image', 'rid fname width height fmt item')
class ImagesManager(object):
    """Track the images referenced by the HTML being converted to DOCX.

    Every distinct image gets a unique file name under ``media/`` and a
    relationship id obtained from ``document_relationships``.  The image
    data itself is not copied; ``serialize`` hands out callables that read
    it from the manifest item on demand.

    NOTE: ``add_image`` resolves ``src`` attributes through ``self.abshref``,
    which is *not* set in ``__init__``.  The caller must assign it (a
    callable mapping a relative URL to an absolute href) before calling
    ``add_image``.
    """

    def __init__(self, oeb, document_relationships):
        """Store the book and the relationships registry.

        :param oeb: the OEB book; its ``log`` and ``manifest`` are used.
        :param document_relationships: object providing
            ``add_image(target)`` which returns a relationship id.
        """
        self.oeb, self.log = oeb, oeb.log
        # Absolute href -> Image record, one entry per distinct image.
        self.images = {}
        # Lower-cased base names (without extension) already handed out by
        # create_filename(), used to keep names unique case-insensitively.
        self.seen_filenames = set()
        self.document_relationships = document_relationships
        # Created lazily by the ``tdir`` property.
        self._tdir = None

    @property
    def tdir(self):
        """Temporary working directory, created on first access."""
        if self._tdir is None:
            self._tdir = PersistentTemporaryDirectory(suffix='_docx_output_images')
        return self._tdir

    def cleanup(self):
        """Delete the temporary directory, if one was ever created."""
        if self._tdir is not None:
            shutil.rmtree(self._tdir)
            self._tdir = None

    def add_image(self, img, block, stylizer):
        """Register the image referenced by the ``img`` element.

        :param img: element whose ``src`` attribute names the image.
        :param block: currently unused.
        :param stylizer: currently unused.
        :return: the relationship id of the image, or ``None`` when the
            element has no ``src``, the href is not in the manifest, or the
            manifest item's data is not ``bytes``.
        """
        src = img.get('src')
        if not src:
            return
        href = self.abshref(src)
        if href not in self.images:
            item = self.oeb.manifest.hrefs.get(href)
            if item is None or not isinstance(item.data, bytes):
                return
            width, height, fmt = identify_data(item.data)
            image_fname = 'media/' + self.create_filename(href, fmt)
            image_rid = self.document_relationships.add_image(image_fname)
            self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
            # NOTE(review): presumably releases the in-memory copy of the
            # image data until it is needed again -- confirm against the
            # manifest item implementation.
            item.unload_data_from_memory()
        return self.images[href].rid

    def create_filename(self, href, fmt):
        """Return a unique ``<name>.<fmt>`` file name for ``href``.

        The name is derived from the basename of ``href`` with its
        extension removed and truncated to 75 characters; ``'image'`` is
        used when nothing is left.  A numeric suffix is appended until the
        lower-cased name has not been used before.
        """
        fname = ascii_filename(urlunquote(posixpath.basename(href)))
        fname = posixpath.splitext(fname)[0]
        fname = fname[:75].rstrip('.') or 'image'
        num = 0
        base = fname
        while fname.lower() in self.seen_filenames:
            num += 1
            fname = base + str(num)
        self.seen_filenames.add(fname.lower())
        fname += os.extsep + fmt.lower()
        return fname

    def serialize(self, images_map):
        """Fill ``images_map`` with ``'word/<fname>' -> data getter`` entries.

        Each value is a zero-argument callable returning the image data, so
        the data is only read when the caller invokes it.
        """
        # values() instead of the Python-2-only itervalues(): same result
        # here, and it also works on Python 3.
        for img in self.images.values():
            images_map['word/' + img.fname] = partial(self.get_data, img.item)

    def get_data(self, item):
        """Return ``item.data`` and then unload it from the item.

        NOTE(review): the meaning of the ``False`` argument is defined by
        the manifest item class, which is not visible here -- confirm.
        """
        try:
            return item.data
        finally:
            item.unload_data_from_memory(False)