The plumbing for images in DOCX Output

This commit is contained in:
Kovid Goyal 2015-03-24 15:44:03 +05:30
parent 2d768e9f4e
commit 8c827eefc6
3 changed files with 129 additions and 29 deletions

View File

@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap import textwrap, os
from io import BytesIO from io import BytesIO
from lxml import etree from lxml import etree
@ -14,7 +14,7 @@ from lxml.builder import ElementMaker
from calibre import guess_type from calibre import guess_type
from calibre.constants import numeric_version, __appname__ from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS, IMAGES
from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.metadata.opf2 import OPF as ReadOPF from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
from calibre.ebooks.oeb.base import OPF, OPF2_NS from calibre.ebooks.oeb.base import OPF, OPF2_NS
@ -51,7 +51,6 @@ class DocumentRelationships(object):
def __init__(self): def __init__(self):
self.rmap = {} self.rmap = {}
self.counter = 0
for typ, target in { for typ, target in {
STYLES: 'styles.xml', STYLES: 'styles.xml',
WEB_SETTINGS: 'webSettings.xml', WEB_SETTINGS: 'webSettings.xml',
@ -64,11 +63,13 @@ class DocumentRelationships(object):
def add_relationship(self, target, rtype, target_mode=None): def add_relationship(self, target, rtype, target_mode=None):
ans = self.get_relationship_id(target, rtype, target_mode) ans = self.get_relationship_id(target, rtype, target_mode)
if ans is None: if ans is None:
self.counter += 1 ans = 'rId%d' % (len(self.rmap) + 1)
ans = 'rId%d' % self.counter
self.rmap[(target, rtype, target_mode)] = ans self.rmap[(target, rtype, target_mode)] = ans
return ans return ans
def add_image(self, target):
return self.add_relationship(target, IMAGES)
def serialize(self): def serialize(self):
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
relationships = E.Relationships() relationships = E.Relationships()
@ -113,8 +114,13 @@ class DOCX(object):
}.iteritems(): }.iteritems():
added.add(ext) added.add(ext)
types.append(E.Default(Extension=ext, ContentType=mt)) types.append(E.Default(Extension=ext, ContentType=mt))
# TODO: Iterate over all resources and add mimetypes for any that are for fname in self.images:
# not already added ext = fname.rpartition(os.extsep)[-1]
if ext not in added:
added.add(ext)
mt = guess_type('a.' + ext)[0]
if mt:
types.append(E.Default(Extension=ext, ContentType=mt))
return xml2str(types) return xml2str(types)
@property @property
@ -176,6 +182,8 @@ class DOCX(object):
zf.writestr('word/document.xml', xml2str(self.document)) zf.writestr('word/document.xml', xml2str(self.document))
zf.writestr('word/styles.xml', xml2str(self.styles)) zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize()) zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
for fname, data_getter in self.images.iteritems():
zf.writestr(fname, data_getter())
if __name__ == '__main__': if __name__ == '__main__':
d = DOCX(None, None) d = DOCX(None, None)

View File

@ -13,6 +13,7 @@ from lxml.builder import ElementMaker
from calibre.ebooks.docx.names import namespaces from calibre.ebooks.docx.names import namespaces
from calibre.ebooks.docx.writer.styles import w, StylesManager from calibre.ebooks.docx.writer.styles import w, StylesManager
from calibre.ebooks.docx.writer.images import ImagesManager
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename from calibre.ebooks.oeb.base import XPath, barename
from calibre.ebooks.pdf.render.common import PAPER_SIZES from calibre.ebooks.pdf.render.common import PAPER_SIZES
@ -151,18 +152,26 @@ class Convert(object):
def __call__(self): def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
SVGRasterizer()(self.oeb, self.opts) self.svg_rasterizer = SVGRasterizer()
self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager() self.styles_manager = StylesManager()
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
for item in self.oeb.spine: try:
self.process_item(item) for item in self.oeb.spine:
self.process_item(item)
self.styles_manager.finalize(self.blocks) self.styles_manager.finalize(self.blocks)
self.write() self.write()
finally:
self.images_manager.cleanup()
def process_item(self, item): def process_item(self, item):
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile) stylizer = self.svg_rasterizer.stylizer_cache.get(item)
if stylizer is None:
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
self.abshref = self.images_manager.abshref = item.abshref
is_first_block = True is_first_block = True
for body in XPath('//h:body')(item.data): for body in XPath('//h:body')(item.data):
@ -177,21 +186,24 @@ class Convert(object):
block_style = stylizer.style(html_block) block_style = stylizer.style(html_block)
if block_style.is_hidden: if block_style.is_hidden:
return return
if html_block.text: if html_block.tag.endswith('}img'):
docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True) b = Block(self.styles_manager, html_block, stylizer.style(html_block))
self.blocks.append(b)
self.images_manager.add_image(html_block, b, stylizer)
else:
if html_block.text:
docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True)
for child in html_block.iterchildren(etree.Element): for child in html_block.iterchildren(etree.Element):
tag = barename(child.tag) tag = barename(child.tag)
style = stylizer.style(child) style = stylizer.style(child)
display = style._get('display') display = style._get('display')
if tag == 'img': if display == 'block' and tag != 'br':
pass # TODO: Handle images b = Block(self.styles_manager, child, style)
if display == 'block' and tag != 'br': self.blocks.append(b)
b = Block(self.styles_manager, child, style) self.process_block(child, b, stylizer)
self.blocks.append(b) else:
self.process_block(child, b, stylizer) self.process_inline(child, self.blocks[-1], stylizer)
else:
self.process_inline(child, self.blocks[-1], stylizer)
if ignore_tail is False and html_block.tail and html_block.tail.strip(): if ignore_tail is False and html_block.tail and html_block.tail.strip():
b = docx_block b = docx_block
@ -211,7 +223,7 @@ class Convert(object):
if html_child.tail or html_child is not html_child.getparent()[-1]: if html_child.tail or html_child is not html_child.getparent()[-1]:
docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none')) docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none'))
elif tag == 'img': elif tag == 'img':
return # TODO: Handle images self.images_manager.add_image(html_child, docx_block, stylizer)
else: else:
if html_child.text: if html_child.text:
docx_block.add_text(html_child.text, style, html_parent=html_child) docx_block.add_text(html_child.text, style, html_parent=html_child)
@ -249,7 +261,7 @@ class Convert(object):
E.docGrid(**{w('linePitch'):"360"}), E.docGrid(**{w('linePitch'):"360"}),
)) ))
dn = {k:v for k, v in namespaces.iteritems() if k in 'wr'} dn = {k:v for k, v in namespaces.iteritems() if k in tuple('wra') + ('wp',)}
E = ElementMaker(namespace=dn['w'], nsmap=dn) E = ElementMaker(namespace=dn['w'], nsmap=dn)
self.docx.styles = E.styles( self.docx.styles = E.styles(
E.docDefaults( E.docDefaults(
@ -268,4 +280,6 @@ class Convert(object):
) )
) )
) )
self.docx.images = {}
self.styles_manager.serialize(self.docx.styles) self.styles_manager.serialize(self.docx.styles)
self.images_manager.serialize(self.docx.images)

View File

@ -0,0 +1,78 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
import os
import shutil, posixpath
from collections import namedtuple
from functools import partial
from calibre.ebooks.oeb.base import urlunquote
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.filenames import ascii_filename
from calibre.utils.magick.draw import identify_data
# Record kept for every image placed in the DOCX package:
#   rid    - relationship id returned by document_relationships.add_image()
#   fname  - name of the image inside the package, e.g. 'media/cover.png'
#   width, height, fmt - values reported by identify_data() for the image
#   item   - the OEB manifest item the image data is read from
Image = namedtuple('Image', 'rid fname width height fmt item'.split())
class ImagesManager(object):
    """Track the images referenced by the HTML being converted to DOCX.

    Every distinct image href is registered once: it is given a unique file
    name under ``media/``, a relationship id in the document relationships
    and an ``Image`` record in ``self.images``.

    The ``abshref`` attribute is not set by ``__init__``; the caller must
    assign a callable that resolves an ``src`` value to a manifest href
    before calling :meth:`add_image`.
    """

    def __init__(self, oeb, document_relationships):
        """Store the OEB book, its log and the relationships registry.

        :param oeb: The OEB book; ``oeb.log`` and ``oeb.manifest.hrefs`` are used.
        :param document_relationships: Object whose ``add_image(fname)``
            returns the relationship id for an image file name.
        """
        self.oeb, self.log = oeb, oeb.log
        self.images = {}  # href -> Image
        self.seen_filenames = set()  # lower-cased base names already handed out
        self.document_relationships = document_relationships
        self._tdir = None  # created lazily by the tdir property

    @property
    def tdir(self):
        """Path of a temporary directory, created on first access."""
        if self._tdir is None:
            self._tdir = PersistentTemporaryDirectory(suffix='_docx_output_images')
        return self._tdir

    def cleanup(self):
        """Remove the temporary directory, if one was ever created.

        The reference is cleared before the directory is removed, so that a
        failed removal does not leave this object pointing at a partially
        deleted directory and repeated calls stay harmless. Errors raised by
        ``shutil.rmtree`` still propagate to the caller.
        """
        tdir, self._tdir = self._tdir, None
        if tdir is not None:
            shutil.rmtree(tdir)

    def add_image(self, img, block, stylizer):
        """Register the image referenced by an ``<img>`` element.

        :param img: Element whose ``src`` attribute names the image.
        :param block: Not used by this method yet.
        :param stylizer: Not used by this method yet.
        :return: The relationship id of the image, or ``None`` when the
            element has no ``src``, the href is not in the manifest, the
            manifest data is not ``bytes`` or the image cannot be identified.
        """
        src = img.get('src')
        if not src:
            return
        href = self.abshref(src)
        if href not in self.images:
            item = self.oeb.manifest.hrefs.get(href)
            if item is None or not isinstance(item.data, bytes):
                return
            try:
                width, height, fmt = identify_data(item.data)
            except Exception:
                # A single corrupt or unrecognised image should not abort the
                # whole conversion: report it and leave the image out.
                self.log.warning('Ignoring image that could not be identified: %s' % href)
                return
            image_fname = 'media/' + self.create_filename(href, fmt)
            image_rid = self.document_relationships.add_image(image_fname)
            self.images[href] = Image(image_rid, image_fname, width, height, fmt, item)
            # The data is fetched again through get_data() when it is written
            item.unload_data_from_memory()
        return self.images[href].rid

    def create_filename(self, href, fmt):
        """Return a unique file name for the image at ``href``.

        The name is built from the ASCII form of the href's base name without
        its extension, limited to 75 characters, with ``'image'`` used when
        nothing is left. A number is appended until the lower-cased name has
        not been handed out before, then ``fmt`` in lower case is added as
        the extension.
        """
        fname = ascii_filename(urlunquote(posixpath.basename(href)))
        fname = posixpath.splitext(fname)[0]
        fname = fname[:75].rstrip('.') or 'image'
        num = 0
        base = fname
        # Compare case-insensitively so names never differ only by case
        while fname.lower() in self.seen_filenames:
            num += 1
            fname = base + str(num)
        self.seen_filenames.add(fname.lower())
        fname += os.extsep + fmt.lower()
        return fname

    def serialize(self, images_map):
        """Fill ``images_map`` with ``'word/<fname>' -> data getter`` entries.

        Each value is a zero-argument callable returning the image data, so
        the data is only read when the caller invokes it.
        """
        for img in self.images.itervalues():
            images_map['word/' + img.fname] = partial(self.get_data, img.item)

    def get_data(self, item):
        """Return ``item.data`` and then ask the item to unload its data."""
        try:
            return item.data
        finally:
            item.unload_data_from_memory(False)