diff --git a/src/calibre/ebooks/docx/writer/container.py b/src/calibre/ebooks/docx/writer/container.py index 27a5e0b9ba..de7a802d05 100644 --- a/src/calibre/ebooks/docx/writer/container.py +++ b/src/calibre/ebooks/docx/writer/container.py @@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -import textwrap +import textwrap, os from io import BytesIO from lxml import etree @@ -14,7 +14,7 @@ from lxml.builder import ElementMaker from calibre import guess_type from calibre.constants import numeric_version, __appname__ -from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS +from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS, IMAGES from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata.opf2 import OPF as ReadOPF from calibre.ebooks.oeb.base import OPF, OPF2_NS @@ -51,7 +51,6 @@ class DocumentRelationships(object): def __init__(self): self.rmap = {} - self.counter = 0 for typ, target in { STYLES: 'styles.xml', WEB_SETTINGS: 'webSettings.xml', @@ -64,11 +63,13 @@ class DocumentRelationships(object): def add_relationship(self, target, rtype, target_mode=None): ans = self.get_relationship_id(target, rtype, target_mode) if ans is None: - self.counter += 1 - ans = 'rId%d' % self.counter + ans = 'rId%d' % (len(self.rmap) + 1) self.rmap[(target, rtype, target_mode)] = ans return ans + def add_image(self, target): + return self.add_relationship(target, IMAGES) + def serialize(self): E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) relationships = E.Relationships() @@ -113,8 +114,13 @@ class DOCX(object): }.iteritems(): added.add(ext) types.append(E.Default(Extension=ext, ContentType=mt)) - # TODO: Iterate over all resources and add mimetypes for any that are - # not already added + for fname in self.images: + ext = fname.rpartition(os.extsep)[-1] + if ext not in added: + added.add(ext) + mt = guess_type('a.' + ext)[0] + if mt: + types.append(E.Default(Extension=ext, ContentType=mt)) return xml2str(types) @property @@ -176,6 +182,8 @@ class DOCX(object): zf.writestr('word/document.xml', xml2str(self.document)) zf.writestr('word/styles.xml', xml2str(self.styles)) zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize()) + for fname, data_getter in self.images.iteritems(): + zf.writestr(fname, data_getter()) if __name__ == '__main__': d = DOCX(None, None) diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py index 2acb41a444..3430ebf0ff 100644 --- a/src/calibre/ebooks/docx/writer/from_html.py +++ b/src/calibre/ebooks/docx/writer/from_html.py @@ -13,6 +13,7 @@ from lxml.builder import ElementMaker from calibre.ebooks.docx.names import namespaces from calibre.ebooks.docx.writer.styles import w, StylesManager +from calibre.ebooks.docx.writer.images import ImagesManager from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from calibre.ebooks.oeb.base import XPath, barename from calibre.ebooks.pdf.render.common import PAPER_SIZES @@ -151,18 +152,26 @@ class Convert(object): def __call__(self): from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer - SVGRasterizer()(self.oeb, self.opts) + self.svg_rasterizer = SVGRasterizer() + self.svg_rasterizer(self.oeb, self.opts) self.styles_manager = StylesManager() + self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships) - for item in self.oeb.spine: - self.process_item(item) + try: + for item in self.oeb.spine: + self.process_item(item) - self.styles_manager.finalize(self.blocks) - self.write() + self.styles_manager.finalize(self.blocks) + self.write() + finally: + self.images_manager.cleanup() def process_item(self, item): - stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile) + stylizer = self.svg_rasterizer.stylizer_cache.get(item) + if stylizer is None: + stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile) + self.abshref = self.images_manager.abshref = item.abshref is_first_block = True for body in XPath('//h:body')(item.data): @@ -177,21 +186,24 @@ class Convert(object): block_style = stylizer.style(html_block) if block_style.is_hidden: return - if html_block.text: - docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True) + if html_block.tag.endswith('}img'): + b = Block(self.styles_manager, html_block, stylizer.style(html_block)) + self.blocks.append(b) + self.images_manager.add_image(html_block, b, stylizer) + else: + if html_block.text: + docx_block.add_text(html_block.text, block_style, ignore_leading_whitespace=True, is_parent_style=True) - for child in html_block.iterchildren(etree.Element): - tag = barename(child.tag) - style = stylizer.style(child) - display = style._get('display') - if tag == 'img': - pass # TODO: Handle images - if display == 'block' and tag != 'br': - b = Block(self.styles_manager, child, style) - self.blocks.append(b) - self.process_block(child, b, stylizer) - else: - self.process_inline(child, self.blocks[-1], stylizer) + for child in html_block.iterchildren(etree.Element): + tag = barename(child.tag) + style = stylizer.style(child) + display = style._get('display') + if display == 'block' and tag != 'br': + b = Block(self.styles_manager, child, style) + self.blocks.append(b) + self.process_block(child, b, stylizer) + else: + self.process_inline(child, self.blocks[-1], stylizer) if ignore_tail is False and html_block.tail and html_block.tail.strip(): b = docx_block @@ -211,7 +223,7 @@ class Convert(object): if html_child.tail or html_child is not html_child.getparent()[-1]: docx_block.add_break(clear={'both':'all', 'left':'left', 'right':'right'}.get(style['clear'], 'none')) elif tag == 'img': - return # TODO: Handle images + self.images_manager.add_image(html_child, docx_block, stylizer) else: if html_child.text: docx_block.add_text(html_child.text, style, html_parent=html_child) @@ -249,7 +261,7 @@ class Convert(object): E.docGrid(**{w('linePitch'):"360"}), )) - dn = {k:v for k, v in namespaces.iteritems() if k in 'wr'} + dn = {k:v for k, v in namespaces.iteritems() if k in tuple('wra') + ('wp',)} E = ElementMaker(namespace=dn['w'], nsmap=dn) self.docx.styles = E.styles( E.docDefaults( @@ -268,4 +280,6 @@ class Convert(object): ) ) ) + self.docx.images = {} self.styles_manager.serialize(self.docx.styles) + self.images_manager.serialize(self.docx.images) diff --git a/src/calibre/ebooks/docx/writer/images.py b/src/calibre/ebooks/docx/writer/images.py new file mode 100644 index 0000000000..5ad5ddada1 --- /dev/null +++ b/src/calibre/ebooks/docx/writer/images.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2015, Kovid Goyal ' + +import os +import shutil, posixpath +from collections import namedtuple +from functools import partial + +from calibre.ebooks.oeb.base import urlunquote +from calibre.ptempfile import PersistentTemporaryDirectory +from calibre.utils.filenames import ascii_filename +from calibre.utils.magick.draw import identify_data + +Image = namedtuple('Image', 'rid fname width height fmt item') + +class ImagesManager(object): + + def __init__(self, oeb, document_relationships): + self.oeb, self.log = oeb, oeb.log + self.images = {} + self.seen_filenames = set() + self.document_relationships = document_relationships + self._tdir = None + + @property + def tdir(self): + if self._tdir is None: + self._tdir = PersistentTemporaryDirectory(suffix='_docx_output_images') + return self._tdir + + def cleanup(self): + if self._tdir is not None: + shutil.rmtree(self._tdir) + self._tdir = None + + def add_image(self, img, block, stylizer): + src = img.get('src') + if not src: + return + href = self.abshref(src) + if href not in self.images: + item = self.oeb.manifest.hrefs.get(href) + if item is None or not isinstance(item.data, bytes): + return + width, height, fmt = identify_data(item.data) + image_fname = 'media/' + self.create_filename(href, fmt) + image_rid = self.document_relationships.add_image(image_fname) + self.images[href] = Image(image_rid, image_fname, width, height, fmt, item) + item.unload_data_from_memory() + return self.images[href].rid + + def create_filename(self, href, fmt): + fname = ascii_filename(urlunquote(posixpath.basename(href))) + fname = posixpath.splitext(fname)[0] + fname = fname[:75].rstrip('.') or 'image' + num = 0 + base = fname + while fname.lower() in self.seen_filenames: + num += 1 + fname = base + str(num) + self.seen_filenames.add(fname.lower()) + fname += os.extsep + fmt.lower() + return fname + + def serialize(self, images_map): + for img in self.images.itervalues(): + images_map['word/' + img.fname] = partial(self.get_data, img.item) + + def get_data(self, item): + try: + return item.data + finally: + item.unload_data_from_memory(False)