DOCX Output: Add support for SVG images. Now the generated DOCX will contain both the rasterized version of the SVG image and the original SVG image, which is supported by modern versions of Word.

This commit is contained in:
Kovid Goyal 2023-06-09 14:48:42 +05:30
parent 4c4cfb843c
commit 41f7a01e35
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 105 additions and 49 deletions

View File

@ -46,6 +46,7 @@ TRANSITIONAL_NAMESPACES = {
'xml': 'http://www.w3.org/XML/1998/namespace', 'xml': 'http://www.w3.org/XML/1998/namespace',
# Drawing # Drawing
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
'a14': 'http://schemas.microsoft.com/office/drawing/2010/main',
'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math', 'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
'mv': 'urn:schemas-microsoft-com:mac:vml', 'mv': 'urn:schemas-microsoft-com:mac:vml',
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture', 'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
@ -76,6 +77,7 @@ STRICT_NAMESPACES = {
for k, v in iteritems(TRANSITIONAL_NAMESPACES) for k, v in iteritems(TRANSITIONAL_NAMESPACES)
} }
SVG_BLIP_URI = '{96DAC541-7B7A-43D3-8B79-37D633B846F1}' SVG_BLIP_URI = '{96DAC541-7B7A-43D3-8B79-37D633B846F1}'
USE_LOCAL_DPI_URI = '{28A0092B-C50C-407E-A947-70E740481C1C}'
# }}} # }}}

View File

@ -440,7 +440,7 @@ class Convert:
self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language) self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log) self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts) self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts, self.svg_rasterizer)
self.lists_manager = ListsManager(self.docx) self.lists_manager = ListsManager(self.docx)
self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts) self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts)
self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager) self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager)
@ -481,9 +481,7 @@ class Convert:
def process_item(self, item): def process_item(self, item):
self.current_item = item self.current_item = item
stylizer = self.svg_rasterizer.stylizer_cache.get(item) stylizer = self.svg_rasterizer.stylizer(item)
if stylizer is None:
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css)
self.abshref = self.images_manager.abshref = item.abshref self.abshref = self.images_manager.abshref = item.abshref
self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang

View File

@ -12,11 +12,11 @@ from lxml import etree
from calibre import fit_image from calibre import fit_image
from calibre.ebooks.docx.images import pt_to_emu from calibre.ebooks.docx.images import pt_to_emu
from calibre.ebooks.docx.names import USE_LOCAL_DPI_URI, SVG_BLIP_URI
from calibre.ebooks.oeb.base import urlquote, urlunquote from calibre.ebooks.oeb.base import urlquote, urlunquote
from calibre.utils.filenames import ascii_filename from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import identify from calibre.utils.imghdr import identify
from calibre.utils.resources import get_image_path as I from calibre.utils.resources import get_image_path as I
from polyglot.builtins import iteritems, itervalues
Image = namedtuple('Image', 'rid fname width height fmt item') Image = namedtuple('Image', 'rid fname width height fmt item')
@ -39,13 +39,26 @@ def get_image_margins(style):
class ImagesManager: class ImagesManager:
def __init__(self, oeb, document_relationships, opts): def __init__(self, oeb, document_relationships, opts, svg_rasterizer):
self.oeb, self.log = oeb, oeb.log self.oeb, self.log = oeb, oeb.log
self.svg_rasterizer = svg_rasterizer
self.page_width, self.page_height = opts.output_profile.width_pts, opts.output_profile.height_pts self.page_width, self.page_height = opts.output_profile.width_pts, opts.output_profile.height_pts
self.images = {} self.images = {}
self.seen_filenames = set() self.seen_filenames = set()
self.document_relationships = document_relationships self.document_relationships = document_relationships
self.count = 0 self.count = 0
self.svg_images = {}
def read_svg(self, href):
    """Return the Image record for the original SVG at ``href``.

    The record is created on first request: the manifest is searched for
    ``href`` (as given, then URL-quoted), a ``media/`` filename and a
    document relationship id are allocated, and the result is cached in
    ``self.svg_images``. Width and height are stored as -1.

    Returns None, after logging a warning, when no manifest item matches.
    """
    try:
        return self.svg_images[href]
    except KeyError:
        pass
    manifest_hrefs = self.oeb.manifest.hrefs
    item = manifest_hrefs.get(href) or manifest_hrefs.get(urlquote(href))
    if item is None:
        self.log.warning('Failed to find image:', href)
        return None
    fname = 'media/' + self.create_filename(href, 'svg')
    rid = self.document_relationships.add_image(fname)
    record = Image(rid, fname, -1, -1, 'svg', item)
    self.svg_images[href] = record
    return record
def read_image(self, href): def read_image(self, href):
if href not in self.images: if href not in self.images:
@ -84,6 +97,12 @@ class ImagesManager:
def create_image_markup(self, html_img, stylizer, href, as_block=False): def create_image_markup(self, html_img, stylizer, href, as_block=False):
# TODO: img inside a link (clickable image) # TODO: img inside a link (clickable image)
svg_rid = ''
svghref = self.svg_rasterizer.svg_originals.get(href)
if svghref:
si = self.read_svg(svghref)
if si:
svg_rid = si.rid
style = stylizer.style(html_img) style = stylizer.style(html_img)
floating = style['float'] floating = style['float']
if floating not in {'left', 'right'}: if floating not in {'left', 'right'}:
@ -134,7 +153,7 @@ class ImagesManager:
if fake_margins: if fake_margins:
# DOCX does not support setting margins for inline images, so we # DOCX does not support setting margins for inline images, so we
# fake it by using effect extents to simulate margins # fake it by using effect extents to simulate margins
makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in iteritems(get_image_margins(style))}) makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in get_image_margins(style).items()})
else: else:
makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0') makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0')
if floating is not None: if floating is not None:
@ -143,10 +162,10 @@ class ImagesManager:
makeelement(parent, 'wp:wrapTopAndBottom') makeelement(parent, 'wp:wrapTopAndBottom')
else: else:
makeelement(parent, 'wp:wrapSquare', wrapText='bothSides') makeelement(parent, 'wp:wrapSquare', wrapText='bothSides')
self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height) self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height, svg_rid=svg_rid)
return ans return ans
def create_docx_image_markup(self, parent, name, alt, img_rid, width, height): def create_docx_image_markup(self, parent, name, alt, img_rid, width, height, svg_rid=''):
makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces
makeelement(parent, 'wp:docPr', id=str(self.count), name=name, descr=alt) makeelement(parent, 'wp:docPr', id=str(self.count), name=name, descr=alt)
makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1") makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1")
@ -157,7 +176,11 @@ class ImagesManager:
makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=alt) makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=alt)
makeelement(nvPicPr, 'pic:cNvPicPr') makeelement(nvPicPr, 'pic:cNvPicPr')
bf = makeelement(pic, 'pic:blipFill') bf = makeelement(pic, 'pic:blipFill')
makeelement(bf, 'a:blip', r_embed=img_rid) blip = makeelement(bf, 'a:blip', r_embed=img_rid)
if svg_rid:
ext_list = makeelement(blip, 'a:extLst')
makeelement(makeelement(ext_list, 'a:ext', uri=USE_LOCAL_DPI_URI), 'a14:useLocalDpi', val='0')
makeelement(makeelement(ext_list, 'a:ext', uri=SVG_BLIP_URI), 'asvg:svgBlip', r_embed=svg_rid)
makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect') makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect')
spPr = makeelement(pic, 'pic:spPr') spPr = makeelement(pic, 'pic:spPr')
xfrm = makeelement(spPr, 'a:xfrm') xfrm = makeelement(spPr, 'a:xfrm')
@ -178,8 +201,10 @@ class ImagesManager:
return fname return fname
def serialize(self, images_map):
    """Register a data-producing callable for every image in ``images_map``.

    Keys are ``'word/' + img.fname``. Raster images are served through
    ``self.get_data``; original SVG images are served from the manifest
    item's ``data_as_bytes_or_none``.
    """
    for img in self.images.values():
        images_map['word/' + img.fname] = partial(self.get_data, img.item)
    for img in self.svg_images.values():
        # Bind the item as a default argument: a plain ``lambda: img.item...``
        # closes over the loop variable, so every callable would return the
        # data of the *last* SVG image once the loop has finished.
        images_map['word/' + img.fname] = lambda item=img.item: item.data_as_bytes_or_none
def get_data(self, item): def get_data(self, item):
try: try:

View File

@ -15,6 +15,7 @@ from collections import defaultdict
from itertools import count from itertools import count
from lxml import etree, html from lxml import etree, html
from operator import attrgetter from operator import attrgetter
from typing import Optional
from calibre import as_unicode, force_unicode, get_types_map, isbytestring from calibre import as_unicode, force_unicode, get_types_map, isbytestring
from calibre.constants import __version__, filesystem_encoding from calibre.constants import __version__, filesystem_encoding
@ -1017,6 +1018,12 @@ class Manifest:
# }}} # }}}
@property
def data_as_bytes_or_none(self) -> Optional[bytes]:
    """Result of calling the item's loader, or None when no loader is set.

    The loader is called with ``html_input_href`` when the item has that
    attribute, otherwise with ``href``.
    """
    loader = self._loader
    if loader is None:
        return None
    target = getattr(self, 'html_input_href', self.href)
    return loader(target)
@property @property
def data(self): def data(self):
"""Provides MIME type sensitive access to the manifest """Provides MIME type sensitive access to the manifest
@ -1033,10 +1040,7 @@ class Manifest:
""" """
data = self._data data = self._data
if data is None: if data is None:
if self._loader is None: data = self.data_as_bytes_or_none
return None
data = self._loader(getattr(self, 'html_input_href',
self.href))
try: try:
mt = self.media_type.lower() mt = self.media_type.lower()
except Exception: except Exception:

View File

@ -5,32 +5,44 @@ SVG rasterization transform.
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import os, re import os
import re
from base64 import standard_b64encode
from functools import lru_cache
from lxml import etree
from qt.core import ( from qt.core import (
Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer) QBuffer, QByteArray, QColor, QImage, QIODevice, QPainter, QSvgRenderer, Qt,
from calibre.ebooks.oeb.base import XHTML, XLINK )
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath from calibre import guess_type
from calibre.ebooks.oeb.base import urlnormalize from calibre.ebooks.oeb.base import (
PNG_MIME, SVG_MIME, XHTML, XLINK, urlnormalize, xml2str, xpath,
)
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.imghdr import what from calibre.utils.imghdr import what
from polyglot.urllib import urldefrag from polyglot.urllib import urldefrag
IMAGE_TAGS = {XHTML('img'), XHTML('object')} IMAGE_TAGS = {XHTML('img'), XHTML('object')}
KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'} KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'}
TEST_SVG = b'''
<svg xmlns="http://www.w3.org/2000/svg" width="18" height="18" viewBox="0 0 18 18"> def test_svg(): # {{{
TEST_PNG_DATA_URI='data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAMAAABEpIrGAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAWJQTFRFAAAAAAAAAAAAAAAAAAAAAQEAAgIBAwIBBgQCBwUCCAYDCggECwkEDgsFDwwFEA0GHRcKHxkLIBkLIxwMJR0NJx8OKCAOKCAPKSAPMScSPTAWQTQXQzUYSjsaSjsbSzsbUD8dUUAdVEMeWkggW0ggW0ghW0khXUohYk4ja1Umb1gocVoocVopclspc1spdV0qd18reF8riW0xjXEzl3g2mns3nn04nn45n345oIA6ooE7o4I7pII7pIM7pYQ7p4U8qYY8rYo+s45Bxp1Hx55Hy6FJy6JJzaRJz6RKz6RLz6VL0qdL1KpM1apM1qtN16xN2KxN2K1O2a1O2q1O265P3K9P3bBQ3rBP37FP37FQ37JQ4rNR47VR5LVR7LxV7bxV7r1V7r5W8L9W8MBW8b9V8b9W8cBW8cBX8sBW8sBX8sFW8sFX88BX88FW88FX88FY88JX88JY9MFX9MJX9MJY9MNYSw0rOAAAAAR0Uk5T2+rr8giKtGMAAAFDSURBVDjLhdNFUwNBEIbhJWkkuLu7u5PgHtwWl0CGnW34aJLl/3OgUlRlGfKepqafmstUW1Yw8E9By6IMWVn/z7OsQOpYNrE0H4lEwuFwZHmyLnUb+AUzIiLMItDgrWIfKH3mnz4RA6PX/8Im8xuEgVfxxG33g+rVi9OT46OdPQ0kDgv8gCg3FMrLphkNyCD9BYiIqEErraP5ZrDGDrw2MoIhsPACGUH5g2gVqzWDKQ/gETKCZmHwbo4ZbHhJ1q1kBMMJCKbJCCof35V+qjCDOUCrMTKCFkc8vU5GENpW8NwmMxhVccYsGUHVvWKOFhlBySJicV6u7+7s6Ozq6anxgT44Lwy4jlKK4br96WDl09GA/gA4zp7gLh2MM3MS+EgCGl+iD9JB4cDZzbV9ZV/atn1+frvfaPhuX4HMq0cZsjKt/zfXXmDab9zjGwAAAABJRU5ErkJggg=='
return f'''
<svg xmlns="http://www.w3.org/2000/svg" width="64" height="64" viewBox="0 0 64 64">
<path d="M4.5 11H3v4h4v-1.5H4.5V11zM3 7h1.5V4.5H7V3H3v4zm10.5 6.5H11V15h4v-4h-1.5v2.5zM11 3v1.5h2.5V7H15V3h-4z"/> <path d="M4.5 11H3v4h4v-1.5H4.5V11zM3 7h1.5V4.5H7V3H3v4zm10.5 6.5H11V15h4v-4h-1.5v2.5zM11 3v1.5h2.5V7H15V3h-4z"/>
</svg>''' <image width="32" height="32" x="32" y="32" xlink:href="{TEST_PNG_DATA_URI}"/>
</svg>'''.encode()
# }}}
class Unavailable(Exception): class Unavailable(Exception):
pass pass
def rasterize_svg(data=TEST_SVG, sizes=(), width=0, height=0, print=None, fmt='PNG', as_qimage=False): def rasterize_svg(data=None, sizes=(), width=0, height=0, print=None, fmt='PNG', as_qimage=False):
if data is None:
data = test_svg()
svg = QSvgRenderer(QByteArray(data)) svg = QSvgRenderer(QByteArray(data))
size = svg.defaultSize() size = svg.defaultSize()
if size.width() == 100 and size.height() == 100 and sizes: if size.width() == 100 and size.height() == 100 and sizes:
@ -54,10 +66,16 @@ def rasterize_svg(data=TEST_SVG, sizes=(), width=0, height=0, print=None, fmt='P
return array.data() return array.data()
@lru_cache(maxsize=128)
def data_url(mime_type: str, data: bytes) -> str:
    """Build a base64 ``data:`` URL for ``data`` with the given MIME type.

    Results are memoized (up to 128 distinct argument pairs).
    """
    payload = standard_b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
class SVGRasterizer: class SVGRasterizer:
def __init__(self, base_css=''): def __init__(self, base_css='', save_svg_originals=False):
self.base_css = base_css self.base_css = base_css
self.save_svg_originals = save_svg_originals
from calibre.gui2 import must_use_qt from calibre.gui2 import must_use_qt
must_use_qt() must_use_qt()
@ -71,20 +89,15 @@ class SVGRasterizer:
def __call__(self, oeb, context): def __call__(self, oeb, context):
oeb.logger.info('Rasterizing SVG images...') oeb.logger.info('Rasterizing SVG images...')
self.temp_files = []
self.stylizer_cache = {} self.stylizer_cache = {}
self.oeb = oeb self.oeb = oeb
self.opts = context self.opts = context
self.profile = context.dest self.profile = context.dest
self.images = {} self.images = {}
self.dataize_manifest() self.svg_originals = {}
self.scan_for_linked_resources_in_manifest()
self.rasterize_spine() self.rasterize_spine()
self.rasterize_cover() self.rasterize_cover()
for pt in self.temp_files:
try:
os.remove(pt)
except:
pass
def rasterize_svg(self, elem, width=0, height=0, format='PNG'): def rasterize_svg(self, elem, width=0, height=0, format='PNG'):
view_box = elem.get('viewBox', elem.get('viewbox', None)) view_box = elem.get('viewBox', elem.get('viewbox', None))
@ -110,38 +123,41 @@ class SVGRasterizer:
return rasterize_svg(xml2str(elem, with_tail=False), sizes=sizes, width=width, height=height, print=logger.info, fmt=format) return rasterize_svg(xml2str(elem, with_tail=False), sizes=sizes, width=width, height=height, print=logger.info, fmt=format)
def scan_for_linked_resources_in_manifest(self):
    """Run the linked-resource scan on every SVG manifest item that has data."""
    for entry in self.oeb.manifest.values():
        if entry.media_type != SVG_MIME:
            continue
        if entry.data is None:
            continue
        self.scan_for_linked_resources_in_svg(entry)
def scan_for_linked_resources_in_svg(self, item, svg=None):
    """Inline manifest images referenced via ``xlink:href`` as data URLs.

    ``svg`` defaults to ``item.data``. For each SVG element carrying an
    ``xlink:href``, the link is resolved relative to ``item``; when it names
    a manifest entry whose bytes are recognised as an image type, the
    attribute is replaced with a ``data:`` URL of those bytes. Links that do
    not resolve, or whose content is not identified as an image, are left
    untouched. Returns the (mutated in place) ``svg`` tree.
    """
    if svg is None:
        svg = item.data
    manifest_hrefs = self.oeb.manifest.hrefs
    href_attr = XLINK('href')
    for node in xpath(svg, '//svg:*[@xl:href]'):
        # Strip any fragment; a fragment-only link points inside this SVG.
        target = urldefrag(urlnormalize(node.get(href_attr)))[0]
        if not target:
            continue
        linked = manifest_hrefs.get(item.abshref(target))
        if linked is None:
            continue
        raw = linked.bytes_representation
        kind = what(None, raw)
        if not kind:
            continue
        mime = guess_type('file.' + kind)[0]
        if mime and mime.startswith('image/'):
            node.set(href_attr, data_url(mime, raw))
    return svg
def stylizer(self, item):
    """Return the Stylizer for ``item``, building and caching it on first use."""
    cache = self.stylizer_cache
    found = cache.get(item)
    if found is not None:
        return found
    found = Stylizer(item.data, item.href, self.oeb, self.opts,
                     self.profile, base_css=self.base_css)
    cache[item] = found
    return found
def rasterize_spine(self): def rasterize_spine(self):
@ -172,13 +188,19 @@ class SVGRasterizer:
height = style['height'] height = style['height']
width = (width / 72) * self.profile.dpi width = (width / 72) * self.profile.dpi
height = (height / 72) * self.profile.dpi height = (height / 72) * self.profile.dpi
elem = self.dataize_svg(item, elem) self.scan_for_linked_resources_in_svg(item, elem)
data = self.rasterize_svg(elem, width, height) data = self.rasterize_svg(elem, width, height)
manifest = self.oeb.manifest manifest = self.oeb.manifest
href = os.path.splitext(item.href)[0] + '.png' href = os.path.splitext(item.href)[0] + '.png'
id, href = manifest.generate(item.id, href) id, href = manifest.generate(item.id, href)
manifest.add(id, href, PNG_MIME, data=data) manifest.add(id, href, PNG_MIME, data=data)
img = elem.makeelement(XHTML('img'), src=item.relhref(href)) img = elem.makeelement(XHTML('img'), src=item.relhref(href))
if self.save_svg_originals:
svg_bytes = etree.tostring(elem, encoding='utf-8', xml_declaration=True, pretty_print=True, with_tail=False)
svg_id, svg_href = manifest.generate(item.id, 'inline.svg')
manifest.add(svg_id, svg_href, SVG_MIME, data=svg_bytes)
self.svg_originals[href] = svg_href
img.tail = elem.tail
elem.getparent().replace(elem, img) elem.getparent().replace(elem, img)
for prop in ('width', 'height'): for prop in ('width', 'height'):
if prop in elem.attrib: if prop in elem.attrib:
@ -215,6 +237,7 @@ class SVGRasterizer:
id, href = manifest.generate(svgitem.id, href) id, href = manifest.generate(svgitem.id, href)
manifest.add(id, href, PNG_MIME, data=data) manifest.add(id, href, PNG_MIME, data=data)
self.images[key] = href self.images[key] = href
self.svg_originals[href] = svgitem.href
elem.tag = XHTML('img') elem.tag = XHTML('img')
for attr in elem.attrib: for attr in elem.attrib:
if attr not in KEEP_ATTRS: if attr not in KEEP_ATTRS:
@ -244,3 +267,7 @@ class SVGRasterizer:
id, href = self.oeb.manifest.generate(cover.id, href) id, href = self.oeb.manifest.generate(cover.id, href)
self.oeb.manifest.add(id, href, PNG_MIME, data=data) self.oeb.manifest.add(id, href, PNG_MIME, data=data)
covers[0].value = id covers[0].value = id
if __name__ == '__main__':
    # Manual smoke test: rasterize the built-in test SVG and write the PNG.
    # The context manager guarantees the file is flushed and closed even if
    # rasterization or the write raises.
    with open('/t/test-svg-rasterization.png', 'wb') as f:
        f.write(rasterize_svg())