From 41f7a01e350b92b92c3db3ee9863401dfba58f05 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 9 Jun 2023 14:48:42 +0530 Subject: [PATCH] DOCX Output: Add support for SVG images. Now the generated DOCX will contain both the rasterized version of the SVG image and the original SVG image, which is supported by modern versions of Word. --- src/calibre/ebooks/docx/names.py | 2 + src/calibre/ebooks/docx/writer/from_html.py | 6 +- src/calibre/ebooks/docx/writer/images.py | 39 ++++++-- src/calibre/ebooks/oeb/base.py | 12 ++- .../ebooks/oeb/transforms/rasterize.py | 95 ++++++++++++------- 5 files changed, 105 insertions(+), 49 deletions(-) diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py index 6124eb3941..702177f2b4 100644 --- a/src/calibre/ebooks/docx/names.py +++ b/src/calibre/ebooks/docx/names.py @@ -46,6 +46,7 @@ TRANSITIONAL_NAMESPACES = { 'xml': 'http://www.w3.org/XML/1998/namespace', # Drawing 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'a14': 'http://schemas.microsoft.com/office/drawing/2010/main', 'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math', 'mv': 'urn:schemas-microsoft-com:mac:vml', 'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture', @@ -76,6 +77,7 @@ STRICT_NAMESPACES = { for k, v in iteritems(TRANSITIONAL_NAMESPACES) } SVG_BLIP_URI = '{96DAC541-7B7A-43D3-8B79-37D633B846F1}' +USE_LOCAL_DPI_URI = '{28A0092B-C50C-407E-A947-70E740481C1C}' # }}} diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py index a7ed841b78..7bd5628d8d 100644 --- a/src/calibre/ebooks/docx/writer/from_html.py +++ b/src/calibre/ebooks/docx/writer/from_html.py @@ -440,7 +440,7 @@ class Convert: self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language) self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships, self.log) - self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts) + self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships, self.opts, self.svg_rasterizer) self.lists_manager = ListsManager(self.docx) self.fonts_manager = FontsManager(self.docx.namespace, self.oeb, self.opts) self.blocks = Blocks(self.docx.namespace, self.styles_manager, self.links_manager) @@ -481,9 +481,7 @@ class Convert: def process_item(self, item): self.current_item = item - stylizer = self.svg_rasterizer.stylizer_cache.get(item) - if stylizer is None: - stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, profile=self.opts.output_profile, base_css=self.base_css) + stylizer = self.svg_rasterizer.stylizer(item) self.abshref = self.images_manager.abshref = item.abshref self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang diff --git a/src/calibre/ebooks/docx/writer/images.py b/src/calibre/ebooks/docx/writer/images.py index 18bdf6142a..d29a0d8a41 100644 --- a/src/calibre/ebooks/docx/writer/images.py +++ b/src/calibre/ebooks/docx/writer/images.py @@ -12,11 +12,11 @@ from lxml import etree from calibre import fit_image from calibre.ebooks.docx.images import pt_to_emu +from calibre.ebooks.docx.names import USE_LOCAL_DPI_URI, SVG_BLIP_URI from calibre.ebooks.oeb.base import urlquote, urlunquote from calibre.utils.filenames import ascii_filename from calibre.utils.imghdr import identify from calibre.utils.resources import get_image_path as I -from polyglot.builtins import iteritems, itervalues Image = namedtuple('Image', 'rid fname width height fmt item') @@ -39,13 +39,26 @@ def get_image_margins(style): class ImagesManager: - def __init__(self, oeb, document_relationships, opts): + def __init__(self, oeb, document_relationships, opts, svg_rasterizer): self.oeb, self.log = oeb, oeb.log + self.svg_rasterizer = svg_rasterizer self.page_width, self.page_height = opts.output_profile.width_pts, opts.output_profile.height_pts self.images = {} self.seen_filenames = set() self.document_relationships = document_relationships self.count = 0 + self.svg_images = {} + + def read_svg(self, href): + if href not in self.svg_images: + item = self.oeb.manifest.hrefs.get(href) or self.oeb.manifest.hrefs.get(urlquote(href)) + if item is None: + self.log.warning('Failed to find image:', href) + return + image_fname = 'media/' + self.create_filename(href, 'svg') + image_rid = self.document_relationships.add_image(image_fname) + self.svg_images[href] = Image(image_rid, image_fname, -1, -1, 'svg', item) + return self.svg_images[href] def read_image(self, href): if href not in self.images: @@ -84,6 +97,12 @@ class ImagesManager: def create_image_markup(self, html_img, stylizer, href, as_block=False): # TODO: img inside a link (clickable image) + svg_rid = '' + svghref = self.svg_rasterizer.svg_originals.get(href) + if svghref: + si = self.read_svg(svghref) + if si: + svg_rid = si.rid style = stylizer.style(html_img) floating = style['float'] if floating not in {'left', 'right'}: @@ -134,7 +153,7 @@ class ImagesManager: if fake_margins: # DOCX does not support setting margins for inline images, so we # fake it by using effect extents to simulate margins - makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in iteritems(get_image_margins(style))}) + makeelement(parent, 'wp:effectExtent', **{k[-1].lower():v for k, v in get_image_margins(style).items()}) else: makeelement(parent, 'wp:effectExtent', l='0', r='0', t='0', b='0') if floating is not None: @@ -143,10 +162,10 @@ class ImagesManager: makeelement(parent, 'wp:wrapTopAndBottom') else: makeelement(parent, 'wp:wrapSquare', wrapText='bothSides') - self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height) + self.create_docx_image_markup(parent, name, html_img.get('alt') or name, img.rid, width, height, svg_rid=svg_rid) return ans - def create_docx_image_markup(self, parent, name, alt, img_rid, width, height): + def create_docx_image_markup(self, parent, name, alt, img_rid, width, height, svg_rid=''): makeelement, namespaces = self.document_relationships.namespace.makeelement, self.document_relationships.namespace.namespaces makeelement(parent, 'wp:docPr', id=str(self.count), name=name, descr=alt) makeelement(makeelement(parent, 'wp:cNvGraphicFramePr'), 'a:graphicFrameLocks', noChangeAspect="1") @@ -157,7 +176,11 @@ class ImagesManager: makeelement(nvPicPr, 'pic:cNvPr', id='0', name=name, descr=alt) makeelement(nvPicPr, 'pic:cNvPicPr') bf = makeelement(pic, 'pic:blipFill') - makeelement(bf, 'a:blip', r_embed=img_rid) + blip = makeelement(bf, 'a:blip', r_embed=img_rid) + if svg_rid: + ext_list = makeelement(blip, 'a:extLst') + makeelement(makeelement(ext_list, 'a:ext', uri=USE_LOCAL_DPI_URI), 'a14:useLocalDpi', val='0') + makeelement(makeelement(ext_list, 'a:ext', uri=SVG_BLIP_URI), 'asvg:svgBlip', r_embed=svg_rid) makeelement(makeelement(bf, 'a:stretch'), 'a:fillRect') spPr = makeelement(pic, 'pic:spPr') xfrm = makeelement(spPr, 'a:xfrm') @@ -178,8 +201,10 @@ class ImagesManager: return fname def serialize(self, images_map): - for img in itervalues(self.images): + for img in self.images.values(): images_map['word/' + img.fname] = partial(self.get_data, img.item) + for img in self.svg_images.values(): + images_map['word/' + img.fname] = lambda: img.item.data_as_bytes_or_none def get_data(self, item): try: diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index be1e855cca..74d17da6d3 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -15,6 +15,7 @@ from collections import defaultdict from itertools import count from lxml import etree, html from operator import attrgetter +from typing import Optional from calibre import as_unicode, force_unicode, get_types_map, isbytestring from calibre.constants import __version__, filesystem_encoding @@ -1017,6 +1018,12 @@ class Manifest: # }}} + @property + def data_as_bytes_or_none(self) -> Optional[bytes]: + if self._loader is None: + return None + return self._loader(getattr(self, 'html_input_href', self.href)) + @property def data(self): """Provides MIME type sensitive access to the manifest @@ -1033,10 +1040,7 @@ class Manifest: """ data = self._data if data is None: - if self._loader is None: - return None - data = self._loader(getattr(self, 'html_input_href', - self.href)) + data = self.data_as_bytes_or_none try: mt = self.media_type.lower() except Exception: diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 95396703da..ae5459adb1 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -5,32 +5,44 @@ SVG rasterization transform. __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import os, re +import os +import re +from base64 import standard_b64encode +from functools import lru_cache +from lxml import etree from qt.core import ( - Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer) -from calibre.ebooks.oeb.base import XHTML, XLINK -from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME -from calibre.ebooks.oeb.base import xml2str, xpath -from calibre.ebooks.oeb.base import urlnormalize + QBuffer, QByteArray, QColor, QImage, QIODevice, QPainter, QSvgRenderer, Qt, +) + +from calibre import guess_type +from calibre.ebooks.oeb.base import ( + PNG_MIME, SVG_MIME, XHTML, XLINK, urlnormalize, xml2str, xpath, +) from calibre.ebooks.oeb.stylizer import Stylizer -from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.imghdr import what from polyglot.urllib import urldefrag IMAGE_TAGS = {XHTML('img'), XHTML('object')} KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'} -TEST_SVG = b''' - - -''' + +def test_svg(): # {{{ + TEST_PNG_DATA_URI='' + return f''' + + + + '''.encode() +# }}} class Unavailable(Exception): pass -def rasterize_svg(data=TEST_SVG, sizes=(), width=0, height=0, print=None, fmt='PNG', as_qimage=False): +def rasterize_svg(data=None, sizes=(), width=0, height=0, print=None, fmt='PNG', as_qimage=False): + if data is None: + data = test_svg() svg = QSvgRenderer(QByteArray(data)) size = svg.defaultSize() if size.width() == 100 and size.height() == 100 and sizes: @@ -54,10 +66,16 @@ def rasterize_svg(data=TEST_SVG, sizes=(), width=0, height=0, print=None, fmt='P return array.data() +@lru_cache(maxsize=128) +def data_url(mime_type: str, data: bytes) -> str: + return f'data:{mime_type};base64,' + standard_b64encode(data).decode('ascii') + + class SVGRasterizer: - def __init__(self, base_css=''): + def __init__(self, base_css='', save_svg_originals=False): self.base_css = base_css + self.save_svg_originals = save_svg_originals from calibre.gui2 import must_use_qt must_use_qt() @@ -71,20 +89,15 @@ class SVGRasterizer: def __call__(self, oeb, context): oeb.logger.info('Rasterizing SVG images...') - self.temp_files = [] self.stylizer_cache = {} self.oeb = oeb self.opts = context self.profile = context.dest self.images = {} - self.dataize_manifest() + self.svg_originals = {} + self.scan_for_linked_resources_in_manifest() self.rasterize_spine() self.rasterize_cover() - for pt in self.temp_files: - try: - os.remove(pt) - except: - pass def rasterize_svg(self, elem, width=0, height=0, format='PNG'): view_box = elem.get('viewBox', elem.get('viewbox', None)) @@ -110,38 +123,41 @@ class SVGRasterizer: return rasterize_svg(xml2str(elem, with_tail=False), sizes=sizes, width=width, height=height, print=logger.info, fmt=format) - def dataize_manifest(self): + def scan_for_linked_resources_in_manifest(self): for item in self.oeb.manifest.values(): if item.media_type == SVG_MIME and item.data is not None: - self.dataize_svg(item) + self.scan_for_linked_resources_in_svg(item) - def dataize_svg(self, item, svg=None): + def scan_for_linked_resources_in_svg(self, item, svg=None): if svg is None: svg = item.data hrefs = self.oeb.manifest.hrefs + ha = XLINK('href') for elem in xpath(svg, '//svg:*[@xl:href]'): - href = urlnormalize(elem.attrib[XLINK('href')]) + href = urlnormalize(elem.get(ha)) path = urldefrag(href)[0] if not path: continue abshref = item.abshref(path) - if abshref not in hrefs: + linkee = hrefs.get(abshref) + if linkee is None: continue - linkee = hrefs[abshref] data = linkee.bytes_representation - ext = what(None, data) or 'jpg' - with PersistentTemporaryFile(suffix='.'+ext) as pt: - pt.write(data) - self.temp_files.append(pt.name) - elem.attrib[XLINK('href')] = pt.name + ext = what(None, data) + if not ext: + continue + mt = guess_type('file.'+ext)[0] + if not mt or not mt.startswith('image/'): + continue + elem.set(ha, data_url(mt, data)) + return svg def stylizer(self, item): ans = self.stylizer_cache.get(item, None) if ans is None: - ans = Stylizer(item.data, item.href, self.oeb, self.opts, + ans = self.stylizer_cache[item] = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile, base_css=self.base_css) - self.stylizer_cache[item] = ans return ans def rasterize_spine(self): @@ -172,13 +188,19 @@ class SVGRasterizer: height = style['height'] width = (width / 72) * self.profile.dpi height = (height / 72) * self.profile.dpi - elem = self.dataize_svg(item, elem) + self.scan_for_linked_resources_in_svg(item, elem) data = self.rasterize_svg(elem, width, height) manifest = self.oeb.manifest href = os.path.splitext(item.href)[0] + '.png' id, href = manifest.generate(item.id, href) manifest.add(id, href, PNG_MIME, data=data) img = elem.makeelement(XHTML('img'), src=item.relhref(href)) + if self.save_svg_originals: + svg_bytes = etree.tostring(elem, encoding='utf-8', xml_declaration=True, pretty_print=True, with_tail=False) + svg_id, svg_href = manifest.generate(item.id, 'inline.svg') + manifest.add(svg_id, svg_href, SVG_MIME, data=svg_bytes) + self.svg_originals[href] = svg_href + img.tail = elem.tail elem.getparent().replace(elem, img) for prop in ('width', 'height'): if prop in elem.attrib: @@ -215,6 +237,7 @@ class SVGRasterizer: id, href = manifest.generate(svgitem.id, href) manifest.add(id, href, PNG_MIME, data=data) self.images[key] = href + self.svg_originals[href] = svgitem.href elem.tag = XHTML('img') for attr in elem.attrib: if attr not in KEEP_ATTRS: @@ -244,3 +267,7 @@ class SVGRasterizer: id, href = self.oeb.manifest.generate(cover.id, href) self.oeb.manifest.add(id, href, PNG_MIME, data=data) covers[0].value = id + + +if __name__ == '__main__': + open('/t/test-svg-rasterization.png', 'wb').write(rasterize_svg())