Get towards writing a full docx skeleton with all text but no styles/images/etc.

2025-07-08 18:54:09 -04:00 · 2015-02-08 21:32:30 +05:30 · 2015-02-08 21:32:30 +05:30 · 0f0e62b3a0
commit 0f0e62b3a0
parent cbc85be903
3 changed files with 99 additions and 8 deletions
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@ -24,6 +24,7 @@ FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
 ENDNOTES  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
 THEMES    = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme'
 SETTINGS  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings'
 WEB_SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings'
 namespaces = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
--- a/src/calibre/ebooks/docx/writer/container.py
+++ b/src/calibre/ebooks/docx/writer/container.py
@ -8,18 +8,57 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import textwrap
 from lxml import etree
 from lxml.builder import ElementMaker
 from calibre import guess_type
 from calibre.constants import numeric_version, __appname__
-from calibre.ebooks.docx.names import namespaces
+from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS
 from calibre.ebooks.oeb.base import xml2str
 from calibre.utils.zipfile import ZipFile
 def xml2str(root, pretty_print=False, with_tail=False):
    """Serialize *root* to UTF-8 encoded XML, including an XML declaration.

    :param root: The element to serialize with ``etree.tostring``.
    :param pretty_print: Forwarded unchanged to ``etree.tostring``.
    :param with_tail: Forwarded unchanged to ``etree.tostring``.
    :return: Whatever ``etree.tostring`` returns for these options.
    """
    return etree.tostring(
        root,
        encoding='utf-8',
        xml_declaration=True,
        pretty_print=pretty_print,
        with_tail=with_tail,
    )
 class DocumentRelationships(object):
    """Registry of relationships, keyed by ``(target, rtype, target_mode)``.

    Every distinct key is given an id of the form ``rId<N>``, where N comes
    from a counter that starts at 1. Relationships for ``styles.xml`` and
    ``webSettings.xml`` are registered when the object is created.
    """

    def __init__(self):
        self.rmap = {}  # (target, rtype, target_mode) -> relationship id
        self.counter = 0  # number of ids handed out so far
        default_parts = {
                STYLES: 'styles.xml',
                WEB_SETTINGS: 'webSettings.xml',
        }
        for rtype, part_name in default_parts.iteritems():
            self.add_relationship(part_name, rtype)

    def get_relationship_id(self, target, rtype, target_mode=None):
        """Return the id registered for this relationship, or None."""
        key = (target, rtype, target_mode)
        return self.rmap.get(key)

    def add_relationship(self, target, rtype, target_mode=None):
        """Return the id for this relationship, registering it if needed."""
        existing = self.get_relationship_id(target, rtype, target_mode)
        if existing is not None:
            return existing
        self.counter += 1
        new_id = 'rId%d' % self.counter
        self.rmap[(target, rtype, target_mode)] = new_id
        return new_id

    def serialize(self):
        """Return the registered relationships serialized via ``xml2str``.

        The root is a ``Relationships`` element in the ``namespaces['pr']``
        namespace with one ``Relationship`` child per registered entry. The
        ``TargetMode`` attribute is only written when a target mode was given.
        """
        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
        root = E.Relationships()
        for key, rid in self.rmap.iteritems():
            target, rtype, target_mode = key
            rel = E.Relationship(Id=rid, Type=rtype, Target=target)
            if target_mode is not None:
                rel.set('TargetMode', target_mode)
            root.append(rel)
        return xml2str(root)
 class DOCX(object):
    def __init__(self, opts, log):
        self.opts, self.log = opts, log
        self.document_relationships = DocumentRelationships()
    # Boilerplate {{{
    @property
@ -92,7 +131,9 @@ class DOCX(object):
            zf.writestr('_rels/.rels', self.containerrels)
            zf.writestr('docProps/app.xml', self.appproperties)
            zf.writestr('word/webSettings.xml', self.websettings)
-            # TODO: Write document and document relationships
+            zf.writestr('word/document.xml', xml2str(self.document))
            zf.writestr('word/styles.xml', xml2str(self.styles))
            zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
 if __name__ == '__main__':
    d = DOCX(None, None)
--- a/src/calibre/ebooks/docx/writer/from_html.py
+++ b/src/calibre/ebooks/docx/writer/from_html.py
@ -9,7 +9,9 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import re
 from lxml import etree
 from lxml.builder import ElementMaker
 from calibre.ebooks.docx.names import namespaces
 from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
 from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
 from calibre.ebooks.oeb.base import XPath, barename
@ -42,7 +44,7 @@ class TextStyle(object):
    ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
                 'background_color', 'underline', 'strike', 'dstrike', 'caps',
-                 'shadow', 'small_caps', 'spacing', 'vertical-align')
+                 'shadow', 'small_caps', 'spacing', 'vertical_align')
    def __init__(self, css):
        self.font_family = css['font-family']  # TODO: Resolve multiple font families and generic font family names
@ -113,6 +115,16 @@ class TextRun(object):
    def add_break(self, clear='none'):
        """Append a ``LineBreak`` built with the given *clear* value to ``self.texts``."""
        line_break = LineBreak(clear=clear)
        self.texts.append(line_break)
    def serialize(self, p):
        """Append a ``w:r`` element for this run to the element *p*.

        Each entry of ``self.texts`` is unpacked as a
        ``(text, preserve_whitespace)`` pair and written as a ``w:t`` child;
        a false-y text becomes the empty string.
        """
        # NOTE(review): add_break() appends LineBreak objects to self.texts.
        # Unless LineBreak unpacks into exactly two values, the loop below
        # will raise on such entries -- confirm against LineBreak's definition.
        run_el = p.makeelement('{%s}r' % namespaces['w'])
        p.append(run_el)
        for text, preserve_whitespace in self.texts:
            text_el = run_el.makeelement('{%s}t' % namespaces['w'])
            run_el.append(text_el)
            text_el.text = text if text else ''
            if not preserve_whitespace:
                continue
            # xml:space="preserve" on the text element
            text_el.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
 style_cache = {}
 class Block(object):
@ -120,19 +132,29 @@ class Block(object):
    def __init__(self):
        self.runs = []
-    def add_text(self, text, style):
+    def add_text(self, text, style, ignore_leading_whitespace=False):
        ts = TextStyle(style)
        ws = style['white-space']
        if self.runs and ts == self.runs[-1].style:
            run = self.runs[-1]
        else:
            run = TextRun(ts)
            self.runs.append(run)
        preserve_whitespace = ws in {'pre', 'pre-wrap'}
        if ignore_leading_whitespace and not preserve_whitespace:
            text = text.lstrip()
        if ws == 'pre-line':
            for text in text.splitlines():
                run.add_text(text, False)
                run.add_break()
        else:
-            run.add_text(text, ws in {'pre', 'pre-wrap'})
+            run.add_text(text, preserve_whitespace)
    def serialize(self, body):
        """Append a ``w:p`` element to *body* and serialize every run into it."""
        paragraph = body.makeelement('{%s}p' % namespaces['w'])
        body.append(paragraph)
        for text_run in self.runs:
            text_run.serialize(paragraph)
 class Convert(object):
@ -149,6 +171,8 @@ class Convert(object):
        for item in self.oeb.spine:
            self.process_item(item)
        self.write()
    def process_item(self, item):
        stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
@ -159,7 +183,7 @@ class Convert(object):
    def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
        if html_block.text:
-            docx_block.add_text(html_block.text, stylizer.style(html_block))
+            docx_block.add_text(html_block.text, stylizer.style(html_block), ignore_leading_whitespace=True)
        for child in html_block.iterchildren(etree.Element):
            tag = barename(child.tag)
@ -174,7 +198,7 @@ class Convert(object):
            else:
                self.process_inline(child, self.blocks[-1], stylizer)
-        if ignore_tail is False and html_block.tail:
+        if ignore_tail is False and html_block.tail and html_block.tail.strip():
            b = docx_block
            if b is not self.blocks[-1]:
                b = Block()
@ -200,3 +224,28 @@ class Convert(object):
        if html_child.tail:
            docx_block.add_text(html_child.tail, stylizer.style(html_child.getparent()))
    def write(self):
        """Build the document and styles trees and store them on ``self.docx``.

        ``self.docx.document`` becomes a ``w:document`` element whose
        ``w:body`` holds the serialized form of every block in
        ``self.blocks``. ``self.docx.styles`` becomes a ``w:styles`` element
        containing only an (almost empty) ``w:docDefaults`` skeleton.
        """
        # Namespace prefixes declared on the document root
        doc_prefixes = {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne'}
        dn = {k:v for k, v in namespaces.iteritems() if k in doc_prefixes}
        E = ElementMaker(namespace=dn['w'], nsmap=dn)
        self.docx.document = doc = E.document()
        body = E.body()
        doc.append(body)
        for block in self.blocks:
            block.serialize(body)

        # Test membership against a set rather than the string 'wr':
        # `k in 'wr'` is a substring test and would also accept the keys ''
        # and 'wr', not just the two prefixes that are wanted here.
        style_prefixes = {'w', 'r'}
        dn = {k:v for k, v in namespaces.iteritems() if k in style_prefixes}
        E = ElementMaker(namespace=dn['w'], nsmap=dn)
        self.docx.styles = E.styles(
            E.docDefaults(
                E.rPrDefault(
                    E.rPr(
                        E.rFonts(),
                    )
                ),
                E.pPrDefault(
                    E.pPr(
                    )
                )
            )
        )