From 0f0e62b3a0ce138d51a85c80ba2f8f3019fa389f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 8 Feb 2015 21:32:30 +0530
Subject: [PATCH] Work towards writing a full DOCX skeleton with all text but no
 styles/images/etc.

---
 src/calibre/ebooks/docx/names.py            |  1 +
 src/calibre/ebooks/docx/writer/container.py | 47 ++++++++++++++--
 src/calibre/ebooks/docx/writer/from_html.py | 59 +++++++++++++++++++--
 3 files changed, 99 insertions(+), 8 deletions(-)
diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py
index 33fc8f56af..f13b963e10 100644
--- a/src/calibre/ebooks/docx/names.py
+++ b/src/calibre/ebooks/docx/names.py
@@ -24,6 +24,7 @@ FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
 ENDNOTES  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
 THEMES    = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme'
 SETTINGS  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings'
+WEB_SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings'
 
 namespaces = {
     'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
diff --git a/src/calibre/ebooks/docx/writer/container.py b/src/calibre/ebooks/docx/writer/container.py
index 308224218c..6c7fc973ff 100644
--- a/src/calibre/ebooks/docx/writer/container.py
+++ b/src/calibre/ebooks/docx/writer/container.py
@@ -8,18 +8,57 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
 import textwrap
 
+from lxml import etree
 from lxml.builder import ElementMaker
 
 from calibre import guess_type
 from calibre.constants import numeric_version, __appname__
-from calibre.ebooks.docx.names import namespaces
-from calibre.ebooks.oeb.base import xml2str
+from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS
 from calibre.utils.zipfile import ZipFile
 
+def xml2str(root, pretty_print=False, with_tail=False):  # Serialize an lxml element as UTF-8 with an XML declaration; replaces the xml2str formerly imported from calibre.ebooks.oeb.base
+    ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
+                          pretty_print=pretty_print, with_tail=with_tail)  # both flags are forwarded unchanged to etree.tostring
+    return ans
+
+
+class DocumentRelationships(object):  # Registry mapping (target, rtype, target_mode) -> 'rIdN', serializable as a Relationships element
+
+    def __init__(self):  # Start empty, then pre-register the styles and webSettings parts
+        self.rmap = {}  # (target, rtype, target_mode) -> relationship id string
+        self.counter = 0  # last number used to build an 'rId%d' id
+        for typ, target in {
+                STYLES: 'styles.xml',
+                WEB_SETTINGS: 'webSettings.xml',
+        }.iteritems():  # which of the two receives rId1 follows this dict's iteration order
+            self.add_relationship(target, typ)
+
+    def get_relationship_id(self, target, rtype, target_mode=None):  # Return the id registered for this triple, or None if there is none
+        return self.rmap.get((target, rtype, target_mode))
+
+    def add_relationship(self, target, rtype, target_mode=None):  # Return the id for this triple, allocating a new one only on first use
+        ans = self.get_relationship_id(target, rtype, target_mode)
+        if ans is None:  # not registered yet: allocate the next sequential id
+            self.counter += 1
+            ans = 'rId%d' % self.counter
+            self.rmap[(target, rtype, target_mode)] = ans
+        return ans
+
+    def serialize(self):  # Return the registry as serialized XML (via xml2str)
+        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})  # namespaces['pr'] is made the default (unprefixed) namespace
+        relationships = E.Relationships()
+        for (target, rtype, target_mode), rid in self.rmap.iteritems():  # output order follows dict iteration order
+            r = E.Relationship(Id=rid, Type=rtype, Target=target)
+            if target_mode is not None:  # TargetMode attribute is written only when one was supplied
+                r.set('TargetMode', target_mode)
+            relationships.append(r)
+        return xml2str(relationships)
+
 class DOCX(object):
 
     def __init__(self, opts, log):
         self.opts, self.log = opts, log
+        self.document_relationships = DocumentRelationships()
 
     # Boilerplate {{{
     @property
@@ -92,7 +131,9 @@ class DOCX(object):
             zf.writestr('_rels/.rels', self.containerrels)
             zf.writestr('docProps/app.xml', self.appproperties)
             zf.writestr('word/webSettings.xml', self.websettings)
-            # TODO: Write document and document relationships
+            zf.writestr('word/document.xml', xml2str(self.document))
+            zf.writestr('word/styles.xml', xml2str(self.styles))
+            zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
 
 if __name__ == '__main__':
     d = DOCX(None, None)
diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py
index 4a41ec85c3..f287827118 100644
--- a/src/calibre/ebooks/docx/writer/from_html.py
+++ b/src/calibre/ebooks/docx/writer/from_html.py
@@ -9,7 +9,9 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import re
 
 from lxml import etree
+from lxml.builder import ElementMaker
 
+from calibre.ebooks.docx.names import namespaces
 from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
 from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
 from calibre.ebooks.oeb.base import XPath, barename
@@ -42,7 +44,7 @@ class TextStyle(object):
 
     ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
                  'background_color', 'underline', 'strike', 'dstrike', 'caps',
-                 'shadow', 'small_caps', 'spacing', 'vertical-align')
+                 'shadow', 'small_caps', 'spacing', 'vertical_align')
 
     def __init__(self, css):
         self.font_family = css['font-family']  # TODO: Resolve multiple font families and generic font family names
@@ -113,6 +115,16 @@ class TextRun(object):
     def add_break(self, clear='none'):
         self.texts.append(LineBreak(clear=clear))
 
+    def serialize(self, p):  # Append this run to element p as an `r` element (namespaces['w']) holding one `t` child per entry in self.texts
+        r = p.makeelement('{%s}r' % namespaces['w'])
+        p.append(r)
+        for text, preserve_whitespace in self.texts:  # NOTE(review): add_break() appends LineBreak objects to self.texts -- confirm they unpack as a 2-tuple here
+            t = r.makeelement('{%s}t' % namespaces['w'])
+            r.append(t)
+            t.text = text or ''  # falsy text (e.g. None) is written as an empty string
+            if preserve_whitespace:
+                t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')  # i.e. xml:space="preserve" on the text element
+
 style_cache = {}
 
 class Block(object):
@@ -120,19 +132,29 @@ class Block(object):
     def __init__(self):
         self.runs = []
 
-    def add_text(self, text, style):
+    def add_text(self, text, style, ignore_leading_whitespace=False):  # Add text to the last run if its style matches, else to a new run; optionally lstrip the text first
         ts = TextStyle(style)
         ws = style['white-space']
         if self.runs and ts == self.runs[-1].style:
             run = self.runs[-1]
         else:
             run = TextRun(ts)
+            self.runs.append(run)  # register the new run; Block.serialize() only emits runs found in self.runs
+        preserve_whitespace = ws in {'pre', 'pre-wrap'}  # 'pre-line' is not in this set, so it counts as non-preserving
+        if ignore_leading_whitespace and not preserve_whitespace:
+            text = text.lstrip()  # applies to 'pre-line' text too, before it is split into lines below
         if ws == 'pre-line':
             for text in text.splitlines():
                 run.add_text(text, False)
                 run.add_break()
         else:
-            run.add_text(text, ws in {'pre', 'pre-wrap'})
+            run.add_text(text, preserve_whitespace)  # same value the removed line computed inline
+
+    def serialize(self, body):  # Append a `p` element (namespaces['w']) to body and serialize every run of this block into it
+        p = body.makeelement('{%s}p' % namespaces['w'])
+        body.append(p)
+        for run in self.runs:  # runs are emitted in the order they were added
+            run.serialize(p)
 
 class Convert(object):
 
@@ -149,6 +171,8 @@ class Convert(object):
         for item in self.oeb.spine:
             self.process_item(item)
 
+        self.write()
+
     def process_item(self, item):
         stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
 
@@ -159,7 +183,7 @@ class Convert(object):
 
     def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
         if html_block.text:
-            docx_block.add_text(html_block.text, stylizer.style(html_block))
+            docx_block.add_text(html_block.text, stylizer.style(html_block), ignore_leading_whitespace=True)
 
         for child in html_block.iterchildren(etree.Element):
             tag = barename(child.tag)
@@ -174,7 +198,7 @@ class Convert(object):
             else:
                 self.process_inline(child, self.blocks[-1], stylizer)
 
-        if ignore_tail is False and html_block.tail:
+        if ignore_tail is False and html_block.tail and html_block.tail.strip():
             b = docx_block
             if b is not self.blocks[-1]:
                 b = Block()
@@ -200,3 +224,28 @@ class Convert(object):
 
         if html_child.tail:
             docx_block.add_text(html_child.tail, stylizer.style(html_child.getparent()))
+
+    def write(self):
+        """Build the document and styles XML trees and store them on self.docx.
+
+        Every block in self.blocks is serialized into the document body; the
+        styles tree holds only empty document defaults.
+        """
+        dn = {k:v for k, v in namespaces.iteritems() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne'}}
+        E = ElementMaker(namespace=dn['w'], nsmap=dn)
+        self.docx.document = doc = E.document()
+        body = E.body()
+        doc.append(body)
+        for block in self.blocks:
+            block.serialize(body)
+
+        # Use a set, not the string 'wr': `k in 'wr'` is a substring test that
+        # would also accept the keys '' and 'wr'.
+        dn = {k:v for k, v in namespaces.iteritems() if k in {'w', 'r'}}
+        E = ElementMaker(namespace=dn['w'], nsmap=dn)
+        self.docx.styles = E.styles(
+            E.docDefaults(
+                E.rPrDefault(E.rPr(E.rFonts())),
+                E.pPrDefault(E.pPr()),
+            )
+        )