From 0f0e62b3a0ce138d51a85c80ba2f8f3019fa389f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 8 Feb 2015 21:32:30 +0530 Subject: [PATCH] Get towards writing a full docx skeleton with all text but no styles/images/etc. --- src/calibre/ebooks/docx/names.py | 1 + src/calibre/ebooks/docx/writer/container.py | 47 ++++++++++++++-- src/calibre/ebooks/docx/writer/from_html.py | 59 +++++++++++++++++++-- 3 files changed, 99 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/docx/names.py b/src/calibre/ebooks/docx/names.py index 33fc8f56af..f13b963e10 100644 --- a/src/calibre/ebooks/docx/names.py +++ b/src/calibre/ebooks/docx/names.py @@ -24,6 +24,7 @@ FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes' THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme' SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings' +WEB_SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings' namespaces = { 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', diff --git a/src/calibre/ebooks/docx/writer/container.py b/src/calibre/ebooks/docx/writer/container.py index 308224218c..6c7fc973ff 100644 --- a/src/calibre/ebooks/docx/writer/container.py +++ b/src/calibre/ebooks/docx/writer/container.py @@ -8,18 +8,57 @@ __copyright__ = '2013, Kovid Goyal ' import textwrap +from lxml import etree from lxml.builder import ElementMaker from calibre import guess_type from calibre.constants import numeric_version, __appname__ -from calibre.ebooks.docx.names import namespaces -from calibre.ebooks.oeb.base import xml2str +from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS from calibre.utils.zipfile import ZipFile +def xml2str(root, pretty_print=False, with_tail=False): + ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, + pretty_print=pretty_print, with_tail=with_tail) + return ans + + +class DocumentRelationships(object): + + def __init__(self): + self.rmap = {} + self.counter = 0 + for typ, target in { + STYLES: 'styles.xml', + WEB_SETTINGS: 'webSettings.xml', + }.iteritems(): + self.add_relationship(target, typ) + + def get_relationship_id(self, target, rtype, target_mode=None): + return self.rmap.get((target, rtype, target_mode)) + + def add_relationship(self, target, rtype, target_mode=None): + ans = self.get_relationship_id(target, rtype, target_mode) + if ans is None: + self.counter += 1 + ans = 'rId%d' % self.counter + self.rmap[(target, rtype, target_mode)] = ans + return ans + + def serialize(self): + E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) + relationships = E.Relationships() + for (target, rtype, target_mode), rid in self.rmap.iteritems(): + r = E.Relationship(Id=rid, Type=rtype, Target=target) + if target_mode is not None: + r.set('TargetMode', target_mode) + relationships.append(r) + return xml2str(relationships) + class DOCX(object): def __init__(self, opts, log): self.opts, self.log = opts, log + self.document_relationships = DocumentRelationships() # Boilerplate {{{ @property @@ -92,7 +131,9 @@ class DOCX(object): zf.writestr('_rels/.rels', self.containerrels) zf.writestr('docProps/app.xml', self.appproperties) zf.writestr('word/webSettings.xml', self.websettings) - # TODO: Write document and document relationships + zf.writestr('word/document.xml', xml2str(self.document)) + zf.writestr('word/styles.xml', xml2str(self.styles)) + zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize()) if __name__ == '__main__': d = DOCX(None, None) diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py index 4a41ec85c3..f287827118 100644 --- a/src/calibre/ebooks/docx/writer/from_html.py +++ b/src/calibre/ebooks/docx/writer/from_html.py @@ -9,7 +9,9 @@ __copyright__ = '2013, Kovid Goyal ' import re from lxml import etree +from lxml.builder import ElementMaker +from calibre.ebooks.docx.names import namespaces from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from calibre.ebooks.oeb.base import XPath, barename @@ -42,7 +44,7 @@ class TextStyle(object): ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color', 'background_color', 'underline', 'strike', 'dstrike', 'caps', - 'shadow', 'small_caps', 'spacing', 'vertical-align') + 'shadow', 'small_caps', 'spacing', 'vertical_align') def __init__(self, css): self.font_family = css['font-family'] # TODO: Resolve multiple font families and generic font family names @@ -113,6 +115,16 @@ class TextRun(object): def add_break(self, clear='none'): self.texts.append(LineBreak(clear=clear)) + def serialize(self, p): + r = p.makeelement('{%s}r' % namespaces['w']) + p.append(r) + for text, preserve_whitespace in self.texts: + t = r.makeelement('{%s}t' % namespaces['w']) + r.append(t) + t.text = text or '' + if preserve_whitespace: + t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve') + style_cache = {} class Block(object): @@ -120,19 +132,29 @@ class Block(object): def __init__(self): self.runs = [] - def add_text(self, text, style): + def add_text(self, text, style, ignore_leading_whitespace=False): ts = TextStyle(style) ws = style['white-space'] if self.runs and ts == self.runs[-1].style: run = self.runs[-1] else: run = TextRun(ts) + self.runs.append(run) + preserve_whitespace = ws in {'pre', 'pre-wrap'} + if ignore_leading_whitespace and not preserve_whitespace: + text = text.lstrip() if ws == 'pre-line': for text in text.splitlines(): run.add_text(text, False) run.add_break() else: - run.add_text(text, ws in {'pre', 'pre-wrap'}) + run.add_text(text, preserve_whitespace) + + def serialize(self, body): + p = body.makeelement('{%s}p' % namespaces['w']) + body.append(p) + for run in self.runs: + run.serialize(p) class Convert(object): @@ -149,6 +171,8 @@ class Convert(object): for item in self.oeb.spine: self.process_item(item) + self.write() + def process_item(self, item): stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile) @@ -159,7 +183,7 @@ class Convert(object): def process_block(self, html_block, docx_block, stylizer, ignore_tail=False): if html_block.text: - docx_block.add_text(html_block.text, stylizer.style(html_block)) + docx_block.add_text(html_block.text, stylizer.style(html_block), ignore_leading_whitespace=True) for child in html_block.iterchildren(etree.Element): tag = barename(child.tag) @@ -174,7 +198,7 @@ class Convert(object): else: self.process_inline(child, self.blocks[-1], stylizer) - if ignore_tail is False and html_block.tail: + if ignore_tail is False and html_block.tail and html_block.tail.strip(): b = docx_block if b is not self.blocks[-1]: b = Block() @@ -200,3 +224,28 @@ class Convert(object): if html_child.tail: docx_block.add_text(html_child.tail, stylizer.style(html_child.getparent())) + + def write(self): + dn = {k:v for k, v in namespaces.iteritems() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne'}} + E = ElementMaker(namespace=dn['w'], nsmap=dn) + self.docx.document = doc = E.document() + body = E.body() + doc.append(body) + for block in self.blocks: + block.serialize(body) + + dn = {k:v for k, v in namespaces.iteritems() if k in 'wr'} + E = ElementMaker(namespace=dn['w'], nsmap=dn) + self.docx.styles = E.styles( + E.docDefaults( + E.rPrDefault( + E.rPr( + E.rFonts(), + ) + ), + E.pPrDefault( + E.pPr( + ) + ) + ) + )