Work towards writing a full DOCX skeleton with all text but no styles, images, etc.

This commit is contained in:
Kovid Goyal 2015-02-08 21:32:30 +05:30
parent cbc85be903
commit 0f0e62b3a0
3 changed files with 99 additions and 8 deletions

View File

@ -24,6 +24,7 @@ FOOTNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships
ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes' ENDNOTES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes'
THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme' THEMES = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme'
SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings' SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings'
WEB_SETTINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings'
namespaces = { namespaces = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main', 'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',

View File

@ -8,18 +8,57 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap import textwrap
from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from calibre import guess_type from calibre import guess_type
from calibre.constants import numeric_version, __appname__ from calibre.constants import numeric_version, __appname__
from calibre.ebooks.docx.names import namespaces from calibre.ebooks.docx.names import namespaces, STYLES, WEB_SETTINGS
from calibre.ebooks.oeb.base import xml2str
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
def xml2str(root, pretty_print=False, with_tail=False):
    """Serialize an lxml element tree to UTF-8 encoded XML.

    The output always starts with an XML declaration.

    :param root: The element (tree) to serialize.
    :param pretty_print: Passed through to ``etree.tostring``.
    :param with_tail: Passed through to ``etree.tostring``.
    :return: Whatever ``etree.tostring`` returns for the given options.
    """
    return etree.tostring(
        root,
        encoding='utf-8',
        xml_declaration=True,
        pretty_print=pretty_print,
        with_tail=with_tail,
    )
class DocumentRelationships(object):
    """Registry of the relationships written to word/_rels/document.xml.rels.

    Every relationship is keyed by ``(target, rtype, target_mode)`` and is
    mapped to a generated id of the form ``rId<N>``, where N counts up from 1
    in the order the relationships were first added.
    """

    def __init__(self):
        self.rmap = {}  # (target, rtype, target_mode) -> 'rId<N>'
        self.counter = 0  # number of ids handed out so far
        # A tuple of pairs rather than a dict literal, so the default
        # relationships always receive the same ids regardless of how the
        # interpreter happens to order dict keys.
        for typ, target in (
            (STYLES, 'styles.xml'),
            (WEB_SETTINGS, 'webSettings.xml'),
        ):
            self.add_relationship(target, typ)

    def get_relationship_id(self, target, rtype, target_mode=None):
        """Return the id already registered for this relationship, or None."""
        return self.rmap.get((target, rtype, target_mode))

    def add_relationship(self, target, rtype, target_mode=None):
        """Register a relationship and return its id.

        Adding the same ``(target, rtype, target_mode)`` again returns the
        existing id instead of allocating a new one.
        """
        ans = self.get_relationship_id(target, rtype, target_mode)
        if ans is None:
            self.counter += 1
            ans = 'rId%d' % self.counter
            self.rmap[(target, rtype, target_mode)] = ans
        return ans

    def serialize(self):
        """Return the relationships part as XML, serialized via xml2str()."""
        E = ElementMaker(namespace=namespaces['pr'], nsmap={None: namespaces['pr']})
        relationships = E.Relationships()
        # items() exists on both Python 2 and 3 (iteritems() is Python 2
        # only). Sorting by the numeric part of 'rId<N>' makes the output
        # reproducible instead of following dict iteration order.
        ordered = sorted(self.rmap.items(), key=lambda item: int(item[1][3:]))
        for (target, rtype, target_mode), rid in ordered:
            r = E.Relationship(Id=rid, Type=rtype, Target=target)
            if target_mode is not None:
                r.set('TargetMode', target_mode)
            relationships.append(r)
        return xml2str(relationships)
class DOCX(object): class DOCX(object):
def __init__(self, opts, log): def __init__(self, opts, log):
self.opts, self.log = opts, log self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships()
# Boilerplate {{{ # Boilerplate {{{
@property @property
@ -92,7 +131,9 @@ class DOCX(object):
zf.writestr('_rels/.rels', self.containerrels) zf.writestr('_rels/.rels', self.containerrels)
zf.writestr('docProps/app.xml', self.appproperties) zf.writestr('docProps/app.xml', self.appproperties)
zf.writestr('word/webSettings.xml', self.websettings) zf.writestr('word/webSettings.xml', self.websettings)
# TODO: Write document and document relationships zf.writestr('word/document.xml', xml2str(self.document))
zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
if __name__ == '__main__': if __name__ == '__main__':
d = DOCX(None, None) d = DOCX(None, None)

View File

@ -9,7 +9,9 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re import re
from lxml import etree from lxml import etree
from lxml.builder import ElementMaker
from calibre.ebooks.docx.names import namespaces
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename from calibre.ebooks.oeb.base import XPath, barename
@ -42,7 +44,7 @@ class TextStyle(object):
ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color', ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
'background_color', 'underline', 'strike', 'dstrike', 'caps', 'background_color', 'underline', 'strike', 'dstrike', 'caps',
'shadow', 'small_caps', 'spacing', 'vertical-align') 'shadow', 'small_caps', 'spacing', 'vertical_align')
def __init__(self, css): def __init__(self, css):
self.font_family = css['font-family'] # TODO: Resolve multiple font families and generic font family names self.font_family = css['font-family'] # TODO: Resolve multiple font families and generic font family names
@ -113,6 +115,16 @@ class TextRun(object):
def add_break(self, clear='none'): def add_break(self, clear='none'):
self.texts.append(LineBreak(clear=clear)) self.texts.append(LineBreak(clear=clear))
def serialize(self, p):
    """Append this run to the paragraph element *p* as a ``w:r`` element.

    Every ``(text, preserve_whitespace)`` entry in ``self.texts`` becomes a
    ``w:t`` child; ``xml:space="preserve"`` is set on it when
    preserve_whitespace is true. Entries added by add_break() are LineBreak
    objects, not 2-tuples, so they are emitted as ``w:br`` children instead
    of being unpacked (unpacking them would fail).
    """
    w = namespaces['w']
    r = p.makeelement('{%s}r' % w)
    p.append(r)
    for item in self.texts:
        if isinstance(item, LineBreak):
            br = r.makeelement('{%s}br' % w)
            r.append(br)
            # NOTE(review): assumes LineBreak stores its constructor
            # argument as ``.clear`` -- confirm against the class definition.
            clear = getattr(item, 'clear', None)
            if clear:
                br.set('{%s}clear' % w, clear)
            continue
        text, preserve_whitespace = item
        t = r.makeelement('{%s}t' % w)
        r.append(t)
        t.text = text or ''
        if preserve_whitespace:
            t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
style_cache = {} style_cache = {}
class Block(object): class Block(object):
@ -120,19 +132,29 @@ class Block(object):
def __init__(self): def __init__(self):
self.runs = [] self.runs = []
def add_text(self, text, style): def add_text(self, text, style, ignore_leading_whitespace=False):
ts = TextStyle(style) ts = TextStyle(style)
ws = style['white-space'] ws = style['white-space']
if self.runs and ts == self.runs[-1].style: if self.runs and ts == self.runs[-1].style:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(ts) run = TextRun(ts)
self.runs.append(run)
preserve_whitespace = ws in {'pre', 'pre-wrap'}
if ignore_leading_whitespace and not preserve_whitespace:
text = text.lstrip()
if ws == 'pre-line': if ws == 'pre-line':
for text in text.splitlines(): for text in text.splitlines():
run.add_text(text, False) run.add_text(text, False)
run.add_break() run.add_break()
else: else:
run.add_text(text, ws in {'pre', 'pre-wrap'}) run.add_text(text, preserve_whitespace)
def serialize(self, body):
    """Append this block to *body* as a ``w:p`` element.

    The paragraph is attached to *body* first, then each run in
    ``self.runs`` serializes itself into it, in order.
    """
    paragraph = body.makeelement('{%s}p' % namespaces['w'])
    body.append(paragraph)
    for text_run in self.runs:
        text_run.serialize(paragraph)
class Convert(object): class Convert(object):
@ -149,6 +171,8 @@ class Convert(object):
for item in self.oeb.spine: for item in self.oeb.spine:
self.process_item(item) self.process_item(item)
self.write()
def process_item(self, item): def process_item(self, item):
stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
@ -159,7 +183,7 @@ class Convert(object):
def process_block(self, html_block, docx_block, stylizer, ignore_tail=False): def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
if html_block.text: if html_block.text:
docx_block.add_text(html_block.text, stylizer.style(html_block)) docx_block.add_text(html_block.text, stylizer.style(html_block), ignore_leading_whitespace=True)
for child in html_block.iterchildren(etree.Element): for child in html_block.iterchildren(etree.Element):
tag = barename(child.tag) tag = barename(child.tag)
@ -174,7 +198,7 @@ class Convert(object):
else: else:
self.process_inline(child, self.blocks[-1], stylizer) self.process_inline(child, self.blocks[-1], stylizer)
if ignore_tail is False and html_block.tail: if ignore_tail is False and html_block.tail and html_block.tail.strip():
b = docx_block b = docx_block
if b is not self.blocks[-1]: if b is not self.blocks[-1]:
b = Block() b = Block()
@ -200,3 +224,28 @@ class Convert(object):
if html_child.tail: if html_child.tail:
docx_block.add_text(html_child.tail, stylizer.style(html_child.getparent())) docx_block.add_text(html_child.tail, stylizer.style(html_child.getparent()))
def write(self):
    """Build the document and styles XML trees and store them on self.docx.

    ``self.docx.document`` becomes a ``w:document`` element whose ``w:body``
    contains every block in ``self.blocks``, serialized in order.
    ``self.docx.styles`` becomes a ``w:styles`` element holding only an
    empty ``w:docDefaults`` skeleton.
    """
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    document_prefixes = {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne'}
    dn = {k: v for k, v in namespaces.items() if k in document_prefixes}
    E = ElementMaker(namespace=dn['w'], nsmap=dn)
    self.docx.document = doc = E.document()
    body = E.body()
    doc.append(body)
    for block in self.blocks:
        block.serialize(body)

    # Membership in a set, not in the string 'wr': the latter is a substring
    # test that would also accept the prefixes 'wr' and ''.
    dn = {k: v for k, v in namespaces.items() if k in {'w', 'r'}}
    E = ElementMaker(namespace=dn['w'], nsmap=dn)
    self.docx.styles = E.styles(
        E.docDefaults(
            E.rPrDefault(
                E.rPr(
                    E.rFonts(),
                )
            ),
            E.pPrDefault(
                E.pPr(
                )
            )
        )
    )