More work on DOCX Output

Use a styles manager and fix handling of page-break-after:avoid
This commit is contained in:
Kovid Goyal 2015-02-16 18:12:17 +05:30
parent 69b15134df
commit 47841f1e0f
2 changed files with 50 additions and 23 deletions

View File

@ -12,7 +12,7 @@ from lxml import etree
from lxml.builder import ElementMaker
from calibre.ebooks.docx.names import namespaces
from calibre.ebooks.docx.writer.styles import w, BlockStyle, TextStyle
from calibre.ebooks.docx.writer.styles import w, StylesManager
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename
@ -65,13 +65,13 @@ class TextRun(object):
self.texts.append((None, clear))
def serialize(self, p):
r = p.makeelement('{%s}r' % namespaces['w'])
r = p.makeelement(w('r'))
p.append(r)
for text, preserve_whitespace in self.texts:
if text is None:
r.append(r.makeelement(w('br'), **{w('clear'):preserve_whitespace}))
else:
t = r.makeelement('{%s}t' % namespaces['w'])
t = r.makeelement(w('t'))
r.append(t)
t.text = text or ''
if preserve_whitespace:
@ -79,14 +79,16 @@ class TextRun(object):
class Block(object):
def __init__(self, html_block, style, is_first_block=False):
def __init__(self, styles_manager, html_block, style, is_first_block=False):
self.html_block = html_block
self.html_style = style
self.style = BlockStyle(style, html_block, is_first_block=is_first_block)
self.style = styles_manager.create_block_style(style, html_block, is_first_block=is_first_block)
self.styles_manager = styles_manager
self.keep_next = False
self.runs = []
def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None):
ts = TextStyle(style)
ts = self.styles_manager.create_text_style(style)
ws = style['white-space']
if self.runs and ts == self.runs[-1].style:
run = self.runs[-1]
@ -107,13 +109,17 @@ class Block(object):
if self.runs:
run = self.runs[-1]
else:
run = TextRun(TextStyle(self.html_style), self.html_block)
run = TextRun(self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run)
run.add_break(clear=clear)
def serialize(self, body):
p = body.makeelement('{%s}p' % namespaces['w'])
p = body.makeelement(w('p'))
body.append(p)
ppr = p.makeelement(w('pPr'))
p.append(ppr)
if self.keep_next:
ppr.append(ppr.makeelement(w('keepNext')))
for run in self.runs:
run.serialize(p)
@ -129,6 +135,8 @@ class Convert(object):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
SVGRasterizer()(self.oeb, self.opts)
self.styles_manager = StylesManager()
for item in self.oeb.spine:
self.process_item(item)
@ -139,7 +147,7 @@ class Convert(object):
is_first_block = True
for body in XPath('//h:body')(item.data):
b = Block(body, stylizer.style(body), is_first_block=is_first_block)
b = Block(self.styles_manager, body, stylizer.style(body), is_first_block=is_first_block)
self.blocks.append(b)
is_first_block = False
self.process_block(body, b, stylizer, ignore_tail=True)
@ -158,7 +166,7 @@ class Convert(object):
if tag == 'img':
pass # TODO: Handle images
if display == 'block' and tag != 'br':
b = Block(child, style)
b = Block(self.styles_manager, child, style)
self.blocks.append(b)
self.process_block(child, b, stylizer)
else:
@ -167,9 +175,11 @@ class Convert(object):
if ignore_tail is False and html_block.tail and html_block.tail.strip():
b = docx_block
if b is not self.blocks[-1]:
b = Block(html_block, block_style)
b = Block(self.styles_manager, html_block, block_style)
self.blocks.append(b)
b.add_text(html_block.tail, stylizer.style(html_block.getparent()))
if block_style['page-break-after'] == 'avoid':
self.blocks[-1].keep_next = True
def process_inline(self, html_child, docx_block, stylizer):
tag = barename(html_child.tag)
@ -188,7 +198,7 @@ class Convert(object):
style = stylizer.style(child)
display = style.get('display', 'inline')
if display == 'block':
b = Block(child, style)
b = Block(self.styles_manager, child, style)
self.blocks.append(b)
self.process_block(child, b, stylizer)
else:

View File

@ -38,17 +38,17 @@ class DOCXStyle(object):
ALL_PROPS = ()
def __init__(self):
self.update_hash()
self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS))
def __hash__(self):
return self._hash
def update_hash(self):
self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS))
def __eq__(self, other):
return hash(self) == hash(other)
for x in self.ALL_PROPS:
if getattr(self, x) != getattr(other, x, None):
return False
return True
def __ne__(self, other):
return not self == other
@ -175,7 +175,7 @@ class TextStyle(DOCXStyle):
class BlockStyle(DOCXStyle):
ALL_PROPS = tuple(
'text_align page_break_before keep_lines keep_next css_text_indent text_indent line_height css_line_height background_color'.split()
'text_align page_break_before keep_lines css_text_indent text_indent line_height css_line_height background_color'.split()
+ ['margin_' + edge for edge in border_edges]
+ ['css_margin_' + edge for edge in border_edges]
+ [x%edge for edge in border_edges for x in border_props]
@ -184,8 +184,6 @@ class BlockStyle(DOCXStyle):
def __init__(self, css, html_block, is_first_block=False):
self.page_break_before = html_block.tag.endswith('}body') or (not is_first_block and css['page-break-before'] == 'always')
self.keep_lines = css['page-break-inside'] == 'avoid'
# TODO: Ensure that only the last docx block for this html block has the correct value for keep next
self.keep_next = css['page-break-after'] == 'avoid'
for edge in border_edges:
# In DOCX padding can only be a positive integer
setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge])))
@ -272,9 +270,28 @@ class BlockStyle(DOCXStyle):
style.append(style.makeelement(w('pageBreakBefore'), **{w('val'):'on'}))
if self.keep_lines:
style.append(style.makeelement(w('keepLines'), **{w('val'):'on'}))
if self.keep_next:
style.append(style.makeelement(w('keepNext'), **{w('val'):'on'}))
return style
class StylesManager(object):
    """Registry that hands out one shared instance per distinct style.

    Newly built style objects are looked up in a dictionary keyed by the
    style itself; if an equal style was registered earlier, that earlier
    instance is returned instead of the new one.
    """

    def __init__(self):
        # Both mappings store each style as its own key and value, so a
        # lookup with an equal style yields the canonical instance.
        self.block_styles = {}
        self.text_styles = {}

    def create_text_style(self, css_style):
        """Return the canonical TextStyle built from ``css_style``.

        A fresh TextStyle is constructed and then either registered (first
        time an equal style is seen) or discarded in favour of the already
        registered equal instance.
        """
        candidate = TextStyle(css_style)
        return self.text_styles.setdefault(candidate, candidate)

    def create_block_style(self, css_style, html_block, is_first_block=False):
        """Return the canonical BlockStyle for the given block.

        ``css_style``, ``html_block`` and ``is_first_block`` are passed
        straight through to the BlockStyle constructor; the result is
        de-duplicated against previously created block styles.
        """
        candidate = BlockStyle(css_style, html_block, is_first_block=is_first_block)
        return self.block_styles.setdefault(candidate, candidate)