More work on DOCX Output

Use a styles manager and fix handling of page-break-after:avoid
This commit is contained in:
Kovid Goyal 2015-02-16 18:12:17 +05:30
parent 69b15134df
commit 47841f1e0f
2 changed files with 50 additions and 23 deletions

View File

@ -12,7 +12,7 @@ from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from calibre.ebooks.docx.names import namespaces from calibre.ebooks.docx.names import namespaces
from calibre.ebooks.docx.writer.styles import w, BlockStyle, TextStyle from calibre.ebooks.docx.writer.styles import w, StylesManager
from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
from calibre.ebooks.oeb.base import XPath, barename from calibre.ebooks.oeb.base import XPath, barename
@ -65,13 +65,13 @@ class TextRun(object):
self.texts.append((None, clear)) self.texts.append((None, clear))
def serialize(self, p): def serialize(self, p):
r = p.makeelement('{%s}r' % namespaces['w']) r = p.makeelement(w('r'))
p.append(r) p.append(r)
for text, preserve_whitespace in self.texts: for text, preserve_whitespace in self.texts:
if text is None: if text is None:
r.append(r.makeelement(w('br'), **{w('clear'):preserve_whitespace})) r.append(r.makeelement(w('br'), **{w('clear'):preserve_whitespace}))
else: else:
t = r.makeelement('{%s}t' % namespaces['w']) t = r.makeelement(w('t'))
r.append(t) r.append(t)
t.text = text or '' t.text = text or ''
if preserve_whitespace: if preserve_whitespace:
@ -79,14 +79,16 @@ class TextRun(object):
class Block(object): class Block(object):
def __init__(self, html_block, style, is_first_block=False): def __init__(self, styles_manager, html_block, style, is_first_block=False):
self.html_block = html_block self.html_block = html_block
self.html_style = style self.html_style = style
self.style = BlockStyle(style, html_block, is_first_block=is_first_block) self.style = styles_manager.create_block_style(style, html_block, is_first_block=is_first_block)
self.styles_manager = styles_manager
self.keep_next = False
self.runs = [] self.runs = []
def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None): def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None):
ts = TextStyle(style) ts = self.styles_manager.create_text_style(style)
ws = style['white-space'] ws = style['white-space']
if self.runs and ts == self.runs[-1].style: if self.runs and ts == self.runs[-1].style:
run = self.runs[-1] run = self.runs[-1]
@ -107,13 +109,17 @@ class Block(object):
if self.runs: if self.runs:
run = self.runs[-1] run = self.runs[-1]
else: else:
run = TextRun(TextStyle(self.html_style), self.html_block) run = TextRun(self.styles_manager.create_text_style(self.html_style), self.html_block)
self.runs.append(run) self.runs.append(run)
run.add_break(clear=clear) run.add_break(clear=clear)
def serialize(self, body): def serialize(self, body):
p = body.makeelement('{%s}p' % namespaces['w']) p = body.makeelement(w('p'))
body.append(p) body.append(p)
ppr = p.makeelement(w('pPr'))
p.append(ppr)
if self.keep_next:
ppr.append(ppr.makeelement(w('keepNext')))
for run in self.runs: for run in self.runs:
run.serialize(p) run.serialize(p)
@ -129,6 +135,8 @@ class Convert(object):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
SVGRasterizer()(self.oeb, self.opts) SVGRasterizer()(self.oeb, self.opts)
self.styles_manager = StylesManager()
for item in self.oeb.spine: for item in self.oeb.spine:
self.process_item(item) self.process_item(item)
@ -139,7 +147,7 @@ class Convert(object):
is_first_block = True is_first_block = True
for body in XPath('//h:body')(item.data): for body in XPath('//h:body')(item.data):
b = Block(body, stylizer.style(body), is_first_block=is_first_block) b = Block(self.styles_manager, body, stylizer.style(body), is_first_block=is_first_block)
self.blocks.append(b) self.blocks.append(b)
is_first_block = False is_first_block = False
self.process_block(body, b, stylizer, ignore_tail=True) self.process_block(body, b, stylizer, ignore_tail=True)
@ -158,7 +166,7 @@ class Convert(object):
if tag == 'img': if tag == 'img':
pass # TODO: Handle images pass # TODO: Handle images
if display == 'block' and tag != 'br': if display == 'block' and tag != 'br':
b = Block(child, style) b = Block(self.styles_manager, child, style)
self.blocks.append(b) self.blocks.append(b)
self.process_block(child, b, stylizer) self.process_block(child, b, stylizer)
else: else:
@ -167,9 +175,11 @@ class Convert(object):
if ignore_tail is False and html_block.tail and html_block.tail.strip(): if ignore_tail is False and html_block.tail and html_block.tail.strip():
b = docx_block b = docx_block
if b is not self.blocks[-1]: if b is not self.blocks[-1]:
b = Block(html_block, block_style) b = Block(self.styles_manager, html_block, block_style)
self.blocks.append(b) self.blocks.append(b)
b.add_text(html_block.tail, stylizer.style(html_block.getparent())) b.add_text(html_block.tail, stylizer.style(html_block.getparent()))
if block_style['page-break-after'] == 'avoid':
self.blocks[-1].keep_next = True
def process_inline(self, html_child, docx_block, stylizer): def process_inline(self, html_child, docx_block, stylizer):
tag = barename(html_child.tag) tag = barename(html_child.tag)
@ -188,7 +198,7 @@ class Convert(object):
style = stylizer.style(child) style = stylizer.style(child)
display = style.get('display', 'inline') display = style.get('display', 'inline')
if display == 'block': if display == 'block':
b = Block(child, style) b = Block(self.styles_manager, child, style)
self.blocks.append(b) self.blocks.append(b)
self.process_block(child, b, stylizer) self.process_block(child, b, stylizer)
else: else:

View File

@ -38,17 +38,17 @@ class DOCXStyle(object):
ALL_PROPS = () ALL_PROPS = ()
def __init__(self): def __init__(self):
self.update_hash() self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS))
def __hash__(self): def __hash__(self):
return self._hash return self._hash
def update_hash(self):
self._hash = hash(tuple(
getattr(self, x) for x in self.ALL_PROPS))
def __eq__(self, other): def __eq__(self, other):
return hash(self) == hash(other) for x in self.ALL_PROPS:
if getattr(self, x) != getattr(other, x, None):
return False
return True
def __ne__(self, other): def __ne__(self, other):
return not self == other return not self == other
@ -175,7 +175,7 @@ class TextStyle(DOCXStyle):
class BlockStyle(DOCXStyle): class BlockStyle(DOCXStyle):
ALL_PROPS = tuple( ALL_PROPS = tuple(
'text_align page_break_before keep_lines keep_next css_text_indent text_indent line_height css_line_height background_color'.split() 'text_align page_break_before keep_lines css_text_indent text_indent line_height css_line_height background_color'.split()
+ ['margin_' + edge for edge in border_edges] + ['margin_' + edge for edge in border_edges]
+ ['css_margin_' + edge for edge in border_edges] + ['css_margin_' + edge for edge in border_edges]
+ [x%edge for edge in border_edges for x in border_props] + [x%edge for edge in border_edges for x in border_props]
@ -184,8 +184,6 @@ class BlockStyle(DOCXStyle):
def __init__(self, css, html_block, is_first_block=False): def __init__(self, css, html_block, is_first_block=False):
self.page_break_before = html_block.tag.endswith('}body') or (not is_first_block and css['page-break-before'] == 'always') self.page_break_before = html_block.tag.endswith('}body') or (not is_first_block and css['page-break-before'] == 'always')
self.keep_lines = css['page-break-inside'] == 'avoid' self.keep_lines = css['page-break-inside'] == 'avoid'
# TODO: Ensure that only the last docx block for this html block has the correct value for keep next
self.keep_next = css['page-break-after'] == 'avoid'
for edge in border_edges: for edge in border_edges:
# In DOCX padding can only be a positive integer # In DOCX padding can only be a positive integer
setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge]))) setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge])))
@ -272,9 +270,28 @@ class BlockStyle(DOCXStyle):
style.append(style.makeelement(w('pageBreakBefore'), **{w('val'):'on'})) style.append(style.makeelement(w('pageBreakBefore'), **{w('val'):'on'}))
if self.keep_lines: if self.keep_lines:
style.append(style.makeelement(w('keepLines'), **{w('val'):'on'})) style.append(style.makeelement(w('keepLines'), **{w('val'):'on'}))
if self.keep_next:
style.append(style.makeelement(w('keepNext'), **{w('val'):'on'}))
return style return style
class StylesManager(object):
    """Registry that de-duplicates text and block style objects.

    Each ``create_*`` method builds a new style object and then hands back
    the first equal style that was registered earlier, so equal styles end
    up represented by one shared instance.
    """

    def __init__(self):
        # Both registries map a style object to itself: lookups go through
        # the style's hash/equality, and the stored value is the canonical
        # instance returned to callers.
        self.block_styles = {}
        self.text_styles = {}

    def create_text_style(self, css_style):
        """Return the canonical TextStyle built from ``css_style``.

        A newly built style is registered and returned when no equal style
        is known yet; otherwise the previously registered one is returned.
        """
        candidate = TextStyle(css_style)
        return self.text_styles.setdefault(candidate, candidate)

    def create_block_style(self, css_style, html_block, is_first_block=False):
        """Return the canonical BlockStyle for the given arguments.

        All three arguments are forwarded unchanged to ``BlockStyle``. The
        resulting style is registered and returned when no equal style is
        known yet; otherwise the previously registered one is returned.
        """
        candidate = BlockStyle(
            css_style, html_block, is_first_block=is_first_block)
        return self.block_styles.setdefault(candidate, candidate)