From 47841f1e0fe0223e7bdd2ef3437970899a11862b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 16 Feb 2015 18:12:17 +0530 Subject: [PATCH] More work on DOCX Output Use a styles manager and fix handling of page-break-after:avoid --- src/calibre/ebooks/docx/writer/from_html.py | 34 +++++++++++------- src/calibre/ebooks/docx/writer/styles.py | 39 +++++++++++++++------ 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py index f8deabc502..ad66fcfb10 100644 --- a/src/calibre/ebooks/docx/writer/from_html.py +++ b/src/calibre/ebooks/docx/writer/from_html.py @@ -12,7 +12,7 @@ from lxml import etree from lxml.builder import ElementMaker from calibre.ebooks.docx.names import namespaces -from calibre.ebooks.docx.writer.styles import w, BlockStyle, TextStyle +from calibre.ebooks.docx.writer.styles import w, StylesManager from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St from calibre.ebooks.oeb.base import XPath, barename @@ -65,13 +65,13 @@ class TextRun(object): self.texts.append((None, clear)) def serialize(self, p): - r = p.makeelement('{%s}r' % namespaces['w']) + r = p.makeelement(w('r')) p.append(r) for text, preserve_whitespace in self.texts: if text is None: r.append(r.makeelement(w('br'), **{w('clear'):preserve_whitespace})) else: - t = r.makeelement('{%s}t' % namespaces['w']) + t = r.makeelement(w('t')) r.append(t) t.text = text or '' if preserve_whitespace: @@ -79,14 +79,16 @@ class TextRun(object): class Block(object): - def __init__(self, html_block, style, is_first_block=False): + def __init__(self, styles_manager, html_block, style, is_first_block=False): self.html_block = html_block self.html_style = style - self.style = BlockStyle(style, html_block, is_first_block=is_first_block) + self.style = styles_manager.create_block_style(style, html_block, is_first_block=is_first_block) + self.styles_manager = styles_manager + self.keep_next = False 
self.runs = [] def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None): - ts = TextStyle(style) + ts = self.styles_manager.create_text_style(style) ws = style['white-space'] if self.runs and ts == self.runs[-1].style: run = self.runs[-1] @@ -107,13 +109,17 @@ class Block(object): if self.runs: run = self.runs[-1] else: - run = TextRun(TextStyle(self.html_style), self.html_block) + run = TextRun(self.styles_manager.create_text_style(self.html_style), self.html_block) self.runs.append(run) run.add_break(clear=clear) def serialize(self, body): - p = body.makeelement('{%s}p' % namespaces['w']) + p = body.makeelement(w('p')) body.append(p) + ppr = p.makeelement(w('pPr')) + p.append(ppr) + if self.keep_next: + ppr.append(ppr.makeelement(w('keepNext'))) for run in self.runs: run.serialize(p) @@ -129,6 +135,8 @@ class Convert(object): from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer SVGRasterizer()(self.oeb, self.opts) + self.styles_manager = StylesManager() + for item in self.oeb.spine: self.process_item(item) @@ -139,7 +147,7 @@ class Convert(object): is_first_block = True for body in XPath('//h:body')(item.data): - b = Block(body, stylizer.style(body), is_first_block=is_first_block) + b = Block(self.styles_manager, body, stylizer.style(body), is_first_block=is_first_block) self.blocks.append(b) is_first_block = False self.process_block(body, b, stylizer, ignore_tail=True) @@ -158,7 +166,7 @@ class Convert(object): if tag == 'img': pass # TODO: Handle images if display == 'block' and tag != 'br': - b = Block(child, style) + b = Block(self.styles_manager, child, style) self.blocks.append(b) self.process_block(child, b, stylizer) else: @@ -167,9 +175,11 @@ class Convert(object): if ignore_tail is False and html_block.tail and html_block.tail.strip(): b = docx_block if b is not self.blocks[-1]: - b = Block(html_block, block_style) + b = Block(self.styles_manager, html_block, block_style) self.blocks.append(b) 
b.add_text(html_block.tail, stylizer.style(html_block.getparent())) + if block_style['page-break-after'] == 'avoid': + self.blocks[-1].keep_next = True def process_inline(self, html_child, docx_block, stylizer): tag = barename(html_child.tag) @@ -188,7 +198,7 @@ class Convert(object): style = stylizer.style(child) display = style.get('display', 'inline') if display == 'block': - b = Block(child, style) + b = Block(self.styles_manager, child, style) self.blocks.append(b) self.process_block(child, b, stylizer) else: diff --git a/src/calibre/ebooks/docx/writer/styles.py b/src/calibre/ebooks/docx/writer/styles.py index 6808caf684..329b73e508 100644 --- a/src/calibre/ebooks/docx/writer/styles.py +++ b/src/calibre/ebooks/docx/writer/styles.py @@ -38,17 +38,17 @@ class DOCXStyle(object): ALL_PROPS = () def __init__(self): - self.update_hash() + self._hash = hash(tuple( + getattr(self, x) for x in self.ALL_PROPS)) def __hash__(self): return self._hash - def update_hash(self): - self._hash = hash(tuple( - getattr(self, x) for x in self.ALL_PROPS)) - def __eq__(self, other): - return hash(self) == hash(other) + for x in self.ALL_PROPS: + if getattr(self, x) != getattr(other, x, None): + return False + return True def __ne__(self, other): return not self == other @@ -175,7 +175,7 @@ class TextStyle(DOCXStyle): class BlockStyle(DOCXStyle): ALL_PROPS = tuple( - 'text_align page_break_before keep_lines keep_next css_text_indent text_indent line_height css_line_height background_color'.split() + 'text_align page_break_before keep_lines css_text_indent text_indent line_height css_line_height background_color'.split() + ['margin_' + edge for edge in border_edges] + ['css_margin_' + edge for edge in border_edges] + [x%edge for edge in border_edges for x in border_props] @@ -184,8 +184,6 @@ class BlockStyle(DOCXStyle): def __init__(self, css, html_block, is_first_block=False): self.page_break_before = html_block.tag.endswith('}body') or (not is_first_block and 
css['page-break-before'] == 'always') self.keep_lines = css['page-break-inside'] == 'avoid' - # TODO: Ensure that only the last docx block for this html block has the correct value for keep next - self.keep_next = css['page-break-after'] == 'avoid' for edge in border_edges: # In DOCX padding can only be a positive integer setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge]))) @@ -272,9 +270,28 @@ class BlockStyle(DOCXStyle): style.append(style.makeelement(w('pageBreakBefore'), **{w('val'):'on'})) if self.keep_lines: style.append(style.makeelement(w('keepLines'), **{w('val'):'on'})) - if self.keep_next: - style.append(style.makeelement(w('keepNext'), **{w('val'):'on'})) return style +class StylesManager(object): + def __init__(self): + self.block_styles, self.text_styles = {}, {} + + def create_text_style(self, css_style): + ans = TextStyle(css_style) + existing = self.text_styles.get(ans, None) + if existing is None: + self.text_styles[ans] = ans + else: + ans = existing + return ans + + def create_block_style(self, css_style, html_block, is_first_block=False): + ans = BlockStyle(css_style, html_block, is_first_block=is_first_block) + existing = self.block_styles.get(ans, None) + if existing is None: + self.block_styles[ans] = ans + else: + ans = existing + return ans