From 4e8f148e597e2cc20b74a693711f85d6e968b6b8 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 15 Feb 2015 15:43:59 +0530
Subject: [PATCH] Mapping from CSS to DOCX for block styles

---
 src/calibre/ebooks/__init__.py              |  11 +
 src/calibre/ebooks/docx/writer/from_html.py | 220 +++++++++++++++++---
 2 files changed, 206 insertions(+), 25 deletions(-)

diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py
index dbbf94f27d..00a5384f96 100644
--- a/src/calibre/ebooks/__init__.py
+++ b/src/calibre/ebooks/__init__.py
@@ -277,6 +277,17 @@ def unit_convert(value, base, font, dpi, body_font_size=12):
             result = value * body_font_size
     return result
 
+def parse_css_length(value):
+    # Split a CSS length such as '1.5em' into (1.5, 'em'). Returns
+    # (None, None) when value is not a string or is not a valid length.
+    try:
+        m = UNIT_RE.match(value)
+        number = float(m.group(1))
+    except (TypeError, ValueError, AttributeError):
+        # non-string input, empty/malformed numeric part, or no match at all
+        return None, None
+    return number, (m.group(2) or '').lower()
+
 def generate_masthead(title, output_path=None, width=600, height=60):
     from calibre.ebooks.conversion.config import load_defaults
     recs = load_defaults('mobi_output')
diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py
index f287827118..d8f984188b 100644
--- a/src/calibre/ebooks/docx/writer/from_html.py
+++ b/src/calibre/ebooks/docx/writer/from_html.py
@@ -11,10 +11,12 @@ import re
 from lxml import etree
 from lxml.builder import ElementMaker
 
+from calibre.ebooks import parse_css_length
 from calibre.ebooks.docx.names import namespaces
 from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
 from calibre.ebooks.oeb.stylizer import Stylizer as Sz, Style as St
 from calibre.ebooks.oeb.base import XPath, barename
+from tinycss.color3 import parse_color_string
 
 class Style(St):
 
@@ -40,7 +42,62 @@ class Stylizer(Sz):
         except KeyError:
             return Style(element, self)
 
-class TextStyle(object):
+border_edges = ('left', 'top', 'right', 'bottom')
+border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
+
+def css_color_to_rgb(value):  # CSS color -> 'RRGGBB'; 'auto' for currentcolor; None if empty, not parsed, or alpha < 0.01
+    if not value:
+        return
+    if value.lower() == 'currentcolor':
+        return 'auto'
+    val = parse_color_string(value)
+    if val is None:
+        return
+    if val.alpha < 0.01:
+        return
+    return '%02X%02X%02X' % (int(val.red * 255), int(val.green * 255), int(val.blue * 255))
+
+class DOCXStyle(object):
+
+    ALL_PROPS = ()
+
+    def __init__(self):
+        self.update_hash()
+
+    def __hash__(self):
+        return self._hash
+
+    def update_hash(self):  # must be re-run after changing any attribute named in ALL_PROPS
+        self._hash = hash(tuple(
+            getattr(self, x) for x in self.ALL_PROPS))
+
+    def __eq__(self, other):  # NOTE: equality is defined purely by the cached hash of the ALL_PROPS values
+        return hash(self) == hash(other)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __repr__(self):
+        return etree.tostring(self.serialize(etree.Element(w('style'), nsmap={'w':namespaces['w']})), pretty_print=True)
+    __str__ = __repr__
+
+LINE_STYLES = {  # CSS border-style keyword -> value written to the border's w:val attribute
+    'none': 'none',
+    'hidden': 'none',
+    'dotted': 'dotted',
+    'dashed': 'dashed',
+    'solid': 'single',
+    'double': 'double',
+    'groove': 'threeDEngrave',
+    'ridge': 'threeDEmboss',
+    'inset': 'inset',
+    'outset': 'outset',
+}
+
+def w(x):  # qualify the local name x with the 'w' namespace (Clark notation)
+    return '{%s}%s' % (namespaces['w'], x)
+
+class TextStyle(DOCXStyle):
 
     ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
                  'background_color', 'underline', 'strike', 'dstrike', 'caps',
@@ -72,21 +129,127 @@ class TextStyle(object):
         except (ValueError, TypeError, AttributeError):
             self.spacing = None
         self.vertical_align = {'sub':'subscript', 'super':'superscript'}.get((css['vertical-align'] or '').lower(), 'baseline')
-        # TODO: Borders and padding
 
-    def __hash__(self):
-        return hash(tuple(
-            getattr(self, x) for x in self.ALL_PROPS))
+        DOCXStyle.__init__(self)
 
-    def __eq__(self, other):
-        for x in self.ALL_PROPS:
-            if getattr(self, x) != getattr(other, x, None):
-                return False
-        return True
+class BlockStyle(DOCXStyle):
+
+    ALL_PROPS = tuple(
+        'text_align page_break_before keep_lines keep_next css_text_indent text_indent line_height css_line_height background_color'.split()
+        + ['margin_' + edge for edge in border_edges]
+        + ['css_margin_' + edge for edge in border_edges]
+        + [x%edge for edge in border_edges for x in border_props]
+    )
+
+    def __init__(self, css, html_block, is_first_block=False):
+        self.page_break_before = html_block.tag.endswith('}body') or (not is_first_block and css['page-break-before'] == 'always')
+        self.keep_lines = css['page-break-inside'] == 'avoid'
+        # TODO: Ensure that only the last docx block for this html block has the correct value for keep next
+        self.keep_next = css['page-break-after'] == 'avoid'
+        for edge in border_edges:
+            # In DOCX padding can only be a positive integer
+            setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge])))
+            # In DOCX margin must be a positive integer in twips (twentieth of a point)
+            setattr(self, 'margin_' + edge, max(0, int(css['margin-' + edge] * 20)))
+            setattr(self, 'css_margin_' + edge, css._style.get('margin-' + edge, ''))
+            val = min(96, max(2, int({'thin':0.2, 'medium':1, 'thick':2}.get(css['border-%s-width' % edge], 0) * 8)))
+            setattr(self, 'border_%s_width' % edge, val)
+            setattr(self, 'border_%s_color' % edge, css_color_to_rgb(css['border-%s-color' % edge]))
+            setattr(self, 'border_%s_style' % edge, LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none'))
+        self.text_indent = max(0, int(css['text-indent'] * 20))
+        self.css_text_indent = css._get('text-indent')
+        self.line_height = max(0, int(css['line-height'] * 20))
+        self.css_line_height = css._get('line-height')
+        self.background_color = css_color_to_rgb(css['background-color'])
+        self.text_align = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}.get(
+            css['text-align'].lower(), 'left')
+
+        DOCXStyle.__init__(self)
+
+    def serialize(self, style):  # append spacing/ind/shd/pBdr/jc/page-break children to style and return it
+        spacing = style.makeelement(w('spacing'))
+        for edge, attr in {'top':'before', 'bottom':'after'}.iteritems():
+            css_val, css_unit = parse_css_length(getattr(self, 'css_margin_' + edge))
+            if css_unit in ('em', 'ex'):
+                lines = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
+                if lines > 0:
+                    spacing.set(w(attr + 'Lines'), str(lines))
+            else:
+                val = getattr(self, 'margin_' + edge)
+                if val > 0:
+                    spacing.set(w(attr), str(val))
+        if self.css_line_height != 'normal':
+            try:
+                css_val, css_unit = float(self.css_line_height), 'ratio'
+            except Exception:
+                css_val, css_unit = parse_css_length(self.css_line_height)
+            if css_unit in {'em', 'ex', '%', 'ratio'}:
+                mult = {'ex':0.5, '%':0.01}.get(css_unit, 1)
+                val = int(css_val * 240 * mult)
+                spacing.set(w('line'), str(val))
+            else:
+                spacing.set(w('line'), str(self.line_height))
+                spacing.set(w('lineRule'), 'exactly')  # w() takes only the name; the value is set()'s second argument
+
+        if spacing.attrib:
+            style.append(spacing)
+
+        ind = style.makeelement(w('ind'))
+        for edge in ('left', 'right'):
+            css_val, css_unit = parse_css_length(getattr(self, 'css_margin_' + edge))
+            if css_unit in ('em', 'ex'):
+                chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
+                if chars > 0:
+                    ind.set(w(edge + 'Chars'), str(chars))
+            else:
+                val = getattr(self, 'margin_' + edge)
+                if val > 0:
+                    ind.set(w(edge), str(val))  # this loop's edge, not the stale attr left over from the spacing loop
+        css_val, css_unit = parse_css_length(self.css_text_indent)
+        if css_unit in ('em', 'ex'):
+            chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
+            if chars > 0:
+                ind.set(w('firstLineChars'), str(chars))  # namespace-qualified like every other attribute here
+        else:
+            val = self.text_indent
+            if val > 0:
+                ind.set(w('firstLine'), str(val))
+        if ind.attrib:
+            style.append(ind)
+
+        if self.background_color:
+            shd = style.makeelement(w('shd'))
+            style.append(shd)
+            shd.set(w('val'), 'clear'), shd.set(w('fill'), self.background_color), shd.set(w('color'), 'auto')
+
+        pbdr = style.makeelement(w('pBdr'))
+        for edge in border_edges:
+            e = pbdr.makeelement(w(edge))
+            padding = getattr(self, 'padding_' + edge)
+            if padding > 0:
+                e.set(w('space'), str(padding))
+            width = getattr(self, 'border_%s_width' % edge)
+            bstyle, color = getattr(self, 'border_%s_style' % edge), getattr(self, 'border_%s_color' % edge)
+            if width > 0 and bstyle != 'none' and color:  # color is None for transparent/unparsed colors; lxml cannot set None
+                e.set(w('val'), bstyle)
+                e.set(w('sz'), str(width))
+                e.set(w('color'), color)
+            if e.attrib:
+                pbdr.append(e)
+        if len(pbdr):
+            style.append(pbdr)
+        jc = style.makeelement(w('jc'))
+        jc.set(w('val'), self.text_align)
+        style.append(jc)
+        if self.page_break_before:
+            style.append(style.makeelement(w('pageBreakBefore'), **{w('val'):'on'}))
+        if self.keep_lines:
+            style.append(style.makeelement(w('keepLines'), **{w('val'):'on'}))
+        if self.keep_next:
+            style.append(style.makeelement(w('keepNext'), **{w('val'):'on'}))
+        return style
 
-    def __ne__(self, other):
-        return not self == other
 
 
 class LineBreak(object):
 
@@ -97,7 +260,8 @@ class TextRun(object):
 
     ws_pat = None
 
-    def __init__(self, style):
+    def __init__(self, style, first_html_parent):
+        self.first_html_parent = first_html_parent
         if self.ws_pat is None:
             TextRun.ws_pat = self.ws_pat = re.compile(r'\s+')
         self.style = style
@@ -125,20 +289,20 @@ class TextRun(object):
                 if preserve_whitespace:
                     t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
 
-style_cache = {}
-
 class Block(object):
 
-    def __init__(self):
+    def __init__(self, html_block, style, is_first_block=False):
+        self.html_block = html_block
+        self.style = BlockStyle(style, html_block, is_first_block=is_first_block)
         self.runs = []
 
-    def add_text(self, text, style, ignore_leading_whitespace=False):
+    def add_text(self, text, style, ignore_leading_whitespace=False, html_parent=None):
         ts = TextStyle(style)
         ws = style['white-space']
         if self.runs and ts == self.runs[-1].style:
             run = self.runs[-1]
         else:
-            run = TextRun(ts)
+            run = TextRun(ts, html_parent or self.html_block)
             self.runs.append(run)
         preserve_whitespace = ws in {'pre', 'pre-wrap'}
         if ignore_leading_whitespace and not preserve_whitespace:
@@ -176,9 +340,11 @@ class Convert(object):
 
     def process_item(self, item):
         stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.opts.output_profile)
+        is_first_block = True
         for body in XPath('//h:body')(item.data):
-            b = Block()
+            b = Block(body, stylizer.style(body), is_first_block=is_first_block)
             self.blocks.append(b)
+            is_first_block = False
             self.process_block(body, b, stylizer, ignore_tail=True)
 
     def process_block(self, html_block, docx_block, stylizer, ignore_tail=False):
@@ -192,7 +358,7 @@ class Convert(object):
             if tag == 'img':
                 return  # TODO: Handle images
             if display == 'block':
-                b = Block()
+                b = Block(child, style)
                 self.blocks.append(b)
                 self.process_block(child, b, stylizer)
             else:
@@ -201,7 +367,7 @@ class Convert(object):
         if ignore_tail is False and html_block.tail and html_block.tail.strip():
             b = docx_block
             if b is not self.blocks[-1]:
-                b = Block()
+                b = Block(html_block, stylizer.style(html_block))
                 self.blocks.append(b)
             b.add_text(html_block.tail, stylizer.style(html_block.getparent()))
 
@@ -211,19 +377,19 @@ class Convert(object):
             return  # TODO: Handle images
         style = stylizer.style(html_child)
         if html_child.text:
-            docx_block.add_text(html_child.text, style)
+            docx_block.add_text(html_child.text, style, html_parent=html_child)
         for child in html_child.iterchildren(etree.Element):
             style = stylizer.style(child)
             display = style.get('display', 'inline')
             if display == 'block':
-                b = Block()
+                b = Block(child, style)
                 self.blocks.append(b)
                 self.process_block(child, b, stylizer)
             else:
                 self.process_inline(child, self.blocks[-1], stylizer)
 
         if html_child.tail:
-            docx_block.add_text(html_child.tail, stylizer.style(html_child.getparent()))
+            self.blocks[-1].add_text(html_child.tail, stylizer.style(html_child.getparent()), html_parent=html_child.getparent())
 
     def write(self):
         dn = {k:v for k, v in namespaces.iteritems() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne'}}
@@ -240,11 +406,15 @@ class Convert(object):
             E.docDefaults(
                 E.rPrDefault(
                     E.rPr(
-                        E.rFonts(),
+                        E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
+                        E.sz(**{w('val'):'22'}),
+                        E.szCs(**{w('val'):'22'}),
+                        E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
                     )
                 ),
                 E.pPrDefault(
                     E.pPr(
+                        E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
                     )
                 )
             )