From bcb19457e23d5aee00c1effe06f282c0419c59b9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 9 May 2013 17:01:45 +0530 Subject: [PATCH] More work on DOCX input --- src/calibre/ebooks/docx/block_styles.py | 265 ++++++++++++ src/calibre/ebooks/docx/char_styles.py | 228 ++++++++++ src/calibre/ebooks/docx/styles.py | 537 ++++++++---------------- src/calibre/ebooks/docx/to_html.py | 76 +++- 4 files changed, 747 insertions(+), 359 deletions(-) create mode 100644 src/calibre/ebooks/docx/block_styles.py create mode 100644 src/calibre/ebooks/docx/char_styles.py diff --git a/src/calibre/ebooks/docx/block_styles.py b/src/calibre/ebooks/docx/block_styles.py new file mode 100644 index 0000000000..3d1b91c957 --- /dev/null +++ b/src/calibre/ebooks/docx/block_styles.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +from collections import OrderedDict +from calibre.ebooks.docx.names import XPath, get + +class Inherit: + pass +inherit = Inherit() + +def binary_property(parent, name): + vals = XPath('./w:%s' % name)(parent) + if not vals: + return inherit + val = get(vals[0], 'w:val', 'on') + return True if val in {'on', '1', 'true'} else False + +def simple_color(col, auto='black'): + if not col or col == 'auto' or len(col) != 6: + return auto + return '#'+col + +def simple_float(val, mult=1.0): + try: + return float(val) * mult + except (ValueError, TypeError, AttributeError, KeyError): + return None + + +LINE_STYLES = { # {{{ + 'basicBlackDashes': 'dashed', + 'basicBlackDots': 'dotted', + 'basicBlackSquares': 'dashed', + 'basicThinLines': 'solid', + 'dashDotStroked': 'groove', + 'dashed': 'dashed', + 'dashSmallGap': 'dashed', + 'dotDash': 'dashed', + 'dotDotDash': 'dashed', + 'dotted': 'dotted', + 'double': 'double', + 'inset': 'inset', + 'nil': 'none', + 'none': 'none', + 'outset': 'outset', + 
'single': 'solid', + 'thick': 'solid', + 'thickThinLargeGap': 'double', + 'thickThinMediumGap': 'double', + 'thickThinSmallGap' : 'double', + 'thinThickLargeGap': 'double', + 'thinThickMediumGap': 'double', + 'thinThickSmallGap': 'double', + 'thinThickThinLargeGap': 'double', + 'thinThickThinMediumGap': 'double', + 'thinThickThinSmallGap': 'double', + 'threeDEmboss': 'ridge', + 'threeDEngrave': 'groove', + 'triple': 'double', +} # }}} + +# Read from XML {{{ +def read_border(parent, dest): + tvals = {'padding_%s':inherit, 'border_%s_width':inherit, + 'border_%s_style':inherit, 'border_%s_color':inherit} + vals = {} + for edge in ('left', 'top', 'right', 'bottom'): + vals.update({k % edge:v for k, v in tvals.iteritems()}) + + for border in XPath('./w:pBdr')(parent): + for edge in ('left', 'top', 'right', 'bottom'): + for elem in XPath('./w:%s' % edge)(border): # evaluate against this w:pBdr; the compiled XPath itself is not iterable + color = get(elem, 'w:color') + if color is not None: + vals['border_%s_color' % edge] = simple_color(color) + style = get(elem, 'w:val') + if style is not None: + vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid') + space = get(elem, 'w:space') + if space is not None: + try: + vals['padding_%s' % edge] = float(space) + except (ValueError, TypeError): + pass + sz = get(elem, 'w:sz') + if sz is not None: + # we dont care about art borders (they are only used for page borders) + try: + vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8 + except (ValueError, TypeError): + pass + + for key, val in vals.iteritems(): + setattr(dest, key, val) + +def read_indent(parent, dest): + padding_left = padding_right = text_indent = inherit + for indent in XPath('./w:ind')(parent): + l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars') + pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None + if pl is not None: + padding_left = '%.3g%s' % (pl, 'em' if lc is not None else 'pt') + + r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars') + pr =
simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None + if pr is not None: + padding_right = '%.3g%s' % (pr, 'em' if rc is not None else 'pt') + + h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars') + fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars') + ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else + simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None) + if ti is not None: + text_indent = '%.3g%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt') + + setattr(dest, 'margin_left', padding_left) + setattr(dest, 'margin_right', padding_right) + setattr(dest, 'text_indent', text_indent) + +def read_justification(parent, dest): + ans = inherit + for jc in XPath('./w:jc[@w:val]')(parent): + val = get(jc, 'w:val') + if not val: + continue + if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val: + ans = 'justify' + if val in {'left', 'center', 'right',}: + ans = val + setattr(dest, 'text_align', ans) + +def read_spacing(parent, dest): + padding_top = padding_bottom = line_height = inherit + for s in XPath('./w:spacing')(parent): + a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing') + pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None + if pb is not None: + padding_bottom = '%.3g%s' % (pb, 'ex' if al is not None else 'pt') + + b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing') + pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None + if pt is not None: + padding_top = '%.3g%s' % (pt, 'ex' if bl is not None else 'pt') + + l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto') + if l is not None: + lh = simple_float(l, 0.05) if lr in 
{'exactly', 'atLeast'} else simple_float(l, 1/240.0) + line_height = '%.3g%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '') + + setattr(dest, 'margin_top', padding_top) + setattr(dest, 'margin_bottom', padding_bottom) + setattr(dest, 'line_height', line_height) + +def read_direction(parent, dest): + ans = inherit + for jc in XPath('./w:textFlow[@w:val]')(parent): + val = get(jc, 'w:val') + if not val: + continue + if 'rl' in val.lower(): + ans = 'rtl' + setattr(dest, 'direction', ans) + +def read_shd(parent, dest): + ans = inherit + for shd in XPath('./w:shd[@w:fill]')(parent): + val = get(shd, 'w:fill') + if val: + ans = simple_color(val, auto='transparent') + setattr(dest, 'background_color', ans) +# }}} + +class ParagraphStyle(object): + + all_properties = ( + 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', + 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', + 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', + 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', + + # Border margins padding + 'border_left_width', 'border_left_style', 'border_left_color', 'padding_left', + 'border_top_width', 'border_top_style', 'border_top_color', 'padding_top', + 'border_right_width', 'border_right_style', 'border_right_color', 'padding_right', + 'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom', + 'margin_left', 'margin_top', 'margin_right', 'margin_bottom', + + # Misc. 
+ 'text_indent', 'text_align', 'line_height', 'direction', 'background_color', + ) + + def __init__(self, pPr=None): + self.linked_style = None + if pPr is None: + for p in self.all_properties: + setattr(self, p, inherit) + else: + for p in ( + 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', + 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', + 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', + 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', + ): + setattr(self, p, binary_property(pPr, p)) + + for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'): + f = globals()['read_%s' % x] + f(pPr, self) + + for s in XPath('./w:pStyle[@w:val]')(pPr): + self.linked_style = get(s, 'w:val') + + self._css = None + + def update(self, other): + for prop in self.all_properties: + nval = getattr(other, prop) + if nval is not inherit: + setattr(self, prop, nval) + if other.linked_style is not None: + self.linked_style = other.linked_style + + def resolve_based_on(self, parent): + for p in self.all_properties: + val = getattr(self, p) + if val is inherit: + setattr(self, p, getattr(parent, p)) + + @property + def css(self): + if self._css is None: + self._css = c = OrderedDict() + if self.keepLines is True: + c['page-break-inside'] = 'avoid' + if self.pageBreakBefore is True: + c['page-break-before'] = 'always' + for edge in ('left', 'top', 'right', 'bottom'): + val = getattr(self, 'border_%s_width' % edge) + if val is not inherit: + c['border-%s-width' % edge] = '%.3gpt' % val # one key per edge; previously every edge overwrote border-left-width + for x in ('style', 'color'): + val = getattr(self, 'border_%s_%s' % (edge, x)) + if val is not inherit: + c['border-%s-%s' % (edge, x)] = val + val = getattr(self, 'padding_%s' % edge) + if val is not inherit: + c['padding-%s' % edge] = '%.3gpt' % val + val = getattr(self, 'margin_%s' % edge) + if val is not inherit: + c['margin-%s' % edge] = val + + for x in ('text_indent', 'text_align', 'line_height', 'background_color'): + val = getattr(self, x) +
if val is not inherit: + c[x.replace('_', '-')] = val + return self._css + + # TODO: keepNext must be done at markup level + + diff --git a/src/calibre/ebooks/docx/char_styles.py b/src/calibre/ebooks/docx/char_styles.py new file mode 100644 index 0000000000..87203ff44a --- /dev/null +++ b/src/calibre/ebooks/docx/char_styles.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +from collections import OrderedDict +from calibre.ebooks.docx.block_styles import ( # noqa + inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd) +from calibre.ebooks.docx.names import XPath, get + +# Read from XML {{{ +def read_text_border(parent, dest): + border_color = border_style = border_width = padding = inherit + elems = XPath('./w:bdr')(parent) + if elems: + border_color = simple_color('auto') + border_style = 'solid' + border_width = 1 + for elem in elems: + color = get(elem, 'w:color') + if color is not None: + border_color = simple_color(color) + style = get(elem, 'w:val') + if style is not None: + border_style = LINE_STYLES.get(style, 'solid') + space = get(elem, 'w:space') + if space is not None: + try: + padding = float(space) + except (ValueError, TypeError): + pass + sz = get(elem, 'w:sz') + if sz is not None: + # we dont care about art borders (they are only used for page borders) + try: + border_width = min(96, max(2, float(sz))) / 8 + except (ValueError, TypeError): + pass + + setattr(dest, 'border_color', border_color) + setattr(dest, 'border_style', border_style) + setattr(dest, 'border_width', border_width) + setattr(dest, 'padding', padding) + +def read_color(parent, dest): + ans = inherit + for col in XPath('./w:color[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + ans = simple_color(val) + setattr(dest, 'color', ans) + +def read_highlight(parent, 
dest): + ans = inherit + for col in XPath('./w:highlight[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + if not val or val == 'none': + val = 'transparent' + ans = val + setattr(dest, 'highlight', ans) + +def read_lang(parent, dest): + ans = inherit + for col in XPath('./w:lang[@w:val]')(parent): + val = get(col, 'w:val') + if not val: + continue + try: + code = int(val, 16) + except (ValueError, TypeError): + ans = val + else: + from calibre.ebooks.docx.lcid import lcid + val = lcid.get(code, None) + if val: + ans = val + setattr(dest, 'lang', ans) + +def read_letter_spacing(parent, dest): + ans = inherit + for col in XPath('./w:spacing[@w:val]')(parent): + val = simple_float(get(col, 'w:val'), 0.05) + if val is not None: + ans = val + setattr(dest, 'letter_spacing', ans) + +def read_sz(parent, dest): + ans = inherit + for col in XPath('./w:sz[@w:val]')(parent): + val = simple_float(get(col, 'w:val'), 0.5) + if val is not None: + ans = val + setattr(dest, 'font_size', ans) + +def read_underline(parent, dest): + ans = inherit + for col in XPath('./w:u[@w:val]')(parent): + val = get(col, 'w:val') + if val: + ans = 'underline' + setattr(dest, 'text_decoration', ans) + +def read_vert_align(parent, dest): + ans = inherit + for col in XPath('./w:vertAlign[@w:val]')(parent): + val = get(col, 'w:val') + if val and val in {'baseline', 'subscript', 'superscript'}: + ans = val + setattr(dest, 'vert_align', ans) +# }}} + +class RunStyle(object): + + all_properties = { + 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', + 'rtl', 'shadow', 'smallCaps', 'strike', 'vanish', + + 'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color', + 'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang', + } + + toggle_properties = { + 'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'vanish', + } + + def __init__(self, rPr=None): + self.linked_style = 
None + if rPr is None: + for p in self.all_properties: + setattr(self, p, inherit) + else: + for p in ( + 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow', + 'smallCaps', 'strike', 'vanish', + ): + setattr(self, p, binary_property(rPr, p)) + + for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align', 'lang'): + f = globals()['read_%s' % x] + f(rPr, self) + + for s in XPath('./w:rStyle[@w:val]')(rPr): + self.linked_style = get(s, 'w:val') + + self._css = None + + def update(self, other): + for prop in self.all_properties: + nval = getattr(other, prop) + if nval is not inherit: + setattr(self, prop, nval) + if other.linked_style is not None: + self.linked_style = other.linked_style + + def resolve_based_on(self, parent): + for p in self.all_properties: + val = getattr(self, p) + if val is inherit: + setattr(self, p, getattr(parent, p)) + + @property + def css(self): + if self._css is None: + c = self._css = OrderedDict() + td = set() + if self.text_decoration is not inherit: + td.add(self.text_decoration) + if self.strike is True: + td.add('line-through') + if self.dstrike is True: # an unresolved value is the truthy `inherit` object, so test identity with True + td.add('overline') + td.add('line-through') + if td: + c['text-decoration'] = ' '.join(td) + if self.caps is True: + c['text-transform'] = 'uppercase' + if self.i is True: + c['font-style'] = 'italic' + if self.shadow is True: + c['text-shadow'] = '2px 2px' + if self.smallCaps is True: + c['font-variant'] = 'small-caps' + if self.vanish is True: + c['display'] = 'none' + + for x in ('color', 'style', 'width'): + val = getattr(self, 'border_'+x) + if x == 'width' and val is not inherit: + val = '%.3gpt' % val + if val is not inherit: + c['border-%s' % x] = val + if self.padding is not inherit: + c['padding'] = '%.3gpt' % self.padding + + for x in ('color', 'background_color'): + val = getattr(self, x) + if val is not inherit: + c[x.replace('_', '-')] = val + + for x in ('letter_spacing', 'font_size'): + val = getattr(self, x) +
if val is not inherit: + c[x.replace('_', '-')] = '%.3gpt' % val + + if self.highlight is not inherit and self.highlight != 'transparent': + c['background-color'] = self.highlight + return self._css + + def same_border(self, other): + for x in (self, other): + has_border = False + for y in ('color', 'style', 'width'): + if ('border-%s' % y) in x.css: + has_border = True + break + if not has_border: + return False + + s = tuple(self.css.get('border-%s' % y, None) for y in ('color', 'style', 'width')) + o = tuple(other.css.get('border-%s' % y, None) for y in ('color', 'style', 'width')) + return s == o + diff --git a/src/calibre/ebooks/docx/styles.py b/src/calibre/ebooks/docx/styles.py index f88b09bd26..6d41715d6d 100644 --- a/src/calibre/ebooks/docx/styles.py +++ b/src/calibre/ebooks/docx/styles.py @@ -6,356 +6,23 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -from collections import OrderedDict +from collections import OrderedDict, Counter +from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit +from calibre.ebooks.docx.char_styles import RunStyle from calibre.ebooks.docx.names import XPath, get -class Inherit: - pass -inherit = Inherit() - -def binary_property(parent, name): - vals = XPath('./w:%s' % name)(parent) - if not vals: - return inherit - val = get(vals[0], 'w:val', 'on') - return True if val in {'on', '1', 'true'} else False - -def simple_color(col, auto='black'): - if not col or col == 'auto' or len(col) != 6: - return auto - return '#'+col - -def simple_float(val, mult=1.0): - try: - return float(val) * mult - except (ValueError, TypeError, AttributeError, KeyError): - return None - -# Block styles {{{ - -LINE_STYLES = { # {{{ - 'basicBlackDashes': 'dashed', - 'basicBlackDots': 'dotted', - 'basicBlackSquares': 'dashed', - 'basicThinLines': 'solid', - 'dashDotStroked': 'groove', - 'dashed': 'dashed', - 'dashSmallGap': 'dashed', - 'dotDash': 'dashed', - 
'dotDotDash': 'dashed', - 'dotted': 'dotted', - 'double': 'double', - 'inset': 'inset', - 'nil': 'none', - 'none': 'none', - 'outset': 'outset', - 'single': 'solid', - 'thick': 'solid', - 'thickThinLargeGap': 'double', - 'thickThinMediumGap': 'double', - 'thickThinSmallGap' : 'double', - 'thinThickLargeGap': 'double', - 'thinThickMediumGap': 'double', - 'thinThickSmallGap': 'double', - 'thinThickThinLargeGap': 'double', - 'thinThickThinMediumGap': 'double', - 'thinThickThinSmallGap': 'double', - 'threeDEmboss': 'ridge', - 'threeDEngrave': 'groove', - 'triple': 'double', -} # }}} - -def read_border(parent, dest): - tvals = {'padding_%s':inherit, 'border_%s_width':inherit, - 'border_%s_style':inherit, 'border_%s_color':inherit} - vals = {} - for edge in ('left', 'top', 'right', 'bottom'): - vals.update({k % edge:v for k, v in tvals.iteritems()}) - - for border in XPath('./w:pBdr')(parent): - for edge in ('left', 'top', 'right', 'bottom'): - for elem in XPath('./w:%s' % edge): - color = get(elem, 'w:color') - if color is not None: - vals['border_%s_color' % edge] = simple_color(color) - style = get(elem, 'w:val') - if style is not None: - vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid') - space = get(elem, 'w:space') - if space is not None: - try: - vals['padding_%s' % edge] = float(space) - except (ValueError, TypeError): - pass - sz = get(elem, 'w:sz') - if sz is not None: - # we dont care about art borders (they are only used for page borders) - try: - vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8 - except (ValueError, TypeError): - pass - - for key, val in vals.iteritems(): - setattr(dest, key, val) - -def read_indent(parent, dest): - padding_left = padding_right = text_indent = inherit - for indent in XPath('./w:ind')(parent): - l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars') - pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None - if pl is not None: - padding_left = 
'%.3f%s' % (pl, 'em' if lc is not None else 'pt') - - r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars') - pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None - if pr is not None: - padding_right = '%.3f%s' % (pr, 'em' if rc is not None else 'pt') - - h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars') - fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars') - ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else - simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None) - if ti is not None: - text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt') - - setattr(dest, 'margin_left', padding_left) - setattr(dest, 'margin_right', padding_right) - setattr(dest, 'text_indent', text_indent) - -def read_justification(parent, dest): - ans = inherit - for jc in XPath('./w:jc[@w:val]')(parent): - val = get(jc, 'w:val') - if not val: - continue - if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val: - ans = 'justify' - if val in {'left', 'center', 'right',}: - ans = val - setattr(dest, 'text_align', ans) - -def read_spacing(parent, dest): - padding_top = padding_bottom = line_height = inherit - for s in XPath('./w:spacing')(parent): - a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing') - pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None - if pb is not None: - padding_bottom = '%.3f%s' % (pb, 'ex' if al is not None else 'pt') - - b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing') - pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None - if pt is not None: - padding_top = '%.3f%s' % (pt, 'ex' if bl is not None else 
'pt') - - l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto') - if l is not None: - lh = simple_float(l, 0.05) if lr in {'exactly', 'atLeast'} else simple_float(l, 1/240.0) - line_height = '%.3f%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '') - - setattr(dest, 'margin_top', padding_top) - setattr(dest, 'margin_bottom', padding_bottom) - setattr(dest, 'line_height', line_height) - -def read_direction(parent, dest): - ans = inherit - for jc in XPath('./w:textFlow[@w:val]')(parent): - val = get(jc, 'w:val') - if not val: - continue - if 'rl' in val.lower(): - ans = 'rtl' - setattr(dest, 'direction', ans) - -def read_shd(parent, dest): - ans = inherit - for shd in XPath('./w:shd[@w:fill]')(parent): - val = get(shd, 'w:fill') - if val: - ans = simple_color(val, auto='transparent') - setattr(dest, 'background_color', ans) - -class ParagraphStyle(object): - - all_properties = ( - 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', - 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', - 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', - 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', - - # Border margins padding - 'border_left_width', 'border_left_style', 'border_left_color', 'padding_left', - 'border_top_width', 'border_top_style', 'border_top_color', 'padding_top', - 'border_right_width', 'border_right_style', 'border_right_color', 'padding_right', - 'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom', - 'margin_left', 'margin_top', 'margin_right', 'margin_bottom', - - # Misc. 
- 'text_indent', 'text_align', 'line_height', 'direction', 'background_color', - ) - - def __init__(self, pPr): - for p in ( - 'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi', - 'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents', - 'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers', - 'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap', - ): - setattr(self, p, binary_property(pPr, p)) - - for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'): - f = globals()['read_%s' % x] - f(pPr, self) - - # TODO: numPr and outlineLvl - - def update(self, other): - for prop in self.all_properties: - nval = getattr(other, prop) - if nval is not inherit: - setattr(self, prop, nval) - -# }}} - -# Character styles {{{ -def read_text_border(parent, dest): - border_color = border_style = border_width = padding = inherit - elems = XPath('./w:bdr')(parent) - if elems: - border_color = simple_color('auto') - border_style = 'solid' - border_width = 1 - for elem in elems: - color = get(elem, 'w:color') - if color is not None: - border_color = simple_color(color) - style = get(elem, 'w:val') - if style is not None: - border_style = LINE_STYLES.get(style, 'solid') - space = get(elem, 'w:space') - if space is not None: - try: - padding = float(space) - except (ValueError, TypeError): - pass - sz = get(elem, 'w:sz') - if sz is not None: - # we dont care about art borders (they are only used for page borders) - try: - border_width = min(96, max(2, float(sz))) / 8 - except (ValueError, TypeError): - pass - - setattr(dest, 'border_color', border_color) - setattr(dest, 'border_style', border_style) - setattr(dest, 'border_width', border_width) - setattr(dest, 'padding', padding) - -def read_color(parent, dest): - ans = inherit - for col in XPath('./w:color[@w:val]')(parent): - val = get(col, 'w:val') - if not val: - continue - ans = simple_color(val) - setattr(dest, 'color', ans) - -def read_highlight(parent, dest): - ans = inherit - for col 
in XPath('./w:highlight[@w:val]')(parent): - val = get(col, 'w:val') - if not val: - continue - if not val or val == 'none': - val = 'transparent' - ans = val - setattr(dest, 'highlight', ans) - -def read_lang(parent, dest): - ans = inherit - for col in XPath('./w:lang[@w:val]')(parent): - val = get(col, 'w:val') - if not val: - continue - try: - code = int(val, 16) - except (ValueError, TypeError): - ans = val - else: - from calibre.ebooks.docx.lcid import lcid - val = lcid.get(code, None) - if val: - ans = val - setattr(dest, 'lang', ans) - -def read_letter_spacing(parent, dest): - ans = inherit - for col in XPath('./w:spacing[@w:val]')(parent): - val = simple_float(get(col, 'w:val'), 0.05) - if val: - ans = val - setattr(dest, 'letter_spacing', ans) - -def read_sz(parent, dest): - ans = inherit - for col in XPath('./w:sz[@w:val]')(parent): - val = simple_float(get(col, 'w:val'), 0.5) - if val: - ans = val - setattr(dest, 'font_size', ans) - -def read_underline(parent, dest): - ans = inherit - for col in XPath('./w:u[@w:val]')(parent): - val = get(col, 'w:val') - if val: - ans = 'underline' - setattr(dest, 'text_decoration', ans) - -def read_vert_align(parent, dest): - ans = inherit - for col in XPath('./w:vertAlign[@w:val]')(parent): - val = get(col, 'w:val') - if val and val in {'baseline', 'subscript', 'superscript'}: - ans = val - setattr(dest, 'vert_align', ans) - - -class RunStyle(object): - - all_properties = ( - 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow', - 'smallCaps', 'strike', 'vanish', - - 'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background-color', - 'letter_spacing', 'font_size', 'text_decoration', 'vert_align', - ) - - def __init__(self, rPr): - for p in ( - 'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow', - 'smallCaps', 'strike', 'vanish', - ): - setattr(self, p, binary_property(rPr, p)) - - for x in ('text_border', 'color', 
'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align'): - f = globals()['read_%s' % x] - f(rPr, self) - - def update(self, other): - for prop in self.all_properties: - nval = getattr(other, prop) - if nval is not inherit: - setattr(self, prop, nval) -# }}} class Style(object): + ''' + Class representing a element. Can contain block, character, etc. styles. + ''' name_path = XPath('./w:name[@w:val]') based_on_path = XPath('./w:basedOn[@w:val]') - link_path = XPath('./w:link[@w:val]') def __init__(self, elem): + self.resolved = False self.style_id = get(elem, 'w:styleId') self.style_type = get(elem, 'w:type') names = self.name_path(elem) @@ -364,10 +31,6 @@ class Style(object): self.based_on = get(based_on[0], 'w:val') if based_on else None if self.style_type == 'numbering': self.based_on = None - link = self.link_path(elem) - self.link = get(link[0], 'w:val') if link else None - if self.style_type not in {'paragraph', 'character'}: - self.link = None self.paragraph_style = self.character_style = None @@ -387,11 +50,30 @@ class Style(object): else: self.character_style.update(rs) + def resolve_based_on(self, parent): + if parent.paragraph_style is not None: + if self.paragraph_style is None: + self.paragraph_style = ParagraphStyle() + self.paragraph_style.resolve_based_on(parent.paragraph_style) + if parent.character_style is not None: + if self.character_style is None: + self.character_style = RunStyle() + self.character_style.resolve_based_on(parent.character_style) + class Styles(object): + ''' + Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup. 
+ ''' + def __init__(self): self.id_map = OrderedDict() + self.para_cache = {} + self.para_char_cache = {} + self.run_cache = {} + self.classes = {} + self.counter = Counter() def __iter__(self): for s in self.id_map.itervalues(): @@ -412,19 +94,160 @@ class Styles(object): if s.style_id: self.id_map[s.style_id] = s - # Nuke based_on, link attributes that refer to missing/incompatible - # styles + self.default_paragraph_style = self.default_character_style = None + + for dd in XPath('./w:docDefaults')(root): + for pd in XPath('./w:pPrDefault')(dd): + for pPr in XPath('./w:pPr')(pd): + ps = ParagraphStyle(pPr) + if self.default_paragraph_style is None: + self.default_paragraph_style = ps + else: + self.default_paragraph_style.update(ps) + for pd in XPath('./w:rPrDefault')(dd): + for pPr in XPath('./w:rPr')(pd): + ps = RunStyle(pPr) + if self.default_character_style is None: + self.default_character_style = ps + else: + self.default_character_style.update(ps) + + def resolve(s, p): + if p is not None: + if not p.resolved: + resolve(p, self.get(p.based_on)) + s.resolve_based_on(p) + s.resolved = True + for s in self: - bo = s.based_on - if bo is not None: - p = self.get(bo) - if p is None or p.style_type != s.style_type: - s.based_on = None - link = s.link - if link is not None: - p = self.get(link) - if p is None or (s.style_type, p.style_type) not in {('paragraph', 'character'), ('character', 'paragraph')}: - s.link = None + if not s.resolved: + resolve(s, self.get(s.based_on)) - # TODO: Document defaults (docDefaults) + def para_val(self, parent_styles, direct_formatting, attr): + val = getattr(direct_formatting, attr) + if val is inherit: + for ps in reversed(parent_styles): + pval = getattr(ps, attr) + if pval is not inherit: + val = pval + break + return val + + def run_val(self, parent_styles, direct_formatting, attr): + val = getattr(direct_formatting, attr) + if val is not inherit: + return val + if attr in direct_formatting.toggle_properties: + val = False + 
+ for rs in parent_styles: + pval = getattr(rs, attr) + if pval is True: + val ^= True + return val + for rs in reversed(parent_styles): + rval = getattr(rs, attr) + if rval is not inherit: + return rval + return val + + def resolve_paragraph(self, p): + ans = self.para_cache.get(p, None) + if ans is None: + ans = self.para_cache[p] = ParagraphStyle() + ans.style_name = None + direct_formatting = None + for pPr in XPath('./w:pPr')(p): + ps = ParagraphStyle(pPr) + if direct_formatting is None: + direct_formatting = ps + else: + direct_formatting.update(ps) + + if direct_formatting is None: + direct_formatting = ParagraphStyle() + parent_styles = [] + if self.default_paragraph_style is not None: + parent_styles.append(self.default_paragraph_style) + if direct_formatting.linked_style is not None: + ls = self.get(direct_formatting.linked_style) + if ls is not None: + ans.style_name = ls.name + ps = ls.paragraph_style + if ps is not None: + parent_styles.append(ps) + if ls.character_style is not None: + self.para_char_cache[p] = ls.character_style + + for attr in ans.all_properties: + setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr)) + return ans + + def resolve_run(self, r): + ans = self.run_cache.get(r, None) + if ans is None: + p = r.getparent() + ans = self.run_cache[r] = RunStyle() + direct_formatting = None + for rPr in XPath('./w:rPr')(r): + rs = RunStyle(rPr) + if direct_formatting is None: + direct_formatting = rs + else: + direct_formatting.update(rs) + + if direct_formatting is None: + direct_formatting = RunStyle() + + parent_styles = [] + if self.default_character_style is not None: + parent_styles.append(self.default_character_style) + pstyle = self.para_char_cache.get(p, None) + if pstyle is not None: + parent_styles.append(pstyle) + if direct_formatting.linked_style is not None: + ls = getattr(self.get(direct_formatting.linked_style), 'character_style', None) # the referenced style may be undefined, as resolve_paragraph already allows for + if ls is not None: + parent_styles.append(ls) + + for attr in ans.all_properties: +
setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr)) + + return ans + + def resolve(self, obj): + if obj.tag.endswith('}p'): + return self.resolve_paragraph(obj) + if obj.tag.endswith('}r'): + return self.resolve_run(obj) + + def register(self, css, prefix): + h = hash(tuple(css.iteritems())) + ans, _ = self.classes.get(h, (None, None)) + if ans is None: + self.counter[prefix] += 1 + ans = '%s_%d' % (prefix, self.counter[prefix]) + self.classes[h] = (ans, css) + return ans + + def generate_classes(self): + for bs in self.para_cache.itervalues(): + css = bs.css + if css: + self.register(css, 'block') + for bs in self.run_cache.itervalues(): + css = bs.css + if css: + self.register(css, 'text') + + def class_name(self, css): + h = hash(tuple(css.iteritems())) + return self.classes.get(h, (None, None))[0] + + def generate_css(self): + ans = [] + for (cls, css) in sorted(self.classes.itervalues(), key=lambda x:x[0]): + b = ('\t%s: %s;' % (k, v) for k, v in css.iteritems()) + b = '\n'.join(b) + ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';'))) + return '\n'.join(ans) diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py index f0e2c6385d..e443a2084a 100644 --- a/src/calibre/ebooks/docx/to_html.py +++ b/src/calibre/ebooks/docx/to_html.py @@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -import sys, os +import sys, os, re from lxml import html from lxml.html.builder import ( @@ -14,7 +14,7 @@ from lxml.html.builder import ( from calibre.ebooks.docx.container import DOCX, fromstring from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES -from calibre.ebooks.docx.styles import Styles +from calibre.ebooks.docx.styles import Styles, inherit from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 class Text: @@ -35,6 +35,7 @@ class Convert(object): self.mi = self.docx.metadata self.body = 
BODY() self.styles = Styles() + self.object_map = {} self.html = HTML( HEAD( META(charset='utf-8'), @@ -75,6 +76,16 @@ class Convert(object): for child in self.body: child.tail = '\n\t' self.body[-1].tail = '\n' + + self.styles.generate_classes() + for obj, html_obj in self.object_map.iteritems(): + style = self.styles.resolve(obj) + if style is not None: + css = style.css + if css: + cls = self.styles.class_name(css) + if cls: + html_obj.set('class', cls) self.write() def read_styles(self, relationships_by_type): @@ -96,6 +107,10 @@ class Convert(object): raw = html.tostring(self.html, encoding='utf-8', doctype='') with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: f.write(raw) + css = self.styles.generate_css() + if css: + with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f: + f.write(css.encode('utf-8')) def convert_p(self, p): dest = P() @@ -103,10 +118,58 @@ class Convert(object): span = self.convert_run(run) dest.append(span) + style = self.styles.resolve_paragraph(p) + m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE) + if m is not None: + n = min(1, max(6, int(m.group(1)))) + dest.tag = 'h%d' % n + + if style.direction == 'rtl': + dest.set('dir', 'rtl') + + border_runs = [] + common_borders = [] + for span in dest: + run = self.object_map[span] + style = self.styles.resolve_run(run) + if not border_runs or border_runs[-1][1].same_border(style): + border_runs.append((span, style)) + elif border_runs: + if len(border_runs) > 1: + common_borders.append(border_runs) + border_runs = [] + + for border_run in common_borders: + spans = [] + bs = {} + for span, style in border_run: + c = style.css + spans.append(span) + for x in ('width', 'color', 'style'): + val = c.pop('border-%s' % x, None) + if val is not None: + bs['border-%s' % x] = val + if bs: + cls = self.styles.register(bs, 'text_border') + wrapper = self.wrap_elems(spans, SPAN()) + wrapper.set('class', cls) + + self.object_map[p] = dest return dest + def 
wrap_elems(self, elems, wrapper): + p = elems[0].getparent() + idx = p.index(elems[0]) + p.insert(idx, wrapper) + wrapper.tail = elems[-1].tail + elems[-1].tail = None + for elem in elems: + p.remove(elem) + wrapper.append(elem) + def convert_run(self, run): ans = SPAN() + ans.run = run text = Text(ans, 'text', []) for child in run: @@ -121,6 +184,7 @@ class Convert(object): text.buf.append(child.text) elif is_tag(child, 'w:cr'): text.add_elem(BR()) + ans.append(text.elem) elif is_tag(child, 'w:br'): typ = child.get('type', None) if typ in {'column', 'page'}: @@ -132,8 +196,16 @@ class Convert(object): else: br = BR() text.add_elem(br) + ans.append(text.elem) if text.buf: setattr(text.elem, text.attr, ''.join(text.buf)) + + style = self.styles.resolve_run(run) + if style.vert_align in {'superscript', 'subscript'}: + ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup' + if style.lang is not inherit: + ans.lang = style.lang + self.object_map[ans] = run return ans if __name__ == '__main__':