More work on DOCX input

This commit is contained in:
Kovid Goyal 2013-05-09 17:01:45 +05:30
parent 876c0de8e7
commit bcb19457e2
4 changed files with 747 additions and 359 deletions

View File

@ -0,0 +1,265 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.names import XPath, get
class Inherit:
pass
inherit = Inherit()
def binary_property(parent, name):
vals = XPath('./w:%s' % name)(parent)
if not vals:
return inherit
val = get(vals[0], 'w:val', 'on')
return True if val in {'on', '1', 'true'} else False
def simple_color(col, auto='black'):
if not col or col == 'auto' or len(col) != 6:
return auto
return '#'+col
def simple_float(val, mult=1.0):
try:
return float(val) * mult
except (ValueError, TypeError, AttributeError, KeyError):
return None
LINE_STYLES = { # {{{
'basicBlackDashes': 'dashed',
'basicBlackDots': 'dotted',
'basicBlackSquares': 'dashed',
'basicThinLines': 'solid',
'dashDotStroked': 'groove',
'dashed': 'dashed',
'dashSmallGap': 'dashed',
'dotDash': 'dashed',
'dotDotDash': 'dashed',
'dotted': 'dotted',
'double': 'double',
'inset': 'inset',
'nil': 'none',
'none': 'none',
'outset': 'outset',
'single': 'solid',
'thick': 'solid',
'thickThinLargeGap': 'double',
'thickThinMediumGap': 'double',
'thickThinSmallGap' : 'double',
'thinThickLargeGap': 'double',
'thinThickMediumGap': 'double',
'thinThickSmallGap': 'double',
'thinThickThinLargeGap': 'double',
'thinThickThinMediumGap': 'double',
'thinThickThinSmallGap': 'double',
'threeDEmboss': 'ridge',
'threeDEngrave': 'groove',
'triple': 'double',
} # }}}
# Read from XML {{{
def read_border(parent, dest):
tvals = {'padding_%s':inherit, 'border_%s_width':inherit,
'border_%s_style':inherit, 'border_%s_color':inherit}
vals = {}
for edge in ('left', 'top', 'right', 'bottom'):
vals.update({k % edge:v for k, v in tvals.iteritems()})
for border in XPath('./w:pBdr')(parent):
for edge in ('left', 'top', 'right', 'bottom'):
for elem in XPath('./w:%s' % edge):
color = get(elem, 'w:color')
if color is not None:
vals['border_%s_color' % edge] = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
vals['padding_%s' % edge] = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
for key, val in vals.iteritems():
setattr(dest, key, val)
def read_indent(parent, dest):
padding_left = padding_right = text_indent = inherit
for indent in XPath('./w:ind')(parent):
l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
if pl is not None:
padding_left = '%.3g%s' % (pl, 'em' if lc is not None else 'pt')
r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
if pr is not None:
padding_right = '%.3g%s' % (pr, 'em' if rc is not None else 'pt')
h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else
simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
if ti is not None:
text_indent = '%.3g%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')
setattr(dest, 'margin_left', padding_left)
setattr(dest, 'margin_right', padding_right)
setattr(dest, 'text_indent', text_indent)
def read_justification(parent, dest):
ans = inherit
for jc in XPath('./w:jc[@w:val]')(parent):
val = get(jc, 'w:val')
if not val:
continue
if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
ans = 'justify'
if val in {'left', 'center', 'right',}:
ans = val
setattr(dest, 'text_align', ans)
def read_spacing(parent, dest):
padding_top = padding_bottom = line_height = inherit
for s in XPath('./w:spacing')(parent):
a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None
if pb is not None:
padding_bottom = '%.3g%s' % (pb, 'ex' if al is not None else 'pt')
b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None
if pt is not None:
padding_top = '%.3g%s' % (pt, 'ex' if bl is not None else 'pt')
l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
if l is not None:
lh = simple_float(l, 0.05) if lr in {'exactly', 'atLeast'} else simple_float(l, 1/240.0)
line_height = '%.3g%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '')
setattr(dest, 'margin_top', padding_top)
setattr(dest, 'margin_bottom', padding_bottom)
setattr(dest, 'line_height', line_height)
def read_direction(parent, dest):
ans = inherit
for jc in XPath('./w:textFlow[@w:val]')(parent):
val = get(jc, 'w:val')
if not val:
continue
if 'rl' in val.lower():
ans = 'rtl'
setattr(dest, 'direction', ans)
def read_shd(parent, dest):
ans = inherit
for shd in XPath('./w:shd[@w:fill]')(parent):
val = get(shd, 'w:fill')
if val:
ans = simple_color(val, auto='transparent')
setattr(dest, 'background_color', ans)
# }}}
class ParagraphStyle(object):
all_properties = (
'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
# Border margins padding
'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
'margin_left', 'margin_top', 'margin_right', 'margin_bottom',
# Misc.
'text_indent', 'text_align', 'line_height', 'direction', 'background_color',
)
def __init__(self, pPr=None):
self.linked_style = None
if pPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
else:
for p in (
'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
):
setattr(self, p, binary_property(pPr, p))
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'):
f = globals()['read_%s' % x]
f(pPr, self)
for s in XPath('./w:pStyle[@w:val]')(pPr):
self.linked_style = get(s, 'w:val')
self._css = None
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
if other.linked_style is not None:
self.linked_style = other.linked_style
def resolve_based_on(self, parent):
for p in self.all_properties:
val = getattr(self, p)
if val is inherit:
setattr(self, p, getattr(parent, p))
@property
def css(self):
if self._css is None:
self._css = c = OrderedDict()
if self.keepLines is True:
c['page-break-inside'] = 'avoid'
if self.pageBreakBefore is True:
c['page-break-before'] = 'always'
for edge in ('left', 'top', 'right', 'bottom'):
val = getattr(self, 'border_%s_width' % edge)
if val is not inherit:
c['border-left-width'] = '%.3gpt' % val
for x in ('style', 'color'):
val = getattr(self, 'border_%s_%s' % (edge, x))
if val is not inherit:
c['border-%s-%s' % (edge, x)] = val
val = getattr(self, 'padding_%s' % edge)
if val is not inherit:
c['padding-%s' % edge] = '%.3gpt' % val
val = getattr(self, 'margin_%s' % edge)
if val is not inherit:
c['margin-%s' % edge] = val
for x in ('text_indent', 'text_align', 'line_height', 'background_color'):
val = getattr(self, x)
if val is not inherit:
c[x.replace('_', '-')] = val
return self._css
# TODO: keepNext must be done at markup level

View File

@ -0,0 +1,228 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.block_styles import ( # noqa
inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
from calibre.ebooks.docx.names import XPath, get
# Read from XML {{{
def read_text_border(parent, dest):
border_color = border_style = border_width = padding = inherit
elems = XPath('./w:bdr')(parent)
if elems:
border_color = simple_color('auto')
border_style = 'solid'
border_width = 1
for elem in elems:
color = get(elem, 'w:color')
if color is not None:
border_color = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
border_style = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
padding = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
border_width = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
setattr(dest, 'border_color', border_color)
setattr(dest, 'border_style', border_style)
setattr(dest, 'border_width', border_width)
setattr(dest, 'padding', padding)
def read_color(parent, dest):
ans = inherit
for col in XPath('./w:color[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
ans = simple_color(val)
setattr(dest, 'color', ans)
def read_highlight(parent, dest):
ans = inherit
for col in XPath('./w:highlight[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
if not val or val == 'none':
val = 'transparent'
ans = val
setattr(dest, 'highlight', ans)
def read_lang(parent, dest):
ans = inherit
for col in XPath('./w:lang[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
try:
code = int(val, 16)
except (ValueError, TypeError):
ans = val
else:
from calibre.ebooks.docx.lcid import lcid
val = lcid.get(code, None)
if val:
ans = val
setattr(dest, 'lang', ans)
def read_letter_spacing(parent, dest):
ans = inherit
for col in XPath('./w:spacing[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.05)
if val is not None:
ans = val
setattr(dest, 'letter_spacing', ans)
def read_sz(parent, dest):
ans = inherit
for col in XPath('./w:sz[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.5)
if val is not None:
ans = val
setattr(dest, 'font_size', ans)
def read_underline(parent, dest):
ans = inherit
for col in XPath('./w:u[@w:val]')(parent):
val = get(col, 'w:val')
if val:
ans = 'underline'
setattr(dest, 'text_decoration', ans)
def read_vert_align(parent, dest):
ans = inherit
for col in XPath('./w:vertAlign[@w:val]')(parent):
val = get(col, 'w:val')
if val and val in {'baseline', 'subscript', 'superscript'}:
ans = val
setattr(dest, 'vert_align', ans)
# }}}
class RunStyle(object):
all_properties = {
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint',
'rtl', 'shadow', 'smallCaps', 'strike', 'vanish',
'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang',
}
toggle_properties = {
'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'vanish',
}
def __init__(self, rPr=None):
self.linked_style = None
if rPr is None:
for p in self.all_properties:
setattr(self, p, inherit)
else:
for p in (
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
'smallCaps', 'strike', 'vanish',
):
setattr(self, p, binary_property(rPr, p))
for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align', 'lang'):
f = globals()['read_%s' % x]
f(rPr, self)
for s in XPath('./w:rStyle[@w:val]')(rPr):
self.linked_style = get(s, 'w:val')
self._css = None
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
if other.linked_style is not None:
self.linked_style = other.linked_style
def resolve_based_on(self, parent):
for p in self.all_properties:
val = getattr(self, p)
if val is inherit:
setattr(self, p, getattr(parent, p))
@property
def css(self):
if self._css is None:
c = self._css = OrderedDict()
td = set()
if self.text_decoration is not inherit:
td.add(self.text_decoration)
if self.strike:
td.add('line-through')
if self.dstrike:
td.add('overline')
td.add('line-through')
if td:
c['text-decoration'] = ' '.join(td)
if self.caps is True:
c['text-transform'] = 'uppercase'
if self.i is True:
c['font-style'] = 'italic'
if self.shadow:
c['text-shadow'] = '2px 2px'
if self.smallCaps is True:
c['font-variant'] = 'small-caps'
if self.vanish is True:
c['display'] = 'none'
for x in ('color', 'style', 'width'):
val = getattr(self, 'border_'+x)
if x == 'width' and val is not inherit:
val = '%.3gpt' % val
if val is not inherit:
c['border-%s' % x] = val
if self.padding is not inherit:
c['padding'] = '%.3gpt' % self.padding
for x in ('color', 'background_color'):
val = getattr(self, x)
if val is not inherit:
c[x.replace('_', '-')] = val
for x in ('letter_spacing', 'font_size'):
val = getattr(self, x)
if val is not inherit:
c[x.replace('_', '-')] = '%.3gpt' % val
if self.highlight is not inherit and self.highlight != 'transparent':
c['background-color'] = self.highlight
return self._css
def same_border(self, other):
for x in (self, other):
has_border = False
for y in ('color', 'style', 'width'):
if ('border-%s' % y) in x.css:
has_border = True
break
if not has_border:
return False
s = tuple(self.css.get('border-%s' % y, None) for y in ('color', 'style', 'width'))
o = tuple(other.css.get('border-%s' % y, None) for y in ('color', 'style', 'width'))
return s == o

View File

@ -6,356 +6,23 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from collections import OrderedDict, Counter
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.names import XPath, get
class Inherit:
pass
inherit = Inherit()
def binary_property(parent, name):
vals = XPath('./w:%s' % name)(parent)
if not vals:
return inherit
val = get(vals[0], 'w:val', 'on')
return True if val in {'on', '1', 'true'} else False
def simple_color(col, auto='black'):
if not col or col == 'auto' or len(col) != 6:
return auto
return '#'+col
def simple_float(val, mult=1.0):
try:
return float(val) * mult
except (ValueError, TypeError, AttributeError, KeyError):
return None
# Block styles {{{
LINE_STYLES = { # {{{
'basicBlackDashes': 'dashed',
'basicBlackDots': 'dotted',
'basicBlackSquares': 'dashed',
'basicThinLines': 'solid',
'dashDotStroked': 'groove',
'dashed': 'dashed',
'dashSmallGap': 'dashed',
'dotDash': 'dashed',
'dotDotDash': 'dashed',
'dotted': 'dotted',
'double': 'double',
'inset': 'inset',
'nil': 'none',
'none': 'none',
'outset': 'outset',
'single': 'solid',
'thick': 'solid',
'thickThinLargeGap': 'double',
'thickThinMediumGap': 'double',
'thickThinSmallGap' : 'double',
'thinThickLargeGap': 'double',
'thinThickMediumGap': 'double',
'thinThickSmallGap': 'double',
'thinThickThinLargeGap': 'double',
'thinThickThinMediumGap': 'double',
'thinThickThinSmallGap': 'double',
'threeDEmboss': 'ridge',
'threeDEngrave': 'groove',
'triple': 'double',
} # }}}
def read_border(parent, dest):
tvals = {'padding_%s':inherit, 'border_%s_width':inherit,
'border_%s_style':inherit, 'border_%s_color':inherit}
vals = {}
for edge in ('left', 'top', 'right', 'bottom'):
vals.update({k % edge:v for k, v in tvals.iteritems()})
for border in XPath('./w:pBdr')(parent):
for edge in ('left', 'top', 'right', 'bottom'):
for elem in XPath('./w:%s' % edge):
color = get(elem, 'w:color')
if color is not None:
vals['border_%s_color' % edge] = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
vals['padding_%s' % edge] = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
for key, val in vals.iteritems():
setattr(dest, key, val)
def read_indent(parent, dest):
padding_left = padding_right = text_indent = inherit
for indent in XPath('./w:ind')(parent):
l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
if pl is not None:
padding_left = '%.3f%s' % (pl, 'em' if lc is not None else 'pt')
r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
if pr is not None:
padding_right = '%.3f%s' % (pr, 'em' if rc is not None else 'pt')
h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else
simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
if ti is not None:
text_indent = '%.3f%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')
setattr(dest, 'margin_left', padding_left)
setattr(dest, 'margin_right', padding_right)
setattr(dest, 'text_indent', text_indent)
def read_justification(parent, dest):
ans = inherit
for jc in XPath('./w:jc[@w:val]')(parent):
val = get(jc, 'w:val')
if not val:
continue
if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
ans = 'justify'
if val in {'left', 'center', 'right',}:
ans = val
setattr(dest, 'text_align', ans)
def read_spacing(parent, dest):
padding_top = padding_bottom = line_height = inherit
for s in XPath('./w:spacing')(parent):
a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None
if pb is not None:
padding_bottom = '%.3f%s' % (pb, 'ex' if al is not None else 'pt')
b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None
if pt is not None:
padding_top = '%.3f%s' % (pt, 'ex' if bl is not None else 'pt')
l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
if l is not None:
lh = simple_float(l, 0.05) if lr in {'exactly', 'atLeast'} else simple_float(l, 1/240.0)
line_height = '%.3f%s' % (lh, 'pt' if lr in {'exactly', 'atLeast'} else '')
setattr(dest, 'margin_top', padding_top)
setattr(dest, 'margin_bottom', padding_bottom)
setattr(dest, 'line_height', line_height)
def read_direction(parent, dest):
ans = inherit
for jc in XPath('./w:textFlow[@w:val]')(parent):
val = get(jc, 'w:val')
if not val:
continue
if 'rl' in val.lower():
ans = 'rtl'
setattr(dest, 'direction', ans)
def read_shd(parent, dest):
ans = inherit
for shd in XPath('./w:shd[@w:fill]')(parent):
val = get(shd, 'w:fill')
if val:
ans = simple_color(val, auto='transparent')
setattr(dest, 'background_color', ans)
class ParagraphStyle(object):
all_properties = (
'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
# Border margins padding
'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
'margin_left', 'margin_top', 'margin_right', 'margin_bottom',
# Misc.
'text_indent', 'text_align', 'line_height', 'direction', 'background_color',
)
def __init__(self, pPr):
for p in (
'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
):
setattr(self, p, binary_property(pPr, p))
for x in ('border', 'indent', 'justification', 'spacing', 'direction', 'shd'):
f = globals()['read_%s' % x]
f(pPr, self)
# TODO: numPr and outlineLvl
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
# }}}
# Character styles {{{
def read_text_border(parent, dest):
border_color = border_style = border_width = padding = inherit
elems = XPath('./w:bdr')(parent)
if elems:
border_color = simple_color('auto')
border_style = 'solid'
border_width = 1
for elem in elems:
color = get(elem, 'w:color')
if color is not None:
border_color = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
border_style = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
padding = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
border_width = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
setattr(dest, 'border_color', border_color)
setattr(dest, 'border_style', border_style)
setattr(dest, 'border_width', border_width)
setattr(dest, 'padding', padding)
def read_color(parent, dest):
ans = inherit
for col in XPath('./w:color[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
ans = simple_color(val)
setattr(dest, 'color', ans)
def read_highlight(parent, dest):
ans = inherit
for col in XPath('./w:highlight[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
if not val or val == 'none':
val = 'transparent'
ans = val
setattr(dest, 'highlight', ans)
def read_lang(parent, dest):
ans = inherit
for col in XPath('./w:lang[@w:val]')(parent):
val = get(col, 'w:val')
if not val:
continue
try:
code = int(val, 16)
except (ValueError, TypeError):
ans = val
else:
from calibre.ebooks.docx.lcid import lcid
val = lcid.get(code, None)
if val:
ans = val
setattr(dest, 'lang', ans)
def read_letter_spacing(parent, dest):
ans = inherit
for col in XPath('./w:spacing[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.05)
if val:
ans = val
setattr(dest, 'letter_spacing', ans)
def read_sz(parent, dest):
ans = inherit
for col in XPath('./w:sz[@w:val]')(parent):
val = simple_float(get(col, 'w:val'), 0.5)
if val:
ans = val
setattr(dest, 'font_size', ans)
def read_underline(parent, dest):
ans = inherit
for col in XPath('./w:u[@w:val]')(parent):
val = get(col, 'w:val')
if val:
ans = 'underline'
setattr(dest, 'text_decoration', ans)
def read_vert_align(parent, dest):
ans = inherit
for col in XPath('./w:vertAlign[@w:val]')(parent):
val = get(col, 'w:val')
if val and val in {'baseline', 'subscript', 'superscript'}:
ans = val
setattr(dest, 'vert_align', ans)
class RunStyle(object):
all_properties = (
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
'smallCaps', 'strike', 'vanish',
'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background-color',
'letter_spacing', 'font_size', 'text_decoration', 'vert_align',
)
def __init__(self, rPr):
for p in (
'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
'smallCaps', 'strike', 'vanish',
):
setattr(self, p, binary_property(rPr, p))
for x in ('text_border', 'color', 'highlight', 'shd', 'letter_spacing', 'sz', 'underline', 'vert_align'):
f = globals()['read_%s' % x]
f(rPr, self)
def update(self, other):
for prop in self.all_properties:
nval = getattr(other, prop)
if nval is not inherit:
setattr(self, prop, nval)
# }}}
class Style(object):
'''
Class representing a <w:style> element. Can contain block, character, etc. styles.
'''
name_path = XPath('./w:name[@w:val]')
based_on_path = XPath('./w:basedOn[@w:val]')
link_path = XPath('./w:link[@w:val]')
def __init__(self, elem):
self.resolved = False
self.style_id = get(elem, 'w:styleId')
self.style_type = get(elem, 'w:type')
names = self.name_path(elem)
@ -364,10 +31,6 @@ class Style(object):
self.based_on = get(based_on[0], 'w:val') if based_on else None
if self.style_type == 'numbering':
self.based_on = None
link = self.link_path(elem)
self.link = get(link[0], 'w:val') if link else None
if self.style_type not in {'paragraph', 'character'}:
self.link = None
self.paragraph_style = self.character_style = None
@ -387,11 +50,30 @@ class Style(object):
else:
self.character_style.update(rs)
def resolve_based_on(self, parent):
if parent.paragraph_style is not None:
if self.paragraph_style is None:
self.paragraph_style = ParagraphStyle()
self.paragraph_style.resolve_based_on(parent.paragraph_style)
if parent.character_style is not None:
if self.character_style is None:
self.character_style = RunStyle()
self.character_style.resolve_based_on(parent.character_style)
class Styles(object):
'''
Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
'''
def __init__(self):
self.id_map = OrderedDict()
self.para_cache = {}
self.para_char_cache = {}
self.run_cache = {}
self.classes = {}
self.counter = Counter()
def __iter__(self):
for s in self.id_map.itervalues():
@ -412,19 +94,160 @@ class Styles(object):
if s.style_id:
self.id_map[s.style_id] = s
# Nuke based_on, link attributes that refer to missing/incompatible
# styles
self.default_paragraph_style = self.default_character_style = None
for dd in XPath('./w:docDefaults')(root):
for pd in XPath('./w:pPrDefault')(dd):
for pPr in XPath('./w:pPr')(pd):
ps = ParagraphStyle(pPr)
if self.default_paragraph_style is None:
self.default_paragraph_style = ps
else:
self.default_paragraph_style.update(ps)
for pd in XPath('./w:rPrDefault')(dd):
for pPr in XPath('./w:rPr')(pd):
ps = RunStyle(pPr)
if self.default_character_style is None:
self.default_character_style = ps
else:
self.default_character_style.update(ps)
def resolve(s, p):
if p is not None:
if not p.resolved:
resolve(p, self.get(p.based_on))
s.resolve_based_on(p)
s.resolved = True
for s in self:
bo = s.based_on
if bo is not None:
p = self.get(bo)
if p is None or p.style_type != s.style_type:
s.based_on = None
link = s.link
if link is not None:
p = self.get(link)
if p is None or (s.style_type, p.style_type) not in {('paragraph', 'character'), ('character', 'paragraph')}:
s.link = None
if not s.resolved:
resolve(s, self.get(s.based_on))
# TODO: Document defaults (docDefaults)
def para_val(self, parent_styles, direct_formatting, attr):
val = getattr(direct_formatting, attr)
if val is inherit:
for ps in reversed(parent_styles):
pval = getattr(ps, attr)
if pval is not inherit:
val = pval
break
return val
def run_val(self, parent_styles, direct_formatting, attr):
val = getattr(direct_formatting, attr)
if val is not inherit:
return val
if attr in direct_formatting.toggle_properties:
val = False
for rs in parent_styles:
pval = getattr(rs, attr)
if pval is True:
val ^= True
return val
for rs in reversed(parent_styles):
rval = getattr(rs, attr)
if rval is not inherit:
return rval
return val
def resolve_paragraph(self, p):
ans = self.para_cache.get(p, None)
if ans is None:
ans = self.para_cache[p] = ParagraphStyle()
ans.style_name = None
direct_formatting = None
for pPr in XPath('./w:pPr')(p):
ps = ParagraphStyle(pPr)
if direct_formatting is None:
direct_formatting = ps
else:
direct_formatting.update(ps)
if direct_formatting is None:
direct_formatting = ParagraphStyle()
parent_styles = []
if self.default_paragraph_style is not None:
parent_styles.append(self.default_paragraph_style)
if direct_formatting.linked_style is not None:
ls = self.get(direct_formatting.linked_style)
if ls is not None:
ans.style_name = ls.name
ps = ls.paragraph_style
if ps is not None:
parent_styles.append(ps)
if ls.character_style is not None:
self.para_char_cache[p] = ls.character_style
for attr in ans.all_properties:
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
return ans
def resolve_run(self, r):
ans = self.run_cache.get(r, None)
if ans is None:
p = r.getparent()
ans = self.run_cache[r] = RunStyle()
direct_formatting = None
for rPr in XPath('./w:rPr')(r):
rs = RunStyle(rPr)
if direct_formatting is None:
direct_formatting = rs
else:
direct_formatting.update(rs)
if direct_formatting is None:
direct_formatting = RunStyle()
parent_styles = []
if self.default_character_style is not None:
parent_styles.append(self.default_character_style)
pstyle = self.para_char_cache.get(p, None)
if pstyle is not None:
parent_styles.append(pstyle)
if direct_formatting.linked_style is not None:
ls = self.get(direct_formatting.linked_style).character_style
if ls is not None:
parent_styles.append(ls)
for attr in ans.all_properties:
setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))
return ans
def resolve(self, obj):
if obj.tag.endswith('}p'):
return self.resolve_paragraph(obj)
if obj.tag.endswith('}r'):
return self.resolve_run(obj)
def register(self, css, prefix):
h = hash(tuple(css.iteritems()))
ans, _ = self.classes.get(h, (None, None))
if ans is None:
self.counter[prefix] += 1
ans = '%s_%d' % (prefix, self.counter[prefix])
self.classes[h] = (ans, css)
return ans
def generate_classes(self):
for bs in self.para_cache.itervalues():
css = bs.css
if css:
self.register(css, 'block')
for bs in self.run_cache.itervalues():
css = bs.css
if css:
self.register(css, 'text')
def class_name(self, css):
h = hash(tuple(css.iteritems()))
return self.classes.get(h, (None, None))[0]
def generate_css(self):
ans = []
for (cls, css) in sorted(self.classes.itervalues(), key=lambda x:x[0]):
b = ('\t%s: %s;' % (k, v) for k, v in css.iteritems())
b = '\n'.join(b)
ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
return '\n'.join(ans)

View File

@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os
import sys, os, re
from lxml import html
from lxml.html.builder import (
@ -14,7 +14,7 @@ from lxml.html.builder import (
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XPath, is_tag, barename, XML, STYLES
from calibre.ebooks.docx.styles import Styles
from calibre.ebooks.docx.styles import Styles, inherit
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
class Text:
@ -35,6 +35,7 @@ class Convert(object):
self.mi = self.docx.metadata
self.body = BODY()
self.styles = Styles()
self.object_map = {}
self.html = HTML(
HEAD(
META(charset='utf-8'),
@ -75,6 +76,16 @@ class Convert(object):
for child in self.body:
child.tail = '\n\t'
self.body[-1].tail = '\n'
self.styles.generate_classes()
for obj, html_obj in self.object_map.iteritems():
style = self.styles.resolve(obj)
if style is not None:
css = style.css
if css:
cls = self.styles.class_name(css)
if cls:
html_obj.set('class', cls)
self.write()
def read_styles(self, relationships_by_type):
@ -96,6 +107,10 @@ class Convert(object):
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw)
css = self.styles.generate_css()
if css:
with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
f.write(css.encode('utf-8'))
def convert_p(self, p):
dest = P()
@ -103,10 +118,58 @@ class Convert(object):
span = self.convert_run(run)
dest.append(span)
style = self.styles.resolve_paragraph(p)
m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
if m is not None:
n = min(1, max(6, int(m.group(1))))
dest.tag = 'h%d' % n
if style.direction == 'rtl':
dest.set('dir', 'rtl')
border_runs = []
common_borders = []
for span in dest:
run = self.object_map[span]
style = self.styles.resolve_run(run)
if not border_runs or border_runs[-1][1].same_border(style):
border_runs.append((span, style))
elif border_runs:
if len(border_runs) > 1:
common_borders.append(border_runs)
border_runs = []
for border_run in common_borders:
spans = []
bs = {}
for span, style in border_run:
c = style.css
spans.append(span)
for x in ('width', 'color', 'style'):
val = c.pop('border-%s' % x, None)
if val is not None:
bs['border-%s' % x] = val
if bs:
cls = self.styles.register(bs, 'text_border')
wrapper = self.wrap_elems(spans, SPAN())
wrapper.set('class', cls)
self.object_map[p] = dest
return dest
def wrap_elems(self, elems, wrapper):
p = elems[0].getparent()
idx = p.index(elems[0])
p.insert(idx, wrapper)
wrapper.tail = elems[-1].tail
elems[-1].tail = None
for elem in elems:
p.remove(elem)
wrapper.append(elem)
def convert_run(self, run):
ans = SPAN()
ans.run = run
text = Text(ans, 'text', [])
for child in run:
@ -121,6 +184,7 @@ class Convert(object):
text.buf.append(child.text)
elif is_tag(child, 'w:cr'):
text.add_elem(BR())
ans.append(text.elem)
elif is_tag(child, 'w:br'):
typ = child.get('type', None)
if typ in {'column', 'page'}:
@ -132,8 +196,16 @@ class Convert(object):
else:
br = BR()
text.add_elem(br)
ans.append(text.elem)
if text.buf:
setattr(text.elem, text.attr, ''.join(text.buf))
style = self.styles.resolve_run(run)
if style.vert_align in {'superscript', 'subscript'}:
ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
if style.lang is not inherit:
ans.lang = style.lang
self.object_map[ans] = run
return ans
if __name__ == '__main__':