DOCX Output: Nicer organization of output styles.

Styles are now merged into block styles that contain both paragraph and
character formatting (for the most common character style in each
block). All block styles inherit the Normal style and override only what
is different.
This commit is contained in:
Kovid Goyal 2015-05-11 07:04:16 +05:30
parent 2ddf85a341
commit 35699b1f7b
3 changed files with 167 additions and 96 deletions

View File

@ -48,7 +48,7 @@ class DOCXOutput(OutputFormatPlugin):
from calibre.ebooks.docx.writer.from_html import Convert
docx = DOCX(opts, log)
self.convert_metadata(oeb)
Convert(oeb, docx)()
Convert(oeb, docx, self.mi)()
docx.write(output_path, self.mi)
if opts.extract_to:
from calibre.ebooks.docx.dump import do_dump

View File

@ -53,6 +53,7 @@ class TextRun(object):
self.style = style
self.texts = []
self.link = None
self.parent_style = None
self.makelement = namespace.makeelement
def add_text(self, text, preserve_whitespace, bookmark=None, link=None):
@ -75,8 +76,9 @@ class TextRun(object):
makeelement = self.makelement
parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link)
r = makeelement(parent, 'w:r')
rpr = makeelement(r, 'w:rPr')
makeelement(rpr, 'w:rStyle', w_val=self.style.id)
if self.parent_style is not self.style:
rpr = makeelement(r, 'w:rPr')
makeelement(rpr, 'w:rStyle', w_val=self.style.id)
for text, preserve_whitespace, bookmark in self.texts:
if bookmark is not None:
@ -104,6 +106,14 @@ class TextRun(object):
return True
return False
@property
def style_weight(self):
ans = 0
for text, preserve_whitespace, bookmark in self.texts:
if isinstance(text, type('')):
ans += len(text)
return ans
class Block(object):
def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
@ -124,6 +134,7 @@ class Block(object):
self.page_break_before = False
self.runs = []
self.skipped = False
self.linked_style = None
def resolve_skipped(self, next_block):
if not self.is_empty():
@ -186,7 +197,8 @@ class Block(object):
numpr = makeelement(ppr, 'w:numPr')
makeelement(numpr, 'w:ilvl', w_val=str(self.numbering_id[1]))
makeelement(numpr, 'w:numId', w_val=str(self.numbering_id[0]))
makeelement(ppr, 'w:pStyle', w_val=self.style.id)
if self.linked_style is not None:
makeelement(ppr, 'w:pStyle', w_val=self.linked_style.id)
if self.is_first_block:
makeelement(ppr, 'w:pageBreakBefore', w_val='off')
for run in self.runs:
@ -311,6 +323,9 @@ class Blocks(object):
self.all_blocks[self.pos].page_break_before = True
self.block_map = {}
def __repr__(self):
return 'Block(%r)' % self.runs
class Convert(object):
# Word does not apply default styling to hyperlinks, so we ensure they get
@ -320,16 +335,17 @@ class Convert(object):
a[href] { text-decoration: underline; color: blue }
'''
def __init__(self, oeb, docx):
def __init__(self, oeb, docx, mi):
self.oeb, self.docx = oeb, docx
self.log, self.opts = docx.log, docx.opts
self.mi = mi
def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager(self.docx.namespace)
self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
self.lists_manager = ListsManager(self.docx)

View File

@ -13,7 +13,7 @@ from lxml import etree
from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.icu import numeric_sort_key
from calibre.utils.localization import lang_as_iso639_1
from tinycss.css21 import CSS21Parser
css_parser = CSS21Parser()
@ -43,6 +43,34 @@ def bmap(x):
def is_dropcaps(html_tag, tag_style):
return len(html_tag) < 2 and len(etree.tostring(html_tag, method='text', encoding=unicode, with_tail=False)) < 5 and tag_style['float'] == 'left'
class CombinedStyle(object):
def __init__(self, bs, rs, blocks, namespace):
self.bs, self.rs, self.blocks = bs, rs, blocks
self.namespace = namespace
self.id = self.name = self.seq = None
def apply(self):
for block in self.blocks:
block.linked_style = self
for run in block.runs:
run.parent_style = self.rs
def serialize(self, styles, normal_style):
makeelement = self.namespace.makeelement
w = lambda x: '{%s}%s' % (self.namespace.namespaces['w'], x)
block = makeelement(styles, 'w:style', w_styleId=self.id, w_type='paragraph')
makeelement(block, 'w:name', w_val=self.name)
makeelement(block, 'w:qFormat')
if self is not normal_style:
makeelement(block, 'w:basedOn', w_val=normal_style.id)
if self.seq == 0:
block.set(w('default'), '1')
pPr = makeelement(block, 'w:pPr')
self.bs.serialize_properties(pPr, normal_style.bs)
rPr = makeelement(block, 'w:rPr')
self.rs.serialize_properties(rPr, normal_style.rs)
class FloatSpec(object):
def __init__(self, namespace, html_tag, tag_style):
@ -134,14 +162,11 @@ class DOCXStyle(object):
__str__ = __repr__
def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement
makeelement = self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
style.append(makeelement(style, 'name', val=self.name))
if self is normal_style:
style.set(w('default'), '1')
else:
if self is not normal_style:
style.append(makeelement(style, 'basedOn', val=normal_style.id))
style.append(makeelement(style, 'qFormat'))
styles.append(style)
return style
@ -235,13 +260,14 @@ class TextStyle(DOCXStyle):
def serialize_borders(self, bdr, normal_style):
w = self.w
if (self.padding not in (None, ignore, 0) and self is normal_style) or self.padding != normal_style.padding:
is_normal_style = self is normal_style
if is_normal_style or self.padding != normal_style.padding:
bdr.set(w('space'), str(0 if self.padding in (None, ignore) else self.padding))
if (self.border_width not in (None, ignore, 0) and self is normal_style) or self.border_width != normal_style.border_width:
if is_normal_style or self.border_width != normal_style.border_width:
bdr.set(w('sz'), str(0 if self.border_width in (None, ignore) else self.border_width))
if (self.border_style not in (None, ignore, 'none') and self is normal_style) or self.border_style != normal_style.border_style:
if is_normal_style or self.border_style != normal_style.border_style:
bdr.set(w('val'), 'none' if self.border_style in (None, ignore) else self.border_style)
if (self.border_color not in (None, ignore, 'auto') and self is normal_style) or self.border_color != normal_style.border_color:
if is_normal_style or self.border_color != normal_style.border_color:
bdr.set(w('color'), 'auto' if self.border_color in (None, ignore) else self.border_color)
return bdr
@ -249,53 +275,58 @@ class TextStyle(DOCXStyle):
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'rPr')
if self is normal_style or self.font_family != normal_style.font_family:
style.append(makeelement(
style, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()}))
for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
val = getattr(self, attr)
if self is normal_style or getattr(normal_style, attr) != val:
for suffix in ('', 'Cs'):
style.append(makeelement(style, name + suffix, val=vmap(val)))
def check_attr(attr):
val = getattr(self, attr)
return (self is normal_style and val is not False and val is not None) or (val != getattr(normal_style, attr))
if check_attr('color'):
style.append(makeelement(style, 'color', val=self.color or 'auto'))
if check_attr('background_color'):
style.append(makeelement(style, 'shd', fill=self.background_color or 'auto'))
if check_attr('underline'):
style.append(makeelement(style, 'u', val='single' if self.underline else 'none'))
if check_attr('dstrike'):
style.append(makeelement(style, 'dstrike', val=bmap(self.dstrike)))
if check_attr('strike'):
style.append(makeelement(style, 'strike', val=bmap(self.strike)))
if check_attr('caps'):
style.append(makeelement(style, 'caps', val=bmap(self.caps)))
if check_attr('small_caps'):
style.append(makeelement(style, 'smallCaps', val=bmap(self.small_caps)))
if check_attr('shadow'):
style.append(makeelement(style, 'shadow', val=bmap(self.shadow)))
if check_attr('spacing'):
style.append(makeelement(style, 'spacing', val=str(self.spacing or 0)))
if (self is normal_style and self.vertical_align in {'superscript', 'subscript'}) or self.vertical_align != normal_style.vertical_align:
if self.vertical_align in {'superscript', 'subscript', 'baseline'}:
style.append(makeelement(style, 'vertAlign', val=self.vertical_align))
else:
style.append(makeelement(style, 'position', val=self.vertical_align))
bdr = self.serialize_borders(makeelement(style, 'bdr'), normal_style)
if bdr.attrib:
style.append(bdr)
self.serialize_properties(style, normal_style)
if len(style) > 0:
style_root.append(style)
return style_root
def serialize_properties(self, rPr, normal_style):
makeelement = self.makeelement
is_normal_style = self is normal_style
if is_normal_style or self.font_family != normal_style.font_family:
rPr.append(makeelement(
rPr, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()}))
for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
val = getattr(self, attr)
if is_normal_style or getattr(normal_style, attr) != val:
for suffix in ('', 'Cs'):
rPr.append(makeelement(rPr, name + suffix, val=vmap(val)))
def check_attr(attr):
val = getattr(self, attr)
return is_normal_style or (val != getattr(normal_style, attr))
if check_attr('color'):
rPr.append(makeelement(rPr, 'color', val=self.color or 'auto'))
if check_attr('background_color'):
rPr.append(makeelement(rPr, 'shd', fill=self.background_color or 'auto'))
if check_attr('underline'):
rPr.append(makeelement(rPr, 'u', val='single' if self.underline else 'none'))
if check_attr('dstrike'):
rPr.append(makeelement(rPr, 'dstrike', val=bmap(self.dstrike)))
if check_attr('strike'):
rPr.append(makeelement(rPr, 'strike', val=bmap(self.strike)))
if check_attr('caps'):
rPr.append(makeelement(rPr, 'caps', val=bmap(self.caps)))
if check_attr('small_caps'):
rPr.append(makeelement(rPr, 'smallCaps', val=bmap(self.small_caps)))
if check_attr('shadow'):
rPr.append(makeelement(rPr, 'shadow', val=bmap(self.shadow)))
if check_attr('spacing'):
rPr.append(makeelement(rPr, 'spacing', val=str(self.spacing or 0)))
if is_normal_style:
rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align if self.vertical_align in {'superscript', 'subscript'} else 'baseline'))
elif self.vertical_align != normal_style.vertical_align:
if self.vertical_align in {'superscript', 'subscript', 'baseline'}:
rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align))
else:
rPr.append(makeelement(rPr, 'position', val=self.vertical_align))
bdr = self.serialize_borders(makeelement(rPr, 'bdr'), normal_style)
if bdr.attrib:
rPr.append(bdr)
def read_css_block_borders(self, css, store_css_style=False):
for edge in border_edges:
if css is None:
@ -385,11 +416,17 @@ class BlockStyle(DOCXStyle):
return bdr
def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement
makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'pPr')
spacing = makeelement(style, 'spacing')
if len(style) > 0:
style_root.append(style)
return style_root
def serialize_properties(self, pPr, normal_style):
makeelement, w = self.makeelement, self.w
spacing = makeelement(pPr, 'spacing')
for edge, attr in {'top':'before', 'bottom':'after'}.iteritems():
getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self))
@ -408,9 +445,9 @@ class BlockStyle(DOCXStyle):
spacing.set(w('lineRule'), 'atLeast')
if spacing.attrib:
style.append(spacing)
pPr.append(spacing)
ind = makeelement(style, 'ind')
ind = makeelement(pPr, 'ind')
for edge in ('left', 'right'):
getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self))
@ -444,35 +481,35 @@ class BlockStyle(DOCXStyle):
ind.set(w('hanging'), str(abs(val)))
ind.set(w('hangingChars'), '0')
if ind.attrib:
style.append(ind)
pPr.append(ind)
if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color:
style.append(makeelement(style, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))
pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))
pbdr = self.serialize_borders(style.makeelement(w('pBdr')), normal_style)
pbdr = self.serialize_borders(pPr.makeelement(w('pBdr')), normal_style)
if len(pbdr):
style.append(pbdr)
pPr.append(pbdr)
if self is normal_style or self.text_align != normal_style.text_align:
style.append(makeelement(style, 'jc', val=self.text_align))
pPr.append(makeelement(pPr, 'jc', val=self.text_align))
if (self is normal_style and self.page_break_before) or self.page_break_before != normal_style.page_break_before:
style.append(makeelement(style, 'pageBreakBefore', val=bmap(self.page_break_before)))
pPr.append(makeelement(pPr, 'pageBreakBefore', val=bmap(self.page_break_before)))
if (self is normal_style and self.keep_lines) or self.keep_lines != normal_style.keep_lines:
style.append(makeelement(style, 'keepLines', val=bmap(self.keep_lines)))
pPr.append(makeelement(pPr, 'keepLines', val=bmap(self.keep_lines)))
if self is not normal_style and self.next_style is not None:
style.append(makeelement(style, 'next', val=self.next_style))
if len(style) > 0:
style_root.append(style)
return style_root
pPr.append(makeelement(pPr, 'next', val=self.next_style))
class StylesManager(object):
def __init__(self, namespace):
def __init__(self, namespace, log, document_lang):
self.namespace = namespace
self.document_lang = lang_as_iso639_1(document_lang) or 'en-US'
if self.document_lang == 'en':
self.document_lang = 'en-US'
self.log = log
self.block_styles, self.text_styles = {}, {}
def create_text_style(self, css_style, is_parent_style=False):
@ -496,37 +533,55 @@ class StylesManager(object):
def finalize(self, blocks):
block_counts, run_counts = Counter(), Counter()
block_rmap, run_rmap = defaultdict(list), defaultdict(list)
used_pairs = defaultdict(list)
for block in blocks:
block_counts[block.style] += 1
bs = block.style
block_counts[bs] += 1
block_rmap[block.style].append(block)
local_run_counts = Counter()
for run in block.runs:
run_counts[run.style] += (0 if run.is_empty() else 1)
count = run.style_weight
run_counts[run.style] += count
local_run_counts[run.style] += count
run_rmap[run.style].append(run)
bnum = len(str(max(1, len(block_counts) - 1)))
for i, (block_style, count) in enumerate(block_counts.most_common()):
if i == 0:
self.normal_block_style = block_style
block_style.id = 'ParagraphNormal'
else:
block_style.id = 'Paragraph%d' % i
block_style.name = '%0{}d Para'.format(bnum) % i
if local_run_counts:
rs = local_run_counts.most_common(1)[0][0]
used_pairs[(bs, rs)].append(block)
rnum = len(str(max(1, len(run_counts) - 1)))
for i, (text_style, count) in enumerate(run_counts.most_common()):
text_style.id = 'Text%d' % i
text_style.name = '%0{}d Text'.format(rnum) % i
text_style.seq = i
if i == 0:
self.normal_text_style = text_style
text_style.id = 'TextNormal'
else:
text_style.id = 'Text%d' % i
text_style.name = '%0{}d Text'.format(rnum) % i
for s in tuple(self.block_styles):
if s.id is None:
self.block_styles.pop(s)
for s in tuple(self.text_styles):
if s.id is None:
self.text_styles.pop(s)
counts = Counter()
for (bs, rs), blocks in used_pairs.iteritems():
s = CombinedStyle(bs, rs, blocks, self.namespace)
counts[s] += sum(1 for b in blocks if not b.is_empty())
snum = len(str(max(1, len(counts) - 1)))
for i, (style, count) in enumerate(counts.most_common()):
if i == 0:
self.normal_style = style
style.id = style.name = 'Normal'
else:
style.id = style.name = 'Para %0{}d'.format(snum) % i
style.seq = i
self.combined_styles = sorted(counts.iterkeys(), key=attrgetter('seq'))
[ls.apply() for ls in self.combined_styles]
self.log.debug('%d Text Styles %d Combined styles' % tuple(map(len, (
self.text_styles, self.combined_styles))))
def serialize(self, styles):
for style in sorted(self.block_styles, key=lambda s:(s is not self.normal_block_style, numeric_sort_key(s.id))):
style.serialize(styles, self.normal_block_style)
for style in sorted(self.text_styles, key=lambda s:(s is not self.normal_text_style, numeric_sort_key(s.id))):
lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
for k in tuple(lang.attrib):
lang.attrib[k] = self.document_lang
for style in self.combined_styles:
style.serialize(styles, self.normal_style)
for style in sorted(self.text_styles, key=attrgetter('seq')):
style.serialize(styles, self.normal_text_style)