DOCX Output: Nicer organization of output styles.

Styles are now merged into block styles that contain both paragraph and
character formatting (for the most common character style in each
block). All block styles inherit the Normal style and override only what
is different.
This commit is contained in:
Kovid Goyal 2015-05-11 07:04:16 +05:30
parent 2ddf85a341
commit 35699b1f7b
3 changed files with 167 additions and 96 deletions

View File

@ -48,7 +48,7 @@ class DOCXOutput(OutputFormatPlugin):
from calibre.ebooks.docx.writer.from_html import Convert from calibre.ebooks.docx.writer.from_html import Convert
docx = DOCX(opts, log) docx = DOCX(opts, log)
self.convert_metadata(oeb) self.convert_metadata(oeb)
Convert(oeb, docx)() Convert(oeb, docx, self.mi)()
docx.write(output_path, self.mi) docx.write(output_path, self.mi)
if opts.extract_to: if opts.extract_to:
from calibre.ebooks.docx.dump import do_dump from calibre.ebooks.docx.dump import do_dump

View File

@ -53,6 +53,7 @@ class TextRun(object):
self.style = style self.style = style
self.texts = [] self.texts = []
self.link = None self.link = None
self.parent_style = None
self.makelement = namespace.makeelement self.makelement = namespace.makeelement
def add_text(self, text, preserve_whitespace, bookmark=None, link=None): def add_text(self, text, preserve_whitespace, bookmark=None, link=None):
@ -75,8 +76,9 @@ class TextRun(object):
makeelement = self.makelement makeelement = self.makelement
parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link) parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link)
r = makeelement(parent, 'w:r') r = makeelement(parent, 'w:r')
rpr = makeelement(r, 'w:rPr') if self.parent_style is not self.style:
makeelement(rpr, 'w:rStyle', w_val=self.style.id) rpr = makeelement(r, 'w:rPr')
makeelement(rpr, 'w:rStyle', w_val=self.style.id)
for text, preserve_whitespace, bookmark in self.texts: for text, preserve_whitespace, bookmark in self.texts:
if bookmark is not None: if bookmark is not None:
@ -104,6 +106,14 @@ class TextRun(object):
return True return True
return False return False
@property
def style_weight(self):
    """Return how much text this run carries, measured in characters.

    Each entry of ``self.texts`` is a ``(text, preserve_whitespace,
    bookmark)`` triple. Only entries whose ``text`` is a string count
    towards the total; any other entry contributes nothing. An empty
    ``self.texts`` gives 0.
    """
    # type('') is the module's native string type
    # NOTE(review): presumably ``unicode`` via unicode_literals -- confirm.
    string_type = type('')
    return sum(
        len(text)
        for text, preserve_whitespace, bookmark in self.texts
        if isinstance(text, string_type)
    )
class Block(object): class Block(object):
def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False): def __init__(self, namespace, styles_manager, links_manager, html_block, style, is_table_cell=False, float_spec=None, is_list_item=False):
@ -124,6 +134,7 @@ class Block(object):
self.page_break_before = False self.page_break_before = False
self.runs = [] self.runs = []
self.skipped = False self.skipped = False
self.linked_style = None
def resolve_skipped(self, next_block): def resolve_skipped(self, next_block):
if not self.is_empty(): if not self.is_empty():
@ -186,7 +197,8 @@ class Block(object):
numpr = makeelement(ppr, 'w:numPr') numpr = makeelement(ppr, 'w:numPr')
makeelement(numpr, 'w:ilvl', w_val=str(self.numbering_id[1])) makeelement(numpr, 'w:ilvl', w_val=str(self.numbering_id[1]))
makeelement(numpr, 'w:numId', w_val=str(self.numbering_id[0])) makeelement(numpr, 'w:numId', w_val=str(self.numbering_id[0]))
makeelement(ppr, 'w:pStyle', w_val=self.style.id) if self.linked_style is not None:
makeelement(ppr, 'w:pStyle', w_val=self.linked_style.id)
if self.is_first_block: if self.is_first_block:
makeelement(ppr, 'w:pageBreakBefore', w_val='off') makeelement(ppr, 'w:pageBreakBefore', w_val='off')
for run in self.runs: for run in self.runs:
@ -311,6 +323,9 @@ class Blocks(object):
self.all_blocks[self.pos].page_break_before = True self.all_blocks[self.pos].page_break_before = True
self.block_map = {} self.block_map = {}
def __repr__(self):
    """Return a debugging representation built from ``self.runs``."""
    # NOTE(review): in this diff the method follows code that uses
    # self.all_blocks / self.block_map, yet it prints 'Block(...)' and
    # reads self.runs -- confirm the enclosing class defines ``runs``.
    runs = self.runs
    return 'Block(%r)' % runs
class Convert(object): class Convert(object):
# Word does not apply default styling to hyperlinks, so we ensure they get # Word does not apply default styling to hyperlinks, so we ensure they get
@ -320,16 +335,17 @@ class Convert(object):
a[href] { text-decoration: underline; color: blue } a[href] { text-decoration: underline; color: blue }
''' '''
def __init__(self, oeb, docx): def __init__(self, oeb, docx, mi):
self.oeb, self.docx = oeb, docx self.oeb, self.docx = oeb, docx
self.log, self.opts = docx.log, docx.opts self.log, self.opts = docx.log, docx.opts
self.mi = mi
def __call__(self): def __call__(self):
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
self.svg_rasterizer = SVGRasterizer(base_css=self.base_css) self.svg_rasterizer = SVGRasterizer(base_css=self.base_css)
self.svg_rasterizer(self.oeb, self.opts) self.svg_rasterizer(self.oeb, self.opts)
self.styles_manager = StylesManager(self.docx.namespace) self.styles_manager = StylesManager(self.docx.namespace, self.log, self.mi.language)
self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships) self.links_manager = LinksManager(self.docx.namespace, self.docx.document_relationships)
self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships) self.images_manager = ImagesManager(self.oeb, self.docx.document_relationships)
self.lists_manager = ListsManager(self.docx) self.lists_manager = ListsManager(self.docx)

View File

@ -13,7 +13,7 @@ from lxml import etree
from calibre.ebooks import parse_css_length from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.icu import numeric_sort_key from calibre.utils.localization import lang_as_iso639_1
from tinycss.css21 import CSS21Parser from tinycss.css21 import CSS21Parser
css_parser = CSS21Parser() css_parser = CSS21Parser()
@ -43,6 +43,34 @@ def bmap(x):
def is_dropcaps(html_tag, tag_style): def is_dropcaps(html_tag, tag_style):
return len(html_tag) < 2 and len(etree.tostring(html_tag, method='text', encoding=unicode, with_tail=False)) < 5 and tag_style['float'] == 'left' return len(html_tag) < 2 and len(etree.tostring(html_tag, method='text', encoding=unicode, with_tail=False)) < 5 and tag_style['float'] == 'left'
class CombinedStyle(object):
    """A paragraph style that bundles a block style with a text style.

    ``bs`` supplies the paragraph properties and ``rs`` the character
    properties; ``blocks`` are the blocks that use this pairing.
    ``id``, ``name`` and ``seq`` start out as None and are expected to be
    assigned by the caller before ``serialize`` is used.
    """

    def __init__(self, bs, rs, blocks, namespace):
        self.bs, self.rs, self.blocks = bs, rs, blocks
        self.namespace = namespace
        self.id = self.name = self.seq = None

    def apply(self):
        """Link every block, and every run in those blocks, to this style.

        Each block gets ``linked_style`` set to this object and each of its
        runs gets ``parent_style`` set to this style's text style ``rs``.
        """
        for block in self.blocks:
            block.linked_style = self
            for run in block.runs:
                run.parent_style = self.rs

    def serialize(self, styles, normal_style):
        """Write this style as a ``w:style`` paragraph element under *styles*.

        :param styles: parent element that receives the new ``w:style``.
        :param normal_style: the CombinedStyle all other styles are based
            on; properties are serialized relative to its ``bs`` and ``rs``.
        """
        # NOTE(review): the results of makeelement are discarded below, so
        # namespace.makeelement is assumed to attach the new element to the
        # parent it is given -- confirm against its definition.
        makeelement = self.namespace.makeelement
        block = makeelement(styles, 'w:style', w_styleId=self.id, w_type='paragraph')
        makeelement(block, 'w:name', w_val=self.name)
        # The children of w:style form an ordered sequence in the
        # WordprocessingML schema: basedOn has to precede qFormat.
        if self is not normal_style:
            makeelement(block, 'w:basedOn', w_val=normal_style.id)
        makeelement(block, 'w:qFormat')
        if self.seq == 0:
            # The first style in sequence is flagged as the default one.
            block.set('{%s}default' % self.namespace.namespaces['w'], '1')
        pPr = makeelement(block, 'w:pPr')
        self.bs.serialize_properties(pPr, normal_style.bs)
        rPr = makeelement(block, 'w:rPr')
        self.rs.serialize_properties(rPr, normal_style.rs)
class FloatSpec(object): class FloatSpec(object):
def __init__(self, namespace, html_tag, tag_style): def __init__(self, namespace, html_tag, tag_style):
@ -134,14 +162,11 @@ class DOCXStyle(object):
__str__ = __repr__ __str__ = __repr__
def serialize(self, styles, normal_style): def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement makeelement = self.makeelement
style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE) style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
style.append(makeelement(style, 'name', val=self.name)) style.append(makeelement(style, 'name', val=self.name))
if self is normal_style: if self is not normal_style:
style.set(w('default'), '1')
else:
style.append(makeelement(style, 'basedOn', val=normal_style.id)) style.append(makeelement(style, 'basedOn', val=normal_style.id))
style.append(makeelement(style, 'qFormat'))
styles.append(style) styles.append(style)
return style return style
@ -235,13 +260,14 @@ class TextStyle(DOCXStyle):
def serialize_borders(self, bdr, normal_style): def serialize_borders(self, bdr, normal_style):
w = self.w w = self.w
if (self.padding not in (None, ignore, 0) and self is normal_style) or self.padding != normal_style.padding: is_normal_style = self is normal_style
if is_normal_style or self.padding != normal_style.padding:
bdr.set(w('space'), str(0 if self.padding in (None, ignore) else self.padding)) bdr.set(w('space'), str(0 if self.padding in (None, ignore) else self.padding))
if (self.border_width not in (None, ignore, 0) and self is normal_style) or self.border_width != normal_style.border_width: if is_normal_style or self.border_width != normal_style.border_width:
bdr.set(w('sz'), str(0 if self.border_width in (None, ignore) else self.border_width)) bdr.set(w('sz'), str(0 if self.border_width in (None, ignore) else self.border_width))
if (self.border_style not in (None, ignore, 'none') and self is normal_style) or self.border_style != normal_style.border_style: if is_normal_style or self.border_style != normal_style.border_style:
bdr.set(w('val'), 'none' if self.border_style in (None, ignore) else self.border_style) bdr.set(w('val'), 'none' if self.border_style in (None, ignore) else self.border_style)
if (self.border_color not in (None, ignore, 'auto') and self is normal_style) or self.border_color != normal_style.border_color: if is_normal_style or self.border_color != normal_style.border_color:
bdr.set(w('color'), 'auto' if self.border_color in (None, ignore) else self.border_color) bdr.set(w('color'), 'auto' if self.border_color in (None, ignore) else self.border_color)
return bdr return bdr
@ -249,53 +275,58 @@ class TextStyle(DOCXStyle):
makeelement = self.makeelement makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style) style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'rPr') style = makeelement(style_root, 'rPr')
self.serialize_properties(style, normal_style)
if self is normal_style or self.font_family != normal_style.font_family:
style.append(makeelement(
style, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()}))
for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
val = getattr(self, attr)
if self is normal_style or getattr(normal_style, attr) != val:
for suffix in ('', 'Cs'):
style.append(makeelement(style, name + suffix, val=vmap(val)))
def check_attr(attr):
val = getattr(self, attr)
return (self is normal_style and val is not False and val is not None) or (val != getattr(normal_style, attr))
if check_attr('color'):
style.append(makeelement(style, 'color', val=self.color or 'auto'))
if check_attr('background_color'):
style.append(makeelement(style, 'shd', fill=self.background_color or 'auto'))
if check_attr('underline'):
style.append(makeelement(style, 'u', val='single' if self.underline else 'none'))
if check_attr('dstrike'):
style.append(makeelement(style, 'dstrike', val=bmap(self.dstrike)))
if check_attr('strike'):
style.append(makeelement(style, 'strike', val=bmap(self.strike)))
if check_attr('caps'):
style.append(makeelement(style, 'caps', val=bmap(self.caps)))
if check_attr('small_caps'):
style.append(makeelement(style, 'smallCaps', val=bmap(self.small_caps)))
if check_attr('shadow'):
style.append(makeelement(style, 'shadow', val=bmap(self.shadow)))
if check_attr('spacing'):
style.append(makeelement(style, 'spacing', val=str(self.spacing or 0)))
if (self is normal_style and self.vertical_align in {'superscript', 'subscript'}) or self.vertical_align != normal_style.vertical_align:
if self.vertical_align in {'superscript', 'subscript', 'baseline'}:
style.append(makeelement(style, 'vertAlign', val=self.vertical_align))
else:
style.append(makeelement(style, 'position', val=self.vertical_align))
bdr = self.serialize_borders(makeelement(style, 'bdr'), normal_style)
if bdr.attrib:
style.append(bdr)
if len(style) > 0: if len(style) > 0:
style_root.append(style) style_root.append(style)
return style_root return style_root
def serialize_properties(self, rPr, normal_style):
    """Append this text style's character properties to *rPr*.

    When this style is *normal_style* itself every property is written;
    otherwise only properties whose value differs from the one on
    *normal_style* are written.
    """
    new = self.makeelement
    is_normal = self is normal_style

    def changed(attr):
        # Read our own value first; the comparison is skipped entirely
        # when serializing the normal style.
        own = getattr(self, attr)
        return is_normal or (own != getattr(normal_style, attr))

    if is_normal or self.font_family != normal_style.font_family:
        font_slots = 'ascii cs eastAsia hAnsi'.split()
        rPr.append(new(rPr, 'rFonts', **dict.fromkeys(font_slots, self.font_family)))

    # Size, bold and italic are each written twice: plain and the "Cs" form.
    for tag, attr, convert in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
        own = getattr(self, attr)
        if not is_normal and getattr(normal_style, attr) == own:
            continue
        for suffix in ('', 'Cs'):
            rPr.append(new(rPr, tag + suffix, val=convert(own)))

    if changed('color'):
        rPr.append(new(rPr, 'color', val=self.color or 'auto'))
    if changed('background_color'):
        rPr.append(new(rPr, 'shd', fill=self.background_color or 'auto'))
    if changed('underline'):
        underline_kind = 'single' if self.underline else 'none'
        rPr.append(new(rPr, 'u', val=underline_kind))

    # Simple on/off toggles, emitted in this fixed order.
    toggles = (
        ('dstrike', 'dstrike'),
        ('strike', 'strike'),
        ('caps', 'caps'),
        ('small_caps', 'smallCaps'),
        ('shadow', 'shadow'),
    )
    for attr, tag in toggles:
        if changed(attr):
            rPr.append(new(rPr, tag, val=bmap(getattr(self, attr))))

    if changed('spacing'):
        rPr.append(new(rPr, 'spacing', val=str(self.spacing or 0)))

    valign = self.vertical_align
    if is_normal:
        # The normal style always states its alignment, falling back to baseline.
        shown = valign if valign in {'superscript', 'subscript'} else 'baseline'
        rPr.append(new(rPr, 'vertAlign', val=shown))
    elif valign != normal_style.vertical_align:
        # Any value other than the three keywords is written as a position.
        tag = 'vertAlign' if valign in {'superscript', 'subscript', 'baseline'} else 'position'
        rPr.append(new(rPr, tag, val=valign))

    border = self.serialize_borders(new(rPr, 'bdr'), normal_style)
    if border.attrib:
        rPr.append(border)
def read_css_block_borders(self, css, store_css_style=False): def read_css_block_borders(self, css, store_css_style=False):
for edge in border_edges: for edge in border_edges:
if css is None: if css is None:
@ -385,11 +416,17 @@ class BlockStyle(DOCXStyle):
return bdr return bdr
def serialize(self, styles, normal_style): def serialize(self, styles, normal_style):
w, makeelement = self.w, self.makeelement makeelement = self.makeelement
style_root = DOCXStyle.serialize(self, styles, normal_style) style_root = DOCXStyle.serialize(self, styles, normal_style)
style = makeelement(style_root, 'pPr') style = makeelement(style_root, 'pPr')
spacing = makeelement(style, 'spacing') if len(style) > 0:
style_root.append(style)
return style_root
def serialize_properties(self, pPr, normal_style):
makeelement, w = self.makeelement, self.w
spacing = makeelement(pPr, 'spacing')
for edge, attr in {'top':'before', 'bottom':'after'}.iteritems(): for edge, attr in {'top':'before', 'bottom':'after'}.iteritems():
getter = attrgetter('css_margin_' + edge) getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self)) css_val, css_unit = parse_css_length(getter(self))
@ -408,9 +445,9 @@ class BlockStyle(DOCXStyle):
spacing.set(w('lineRule'), 'atLeast') spacing.set(w('lineRule'), 'atLeast')
if spacing.attrib: if spacing.attrib:
style.append(spacing) pPr.append(spacing)
ind = makeelement(style, 'ind') ind = makeelement(pPr, 'ind')
for edge in ('left', 'right'): for edge in ('left', 'right'):
getter = attrgetter('css_margin_' + edge) getter = attrgetter('css_margin_' + edge)
css_val, css_unit = parse_css_length(getter(self)) css_val, css_unit = parse_css_length(getter(self))
@ -444,35 +481,35 @@ class BlockStyle(DOCXStyle):
ind.set(w('hanging'), str(abs(val))) ind.set(w('hanging'), str(abs(val)))
ind.set(w('hangingChars'), '0') ind.set(w('hangingChars'), '0')
if ind.attrib: if ind.attrib:
style.append(ind) pPr.append(ind)
if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color: if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color:
style.append(makeelement(style, 'shd', val='clear', color='auto', fill=self.background_color or 'auto')) pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))
pbdr = self.serialize_borders(style.makeelement(w('pBdr')), normal_style) pbdr = self.serialize_borders(pPr.makeelement(w('pBdr')), normal_style)
if len(pbdr): if len(pbdr):
style.append(pbdr) pPr.append(pbdr)
if self is normal_style or self.text_align != normal_style.text_align: if self is normal_style or self.text_align != normal_style.text_align:
style.append(makeelement(style, 'jc', val=self.text_align)) pPr.append(makeelement(pPr, 'jc', val=self.text_align))
if (self is normal_style and self.page_break_before) or self.page_break_before != normal_style.page_break_before: if (self is normal_style and self.page_break_before) or self.page_break_before != normal_style.page_break_before:
style.append(makeelement(style, 'pageBreakBefore', val=bmap(self.page_break_before))) pPr.append(makeelement(pPr, 'pageBreakBefore', val=bmap(self.page_break_before)))
if (self is normal_style and self.keep_lines) or self.keep_lines != normal_style.keep_lines: if (self is normal_style and self.keep_lines) or self.keep_lines != normal_style.keep_lines:
style.append(makeelement(style, 'keepLines', val=bmap(self.keep_lines))) pPr.append(makeelement(pPr, 'keepLines', val=bmap(self.keep_lines)))
if self is not normal_style and self.next_style is not None: if self is not normal_style and self.next_style is not None:
style.append(makeelement(style, 'next', val=self.next_style)) pPr.append(makeelement(pPr, 'next', val=self.next_style))
if len(style) > 0:
style_root.append(style)
return style_root
class StylesManager(object): class StylesManager(object):
def __init__(self, namespace): def __init__(self, namespace, log, document_lang):
self.namespace = namespace self.namespace = namespace
self.document_lang = lang_as_iso639_1(document_lang) or 'en-US'
if self.document_lang == 'en':
self.document_lang = 'en-US'
self.log = log
self.block_styles, self.text_styles = {}, {} self.block_styles, self.text_styles = {}, {}
def create_text_style(self, css_style, is_parent_style=False): def create_text_style(self, css_style, is_parent_style=False):
@ -496,37 +533,55 @@ class StylesManager(object):
def finalize(self, blocks): def finalize(self, blocks):
block_counts, run_counts = Counter(), Counter() block_counts, run_counts = Counter(), Counter()
block_rmap, run_rmap = defaultdict(list), defaultdict(list) block_rmap, run_rmap = defaultdict(list), defaultdict(list)
used_pairs = defaultdict(list)
for block in blocks: for block in blocks:
block_counts[block.style] += 1 bs = block.style
block_counts[bs] += 1
block_rmap[block.style].append(block) block_rmap[block.style].append(block)
local_run_counts = Counter()
for run in block.runs: for run in block.runs:
run_counts[run.style] += (0 if run.is_empty() else 1) count = run.style_weight
run_counts[run.style] += count
local_run_counts[run.style] += count
run_rmap[run.style].append(run) run_rmap[run.style].append(run)
bnum = len(str(max(1, len(block_counts) - 1))) if local_run_counts:
for i, (block_style, count) in enumerate(block_counts.most_common()): rs = local_run_counts.most_common(1)[0][0]
if i == 0: used_pairs[(bs, rs)].append(block)
self.normal_block_style = block_style
block_style.id = 'ParagraphNormal'
else:
block_style.id = 'Paragraph%d' % i
block_style.name = '%0{}d Para'.format(bnum) % i
rnum = len(str(max(1, len(run_counts) - 1))) rnum = len(str(max(1, len(run_counts) - 1)))
for i, (text_style, count) in enumerate(run_counts.most_common()): for i, (text_style, count) in enumerate(run_counts.most_common()):
text_style.id = 'Text%d' % i
text_style.name = '%0{}d Text'.format(rnum) % i
text_style.seq = i
if i == 0: if i == 0:
self.normal_text_style = text_style self.normal_text_style = text_style
text_style.id = 'TextNormal'
else:
text_style.id = 'Text%d' % i
text_style.name = '%0{}d Text'.format(rnum) % i
for s in tuple(self.block_styles):
if s.id is None:
self.block_styles.pop(s)
for s in tuple(self.text_styles): for s in tuple(self.text_styles):
if s.id is None: if s.id is None:
self.text_styles.pop(s) self.text_styles.pop(s)
counts = Counter()
for (bs, rs), blocks in used_pairs.iteritems():
s = CombinedStyle(bs, rs, blocks, self.namespace)
counts[s] += sum(1 for b in blocks if not b.is_empty())
snum = len(str(max(1, len(counts) - 1)))
for i, (style, count) in enumerate(counts.most_common()):
if i == 0:
self.normal_style = style
style.id = style.name = 'Normal'
else:
style.id = style.name = 'Para %0{}d'.format(snum) % i
style.seq = i
self.combined_styles = sorted(counts.iterkeys(), key=attrgetter('seq'))
[ls.apply() for ls in self.combined_styles]
self.log.debug('%d Text Styles %d Combined styles' % tuple(map(len, (
self.text_styles, self.combined_styles))))
def serialize(self, styles): def serialize(self, styles):
for style in sorted(self.block_styles, key=lambda s:(s is not self.normal_block_style, numeric_sort_key(s.id))): lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
style.serialize(styles, self.normal_block_style) for k in tuple(lang.attrib):
for style in sorted(self.text_styles, key=lambda s:(s is not self.normal_text_style, numeric_sort_key(s.id))): lang.attrib[k] = self.document_lang
for style in self.combined_styles:
style.serialize(styles, self.normal_style)
for style in sorted(self.text_styles, key=attrgetter('seq')):
style.serialize(styles, self.normal_text_style) style.serialize(styles, self.normal_text_style)