From 1f4e6c22dd6587e148948bc3b74f25f0629604a8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 21 May 2015 13:43:41 +0530 Subject: [PATCH] DOCX Output: Fix incorrect handling of bold/italic in paragraphs where the majority of text is either bold or italic instead of normal --- src/calibre/ebooks/docx/writer/from_html.py | 5 +- src/calibre/ebooks/docx/writer/styles.py | 127 +++++++++++++++++--- 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/docx/writer/from_html.py b/src/calibre/ebooks/docx/writer/from_html.py index 094380afd0..d909fb7ee2 100644 --- a/src/calibre/ebooks/docx/writer/from_html.py +++ b/src/calibre/ebooks/docx/writer/from_html.py @@ -64,6 +64,7 @@ class TextRun(object): self.lang = lang self.parent_style = None self.makeelement = namespace.makeelement + self.descendant_style = None def add_text(self, text, preserve_whitespace, bookmark=None, link=None): if not preserve_whitespace: @@ -86,8 +87,8 @@ class TextRun(object): parent = p if self.link is None else links_manager.serialize_hyperlink(p, self.link) r = makeelement(parent, 'w:r') rpr = makeelement(r, 'w:rPr', append=False) - if self.parent_style is not self.style: - makeelement(rpr, 'w:rStyle', w_val=self.style.id) + if getattr(self.descendant_style, 'id', None) is not None: + makeelement(rpr, 'w:rStyle', w_val=self.descendant_style.id) if self.lang: makeelement(rpr, 'w:lang', w_bidi=self.lang, w_val=self.lang, w_eastAsia=self.lang) if len(rpr) > 0: diff --git a/src/calibre/ebooks/docx/writer/styles.py b/src/calibre/ebooks/docx/writer/styles.py index e6c4f302d8..804dfe49e7 100644 --- a/src/calibre/ebooks/docx/writer/styles.py +++ b/src/calibre/ebooks/docx/writer/styles.py @@ -341,6 +341,93 @@ class TextStyle(DOCXStyle): if bdr.attrib: rPr.append(bdr) +class DescendantTextStyle(object): + + def __init__(self, parent_style, child_style): + self.id = self.name = None + self.makeelement = child_style.makeelement + + p = [] + def add(name, **props): + p.append((name, frozenset(props.iteritems()))) + + def vals(attr): + return getattr(parent_style, attr), getattr(child_style, attr) + + def check(attr): + pval, cval = vals(attr) + return pval != cval + + if parent_style.font_family != child_style.font_family: + add('rFonts', **{k:child_style.font_family for k in 'ascii cs eastAsia hAnsi'.split()}) + + for name, attr in (('sz', 'font_size'), ('b', 'bold'), ('i', 'italic')): + pval, cval = vals(attr) + if pval != cval: + val = 'on' if attr in {'bold', 'italic'} else str(cval) # bold, italic are toggle properties + for suffix in ('', 'Cs'): + add(name + suffix, val=val) + + if check('color'): + add('color', val=child_style.color or 'auto') + if check('background_color'): + add('shd', fill=child_style.background_color or 'auto') + if check('underline'): + add('u', val='single' if child_style.underline else 'none') + if check('dstrike'): + add('dstrike', val=bmap(child_style.dstrike)) + if check('strike'): + add('strike', val='on') # toggle property + if check('caps'): + add('caps', val='on') # toggle property + if check('small_caps'): + add('smallCaps', val='on') # toggle property + if check('shadow'): + add('shadow', val='on') # toggle property + if check('spacing'): + add('spacing', val=str(child_style.spacing or 0)) + if check('vertical_align'): + val = child_style.vertical_align + if val in {'superscript', 'subscript', 'baseline'}: + add('vertAlign', val=val) + else: + add('position', val=val) + + bdr = {} + if check('padding'): + bdr['space'] = str(child_style.padding) + if check('border_width'): + bdr['sz'] = str(child_style.border_width) + if check('border_style'): + bdr['val'] = child_style.border_style + if check('border_color'): + bdr['color'] = child_style.border_color + if bdr: + add('bdr', **bdr) + self.properties = tuple(p) + self._hash = hash(self.properties) + + def __hash__(self): + return self._hash + + def __eq__(self, other): + return self.properties == other.properties + + def __ne__(self, other): + return self.properties != other.properties + + def serialize(self, styles): + makeelement = self.makeelement + style = makeelement(styles, 'style', styleId=self.id, type='character') + style.append(makeelement(style, 'name', val=self.name)) + rpr = makeelement(style, 'rPr') + style.append(rpr) + for name, attrs in self.properties: + rpr.append(makeelement(style, name, **dict(attrs))) + styles.append(style) + return style + + def read_css_block_borders(self, css, store_css_style=False): for edge in border_edges: if css is None: @@ -540,7 +627,7 @@ class StylesManager(object): ans = existing return ans - def finalize(self, blocks): + def finalize(self, all_blocks): block_counts, run_counts = Counter(), Counter() block_rmap, run_rmap = defaultdict(list), defaultdict(list) used_pairs = defaultdict(list) @@ -548,7 +635,7 @@ class StylesManager(object): headings = frozenset('h1 h2 h3 h4 h5 h6'.split()) pure_block_styles = set() - for block in blocks: + for block in all_blocks: bs = block.style block_counts[bs] += 1 block_rmap[block.style].append(block) @@ -574,17 +661,6 @@ class StylesManager(object): if i == 0: self.normal_pure_block_style = bs - rnum = len(str(max(1, len(run_counts) - 1))) - for i, (text_style, count) in enumerate(run_counts.most_common()): - text_style.id = 'Text%d' % i - text_style.name = '%0{}d Text'.format(rnum) % i - text_style.seq = i - if i == 0: - self.normal_text_style = text_style - for s in tuple(self.text_styles): - if s.id is None: - self.text_styles.pop(s) - counts = Counter() smap = {} for (bs, rs), blocks in used_pairs.iteritems(): @@ -614,8 +690,27 @@ class StylesManager(object): style.seq = i self.combined_styles = sorted(counts.iterkeys(), key=attrgetter('seq')) [ls.apply() for ls in self.combined_styles] + + descendant_style_map = {} + ds_counts = Counter() + for block in all_blocks: + for run in block.runs: + if run.parent_style is not run.style: + ds = DescendantTextStyle(run.parent_style, run.style) + if ds.properties: + run.descendant_style = descendant_style_map.get(ds) + if run.descendant_style is None: + run.descendant_style = descendant_style_map[ds] = ds + ds_counts[run.descendant_style] += run.style_weight + rnum = len(str(max(1, len(ds_counts) - 1))) + for i, (text_style, count) in enumerate(ds_counts.most_common()): + text_style.id = 'Text%d' % i + text_style.name = '%0{}d Text'.format(rnum) % i + text_style.seq = i + self.descendant_text_styles = sorted(descendant_style_map, key=attrgetter('seq')) + self.log.debug('%d Text Styles %d Combined styles' % tuple(map(len, ( - self.text_styles, self.combined_styles)))) + self.descendant_text_styles, self.combined_styles)))) self.primary_heading_style = None if heading_styles: @@ -634,7 +729,7 @@ class StylesManager(object): lang.attrib[k] = self.document_lang for style in self.combined_styles: style.serialize(styles, self.normal_style) - for style in sorted(self.text_styles, key=attrgetter('seq')): - style.serialize(styles, self.normal_text_style) + for style in self.descendant_text_styles: + style.serialize(styles) for style in sorted(self.pure_block_styles, key=attrgetter('seq')): style.serialize(styles, self.normal_pure_block_style)