From 49c9b2279f96073188a48a2a0cdc0423879146e4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 12 Jun 2013 14:51:38 +0530
Subject: [PATCH] DOCX: Simplify markup for the common

case

---
 src/calibre/ebooks/docx/cleanup.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py
index 4c268cb0f1..f8fe922e9b 100644
--- a/src/calibre/ebooks/docx/cleanup.py
+++ b/src/calibre/ebooks/docx/cleanup.py
@@ -43,6 +43,13 @@ def merge_run(run):
     merge(parent, span)
 
 
+def liftable(css):
+    # A <span> is liftable if all its styling would work just as well if it is
+    # specified on the parent element. Only the part of each key of css before
+    # the first '-' is examined, so 'font-size' is treated as 'font'.
+    prefixes = {x.partition('-')[0] for x in css.iterkeys()}
+    return not (prefixes - {'text', 'font', 'letter', 'color', 'background'})
+
 def cleanup_markup(root, styles):
     # Merge consecutive spans that have the same styling
     current_run = []
@@ -57,3 +64,26 @@ def cleanup_markup(root, styles):
                 if len(current_run) > 1:
                     merge_run(current_run)
                 current_run = [span]
+
+    # Remove unnecessary span tags that are the only child of a parent block
+    # element
+    class_map = dict(styles.classes.itervalues())
+    parents = ('p', 'div') + tuple('h%d' % i for i in xrange(1, 7))
+    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
+        if len(parent) == 1 and not parent.text and not parent[0].tail:
+            # We have a block whose contents are entirely enclosed in a <span>
+            span = parent[0]
+            span_class = span.get('class', None)
+            span_css = class_map.get(span_class, {})
+            if liftable(span_css):
+                pclass = parent.get('class', None)
+                if span_class:
+                    pclass = (pclass + ' ' + span_class) if pclass else span_class
+                    parent.set('class', pclass)
+                parent.text = span.text
+                parent.remove(span)
+                # Move the span's children up into the parent; the span itself
+                # has been removed and must not be re-attached.
+                for child in span:
+                    parent.append(child)
+