From c07db5e194104819f862306ab839f5e94da5100c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 12 Jun 2013 17:00:51 +0530 Subject: [PATCH] DOCX: Get rid of <span> tags with no attributes --- src/calibre/ebooks/docx/cleanup.py | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/calibre/ebooks/docx/cleanup.py b/src/calibre/ebooks/docx/cleanup.py index 4b1828e39a..2b1e095025 100644 --- a/src/calibre/ebooks/docx/cleanup.py +++ b/src/calibre/ebooks/docx/cleanup.py @@ -49,6 +49,41 @@ def liftable(css): prefixes = {x.partition('-')[0] for x in css.iterkeys()} return not (prefixes - {'text', 'font', 'letter', 'color', 'background'}) + +def add_text(elem, attr, text): + old = getattr(elem, attr) or '' + setattr(elem, attr, old + text) + + +def lift(span): + # Replace an element by its content (text, children and tail) + parent = span.getparent() + idx = parent.index(span) + try: + last_child = span[-1] + except IndexError: + last_child = None + + if span.text: + if idx == 0: + add_text(parent, 'text', span.text) + else: + add_text(parent[idx - 1], 'tail', span.text) + + for child in reversed(span): + parent.insert(idx, child) + parent.remove(span) + + if span.tail: + if last_child is None: + if idx == 0: + add_text(parent, 'text', span.tail) + else: + add_text(parent[idx - 1], 'tail', span.tail) + else: + add_text(last_child, 'tail', span.tail) + + def cleanup_markup(root, styles): # Merge consecutive spans that have the same styling current_run = [] @@ -95,3 +130,7 @@ def cleanup_markup(root, styles): span.tag = 'b' del span.attrib['class'] + # Get rid of <span>s that have no styling + for span in root.xpath('//span[not(@class) and not(@id)]'): + lift(span) +