DOCX: Get rid of <span> tags with no attributes

This commit is contained in:
Kovid Goyal 2013-06-12 17:00:51 +05:30
parent b8be1a27b2
commit c07db5e194

View File

@ -49,6 +49,41 @@ def liftable(css):
prefixes = {x.partition('-')[0] for x in css.iterkeys()} prefixes = {x.partition('-')[0] for x in css.iterkeys()}
return not (prefixes - {'text', 'font', 'letter', 'color', 'background'}) return not (prefixes - {'text', 'font', 'letter', 'color', 'background'})
def add_text(elem, attr, text):
    """Append *text* to the string stored in attribute *attr* of *elem*.

    A falsy current value (for example ``None`` or an empty string) is
    treated as the empty string, so the attribute ends up holding exactly
    ``text`` in that case.
    """
    existing = getattr(elem, attr)
    if not existing:
        existing = ''
    setattr(elem, attr, existing + text)
def lift(span):
    """Replace *span* in its parent by its content (text, children, tail).

    The span's leading text is appended to whatever precedes the span (the
    parent's text when the span is the first child, otherwise the tail of
    the previous sibling). The span's children are then moved into the
    parent at the span's position, the span itself is removed, and finally
    the span's tail is appended after the last lifted child (or, when the
    span had no children, to the same place the leading text went).

    NOTE(review): presumably operates on lxml elements (``getparent`` is
    used); the tail is read from ``span`` after ``parent.remove(span)``,
    so this relies on the removed element keeping its tail -- confirm.
    """
    parent = span.getparent()
    idx = parent.index(span)
    # Remember the last child up front: once the children are moved into
    # the parent, span is empty and can no longer tell us.
    last_child = span[-1] if len(span) else None

    def append_before_span(text):
        # Attach text to whatever sits immediately before position idx.
        if idx == 0:
            add_text(parent, 'text', text)
        else:
            add_text(parent[idx - 1], 'tail', text)

    if span.text:
        append_before_span(span.text)

    # Inserting at the same index in reverse order preserves child order.
    for child in reversed(span):
        parent.insert(idx, child)
    parent.remove(span)

    if span.tail:
        if last_child is not None:
            add_text(last_child, 'tail', span.tail)
        else:
            # No children were lifted, so the tail follows the span's text.
            append_before_span(span.tail)
def cleanup_markup(root, styles): def cleanup_markup(root, styles):
# Merge consecutive spans that have the same styling # Merge consecutive spans that have the same styling
current_run = [] current_run = []
@ -95,3 +130,7 @@ def cleanup_markup(root, styles):
span.tag = 'b' span.tag = 'b'
del span.attrib['class'] del span.attrib['class']
# Get rid of <span>s that have no styling
for span in root.xpath('//span[not(@class) and not(@id)]'):
lift(span)