DOCX Input: Change handling of sequences of space characters to more closely follow Microsoft Word.

2025-07-09 03:04:10 -04:00 · 2015-02-03 11:14:23 +05:30 · 2015-02-03 11:14:23 +05:30 · 4618304369
commit 4618304369
parent 28e6946de4
1 changed file with 12 additions and 8 deletions
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -581,17 +581,21 @@ class Convert(object):
                    continue
                space = child.get(XML('space'), None)
                preserve = False
-                if space == 'preserve':
+                ctext = child.text
-                    # Only use a <span> with white-space:pre-wrap if this element
+                if space != 'preserve':
-                    # actually needs it, i.e. if it has more than one
+                    # Remove leading and trailing whitespace. Word ignores
-                    # consecutive space or it has newlines or tabs.
+                    # leading and trailing whitespace without preserve
-                    multi_spaces = self.ms_pat.search(child.text) is not None
+                    ctext = ctext.strip()
-                    preserve = multi_spaces or self.ws_pat.search(child.text) is not None
+                # Only use a <span> with white-space:pre-wrap if this element
+                # actually needs it, i.e. if it has more than one
+                # consecutive space or it has newlines or tabs.
+                multi_spaces = self.ms_pat.search(ctext) is not None
+                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
                if preserve:
-                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
+                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
-                    text.buf.append(child.text)
+                    text.buf.append(ctext)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)