From c7557b23b296aa06c0dc9cf160d72f1b617c5263 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 20 Feb 2025 13:07:54 +0530
Subject: [PATCH] More work on kepubify

---
 src/calibre/ebooks/oeb/polish/kepubify.py       | 7 +++++--
 src/calibre/ebooks/oeb/polish/tests/kepubify.py | 6 ++++++
 2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py
index 201707a3d0..20193d1657 100644
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@@ -16,7 +16,7 @@ import re
 
 from lxml import etree
 
-from calibre.ebooks.oeb.base import XHTML, XPath, serialize
+from calibre.ebooks.oeb.base import XHTML, XPath, escape_cdata
 from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
 from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tts import lang_for_elem
@@ -178,7 +178,10 @@ def remove_kobo_markup_from_html(root):
 
 
 def serialize_html(root) -> bytes:
-    return serialize(root, 'text/html')
+    escape_cdata(root)
+    ans = etree.tostring(root, encoding='unicode', xml_declaration=False, pretty_print=False, with_tail=False)
+    ans = ans.replace('\xa0', '&#160;')
+    return b"<?xml version='1.0' encoding='utf-8'?>\n" + ans.encode('utf-8')
 
 
 def kepubify_parsed_html(root, metadata_lang: str = 'en'):
diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
index 90aacf87ed..b7dbbe037c 100644
--- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@@ -49,6 +49,12 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
             '<div>Svg: <svg>mouse</svg><i> no tail':
             '<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
             '<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>',
+
+            # encoding quirks
+            '<p>A\xa0nbsp;&nbsp;':
+            '<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;&#160;</span></p>',
+            '<div><script>1 < 2 & 3</script>':  # escaping with CDATA; note that kepubify doesn't do this
+            '<div><script><![CDATA[1 < 2 & 3]]></script></div>',
         }.items():
             with self.subTest(src=src):
                 root = kepubify_html_data(src)