diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 1a5b117f8f..e43ba27d3f 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -46,6 +46,7 @@ from calibre.ebooks.oeb.base import (
 )
 from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html
 from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook
+from calibre.ebooks.oeb.polish.parsing import decode_xml
 from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
 from calibre.ebooks.oeb.polish.utils import OEB_FONTS, CommentFinder, PositionFinder, adjust_mime_for_epub, guess_type, insert_self_closing, parse_css
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
@@ -168,36 +169,10 @@ class ContainerBase:  # {{{
 
         :param normalize_to_nfc: Normalize returned unicode to the NFC normal form as is required by both the EPUB and AZW3 formats.
         '''
-        def fix_data(d):
-            return d.replace('\r\n', '\n').replace('\r', '\n')
-        if isinstance(data, str):
-            return fix_data(data)
-        bom_enc = None
-        if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
-            bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
-                       b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
-            data = data[4:]
-        elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
-            bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
-            data = data[2:]
-        elif data[:3] == b'\xef\xbb\xbf':
-            bom_enc = 'utf-8'
-            data = data[3:]
-        if bom_enc is not None:
-            try:
-                self.used_encoding = bom_enc
-                return fix_data(data.decode(bom_enc))
-            except UnicodeDecodeError:
-                pass
-        try:
-            self.used_encoding = 'utf-8'
-            return fix_data(data.decode('utf-8'))
-        except UnicodeDecodeError:
-            pass
-        data, self.used_encoding = xml_to_unicode(data)
-        if normalize_to_nfc:
-            data = unicodedata.normalize('NFC', data)
-        return fix_data(data)
+        html, used_encoding = decode_xml(data, normalize_to_nfc)
+        if used_encoding:
+            self.used_encoding = used_encoding
+        return html
 
     def parse_xml(self, data):
         data, self.used_encoding = xml_to_unicode(
diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py
index 75509c9c31..311d40c996 100644
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@@ -18,6 +18,7 @@ from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML, XPath
 from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
+from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tts import lang_for_elem
 from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing
 from calibre.spell.break_iterator import sentence_positions
@@ -176,11 +177,17 @@ def remove_kobo_markup_from_html(root):
         unwrap_body_contents(body)
 
 
-def kepubify_html(root, metadata_lang='en'):
+def kepubify_parsed_html(root, metadata_lang: str = 'en'):  # takes an already-parsed tree and modifies it in place: remove Kobo markup, merge heads/bodies, then add Kobo markup
     remove_kobo_markup_from_html(root)
     merge_multiple_html_heads_and_bodies(root)
     add_kobo_markup_to_html(root, metadata_lang)
 
 
-def kepubify(container):
+def kepubify_html_data(raw: str | bytes, metadata_lang: str = 'en'):  # convenience wrapper: parse raw markup, kepubify the tree, return the tree
+    root = parse(raw)  # parse is imported from calibre.ebooks.oeb.polish.parsing
+    kepubify_parsed_html(root, metadata_lang)  # works on root in place; its return value is not used
+    return root  # the modified tree, not serialized text (callers serialize it themselves)
+
+
+def kepubify_container(container):
     lang = container.mi.language
diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index 78fead9704..c8ef145bb0 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -5,6 +5,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
 import re
+import unicodedata
 
 import html5_parser
 from lxml.etree import Element as LxmlElement
@@ -23,6 +24,40 @@ except ImportError:
 XHTML_NS     = 'http://www.w3.org/1999/xhtml'
 
 
+def decode_xml(data: bytes, normalize_to_nfc: bool = True) -> tuple[str, str]:  # returns (text, encoding); NOTE(review): annotated bytes, but str is handled too (isinstance check below)
+    used_encoding = ''  # stays '' when data is already str; otherwise the encoding that produced the returned text
+    def fix_data(d):
+        return d.replace('\r\n', '\n').replace('\r', '\n'), used_encoding  # CRLF and lone CR become LF; used_encoding is read at call time
+    if isinstance(data, str):
+        return fix_data(data)  # already text: only newlines are normalized, no NFC step
+    bom_enc = None
+    if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:  # 4-byte UTF-32 BOMs are tested first: the UTF-32-LE BOM begins with the UTF-16-LE BOM
+        bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
+                    b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
+        data = data[4:]  # drop the BOM before decoding
+    elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
+        bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
+        data = data[2:]  # drop the BOM before decoding
+    elif data[:3] == b'\xef\xbb\xbf':
+        bom_enc = 'utf-8'
+        data = data[3:]  # drop the BOM before decoding
+    if bom_enc is not None:
+        try:
+            used_encoding = bom_enc
+            return fix_data(data.decode(bom_enc))
+        except UnicodeDecodeError:  # the BOM did not match the content: fall through to plain UTF-8 (data stays BOM-stripped)
+            pass
+    try:
+        used_encoding = 'utf-8'
+        return fix_data(data.decode('utf-8'))
+    except UnicodeDecodeError:  # not valid UTF-8 either: fall back to xml_to_unicode below
+        pass
+    data, used_encoding = xml_to_unicode(data)  # last resort; the helper's result is unpacked as (text, encoding)
+    if normalize_to_nfc:  # NFC normalization is applied on this fallback path only; the earlier returns skip it
+        data = unicodedata.normalize('NFC', data)
+    return fix_data(data)
+
+
 def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
     if isinstance(raw, bytes):
         raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
index 29ed7ee11c..65b980ad53 100644
--- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@@ -3,8 +3,7 @@
 
 
 from calibre.ebooks.oeb.base import serialize
-from calibre.ebooks.oeb.polish.kepubify import kepubify_html
-from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse
+from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data
 from calibre.ebooks.oeb.polish.tests.base import BaseTest
 
 
@@ -38,8 +37,7 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
             # skipped tags
         }.items():
             with self.subTest(src=src):
-                root = parse(src)
-                kepubify_html(root)
+                root = kepubify_html_data(src)
                 actual = serialize(root, 'text/html').decode('utf-8')
                 actual = actual[len(prefix):-len(suffix)]
                 self.assertEqual(expected, actual)