From c7557b23b296aa06c0dc9cf160d72f1b617c5263 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 20 Feb 2025 13:07:54 +0530 Subject: [PATCH] More work on kepubify --- src/calibre/ebooks/oeb/polish/kepubify.py | 7 +++++-- src/calibre/ebooks/oeb/polish/tests/kepubify.py | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py index 201707a3d0..20193d1657 100644 --- a/src/calibre/ebooks/oeb/polish/kepubify.py +++ b/src/calibre/ebooks/oeb/polish/kepubify.py @@ -16,7 +16,7 @@ import re from lxml import etree -from calibre.ebooks.oeb.base import XHTML, XPath, serialize +from calibre.ebooks.oeb.base import XHTML, XPath, escape_cdata from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.tts import lang_for_elem @@ -178,7 +178,10 @@ def remove_kobo_markup_from_html(root): def serialize_html(root) -> bytes: - return serialize(root, 'text/html') + escape_cdata(root) + ans = etree.tostring(root, encoding='unicode', xml_declaration=False, pretty_print=False, with_tail=False) + ans = ans.replace('\xa0', ' ') + return b"\n" + ans.encode('utf-8') def kepubify_parsed_html(root, metadata_lang: str = 'en'): diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py index 90aacf87ed..b7dbbe037c 100644 --- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py +++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py @@ -49,6 +49,12 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }
Svg: mouse' ' no tail
', + + # encoding quirks + '

A\xa0nbsp; ': + '

A nbsp; 

', + '
': # escaping with cdata note that kepubify doesnt do this + '
', }.items(): with self.subTest(src=src): root = kepubify_html_data(src)