More work on kepubify

This commit is contained in:
Kovid Goyal 2025-02-20 13:07:54 +05:30
parent d8a744ceea
commit c7557b23b2
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 11 additions and 2 deletions

View File

@@ -16,7 +16,7 @@ import re
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import XHTML, XPath, serialize from calibre.ebooks.oeb.base import XHTML, XPath, escape_cdata
from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.polish.tts import lang_for_elem from calibre.ebooks.oeb.polish.tts import lang_for_elem
@@ -178,7 +178,10 @@ def remove_kobo_markup_from_html(root):
def serialize_html(root) -> bytes: def serialize_html(root) -> bytes:
return serialize(root, 'text/html') escape_cdata(root)
ans = etree.tostring(root, encoding='unicode', xml_declaration=False, pretty_print=False, with_tail=False)
ans = ans.replace('\xa0', '&#160;')
return b"<?xml version='1.0' encoding='utf-8'?>\n" + ans.encode('utf-8')
def kepubify_parsed_html(root, metadata_lang: str = 'en'): def kepubify_parsed_html(root, metadata_lang: str = 'en'):

View File

@@ -49,6 +49,12 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
'<div>Svg: <svg>mouse</svg><i> no tail': '<div>Svg: <svg>mouse</svg><i> no tail':
'<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>' '<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
'<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>', '<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>',
# encoding quirks
'<p>A\xa0nbsp;&nbsp;':
'<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;&#160;</span></p>',
'<div><script>1 < 2 & 3</script>': # escaping with CDATA; note that kepubify doesn't do this
'<div><script><![CDATA[1 < 2 & 3]]></script></div>',
}.items(): }.items():
with self.subTest(src=src): with self.subTest(src=src):
root = kepubify_html_data(src) root = kepubify_html_data(src)