Fix sentences in tail of block elements not being wrapped

This commit is contained in:
Kovid Goyal 2024-10-16 11:01:42 +05:30
parent 4e621117e3
commit c69880ac87
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 33 additions and 13 deletions

View File

@ -245,6 +245,9 @@ class Structure(BaseTest):
'<div><p>something':
'<body><div><p><span id="1">something</span></p></div>',
'<p>One</p> Two. Three <p>Four':
'<body><p><span id="1">One</span></p><span id="2"> Two. </span><span id="3">Three </span><p><span id="4">Four</span></p>',
}.items()):
root = parse(text, namespace_elements=True)
orig = normalize_markup(root)

View File

@ -82,6 +82,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
self.elem = elem
self.tag_name = tag_name
self.lang = child_lang or lang_for_elem(elem, parent_lang)
self.parent_lang = parent_lang
q = elem.get('data-calibre-tts', '')
self.voice = parent_voice
if q.startswith('{'): # }
@ -96,6 +97,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
self.texts.append(Chunk(None, elem.text, self.pos))
self.pos += len(elem.text)
self.children = tuple(elem.iterchildren())
self.has_tail = bool((elem.tail or '').strip())
def add_simple_child(self, elem):
if text := elem.text:
@ -107,15 +109,29 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
self.pos += len(text)
def commit(self) -> None:
if not self.texts:
return
if self.texts:
text = ''.join(c.text for c in self.texts)
self.pos = 0
for start, length in sentence_positions(text, self.lang):
elem_id = self.wrap_sentence(start, length)
ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
self.texts = []
self.pos = 0
if self.has_tail:
p = self.elem.getparent()
spans = []
before = after = None
for start, length in sentence_positions(self.elem.tail, self.parent_lang):
end = start + length
text = self.elem.tail[start:end]
if before is None:
before = self.elem.tail[:start]
span = self.make_wrapper(text, p)
spans.append(span)
after = self.elem.tail[end:]
self.elem.tail = before
if after and spans:
spans[-1].tail = after
idx = p.index(self.elem)
p[idx+1:idx+1] = spans
def make_into_wrapper(self, elem: Element) -> str:
nonlocal id_counter
@ -127,9 +143,11 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
return q
id_counter += 1
def make_wrapper(self, text: str | None) -> Element:
ns, sep, _ = self.elem.tag.partition('}')
ans = self.elem.makeelement(ns + sep + 'span')
def make_wrapper(self, text: str | None, elem: Element | None = None) -> Element:
if elem is None:
elem = self.elem
ns, sep, _ = elem.tag.partition('}')
ans = elem.makeelement(ns + sep + 'span')
ans.text = text
self.make_into_wrapper(ans)
return ans
@ -335,7 +353,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
elif child_tag_name not in ignored_tag_names:
simple_allowed = False
children_to_process.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
p.commit()
if simple_allowed and (text := child.tail):
p.add_tail(child, text)
p.commit()