Fix sentences in tail of block elements not being wrapped

This commit is contained in:
Kovid Goyal 2024-10-16 11:01:42 +05:30
parent 4e621117e3
commit c69880ac87
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 33 additions and 13 deletions

View File

@ -245,6 +245,9 @@ class Structure(BaseTest):
'<div><p>something': '<div><p>something':
'<body><div><p><span id="1">something</span></p></div>', '<body><div><p><span id="1">something</span></p></div>',
'<p>One</p> Two. Three <p>Four':
'<body><p><span id="1">One</span></p><span id="2"> Two. </span><span id="3">Three </span><p><span id="4">Four</span></p>',
}.items()): }.items()):
root = parse(text, namespace_elements=True) root = parse(text, namespace_elements=True)
orig = normalize_markup(root) orig = normalize_markup(root)

View File

@ -82,6 +82,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
self.elem = elem self.elem = elem
self.tag_name = tag_name self.tag_name = tag_name
self.lang = child_lang or lang_for_elem(elem, parent_lang) self.lang = child_lang or lang_for_elem(elem, parent_lang)
self.parent_lang = parent_lang
q = elem.get('data-calibre-tts', '') q = elem.get('data-calibre-tts', '')
self.voice = parent_voice self.voice = parent_voice
if q.startswith('{'): # } if q.startswith('{'): # }
@ -96,6 +97,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
self.texts.append(Chunk(None, elem.text, self.pos)) self.texts.append(Chunk(None, elem.text, self.pos))
self.pos += len(elem.text) self.pos += len(elem.text)
self.children = tuple(elem.iterchildren()) self.children = tuple(elem.iterchildren())
self.has_tail = bool((elem.tail or '').strip())
def add_simple_child(self, elem): def add_simple_child(self, elem):
if text := elem.text: if text := elem.text:
@ -107,15 +109,29 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
self.pos += len(text) self.pos += len(text)
def commit(self) -> None:
    """Wrap the sentences of this element and of its tail text.

    First, the accumulated ``self.texts`` chunks are joined, split into
    sentences using ``self.lang``, and every sentence is wrapped via
    ``self.wrap_sentence`` and recorded in the enclosing ``ans`` list.

    Second, when the element has non-whitespace tail text, each sentence
    found in that tail is moved into a new span created from the parent
    element, and the spans are inserted as siblings directly after this
    element. Text before the first sentence stays as the element's tail;
    text after the last sentence becomes the tail of the last span.
    """
    if self.texts:
        text = ''.join(c.text for c in self.texts)
        self.pos = 0
        for start, length in sentence_positions(text, self.lang):
            elem_id = self.wrap_sentence(start, length)
            ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))

    if not self.has_tail:
        return

    parent = self.elem.getparent()
    tail = self.elem.tail
    spans = []
    first_start = last_end = 0
    # The tail sits in the parent, after this element, so it is segmented
    # with the parent's language rather than this element's own.
    for start, length in sentence_positions(tail, self.parent_lang):
        if not spans:
            first_start = start
        last_end = start + length
        spans.append(self.make_wrapper(tail[start:last_end], parent))
        # NOTE(review): tail sentences are wrapped but never appended to
        # `ans`, unlike the sentences above -- confirm this is intended.

    if not spans:
        # No sentence was found: leave the tail untouched. Rewriting it here
        # would replace it with None and silently drop the text.
        return

    self.elem.tail = tail[:first_start]
    trailing = tail[last_end:]
    if trailing:
        spans[-1].tail = trailing
    idx = parent.index(self.elem)
    parent[idx+1:idx+1] = spans
def make_into_wrapper(self, elem: Element) -> str: def make_into_wrapper(self, elem: Element) -> str:
nonlocal id_counter nonlocal id_counter
@ -127,9 +143,11 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
return q return q
id_counter += 1 id_counter += 1
def make_wrapper(self, text: str | None, elem: Element | None = None) -> Element:
    """Create a new ``span`` wrapper element holding ``text``.

    The span is created with ``makeelement`` of ``elem`` (or of
    ``self.elem`` when ``elem`` is None). Its tag is the part of that
    element's tag up to and including the first ``}``, followed by
    ``span``. The new element is passed to ``self.make_into_wrapper``
    before being returned; it is not attached to any tree here.
    """
    source = self.elem if elem is None else elem
    namespace, brace, _ = source.tag.partition('}')
    wrapper = source.makeelement(f'{namespace}{brace}span')
    wrapper.text = text
    self.make_into_wrapper(wrapper)
    return wrapper
@ -335,7 +353,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
elif child_tag_name not in ignored_tag_names: elif child_tag_name not in ignored_tag_names:
simple_allowed = False simple_allowed = False
children_to_process.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang)) children_to_process.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
p.commit()
if simple_allowed and (text := child.tail): if simple_allowed and (text := child.tail):
p.add_tail(child, text) p.add_tail(child, text)
p.commit() p.commit()