mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix sentences in tail of block elements not being wrapped
This commit is contained in:
parent
4e621117e3
commit
c69880ac87
@ -245,6 +245,9 @@ class Structure(BaseTest):
|
|||||||
|
|
||||||
'<div><p>something':
|
'<div><p>something':
|
||||||
'<body><div><p><span id="1">something</span></p></div>',
|
'<body><div><p><span id="1">something</span></p></div>',
|
||||||
|
|
||||||
|
'<p>One</p> Two. Three <p>Four':
|
||||||
|
'<body><p><span id="1">One</span></p><span id="2"> Two. </span><span id="3">Three </span><p><span id="4">Four</span></p>',
|
||||||
}.items()):
|
}.items()):
|
||||||
root = parse(text, namespace_elements=True)
|
root = parse(text, namespace_elements=True)
|
||||||
orig = normalize_markup(root)
|
orig = normalize_markup(root)
|
||||||
|
@ -82,6 +82,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
self.elem = elem
|
self.elem = elem
|
||||||
self.tag_name = tag_name
|
self.tag_name = tag_name
|
||||||
self.lang = child_lang or lang_for_elem(elem, parent_lang)
|
self.lang = child_lang or lang_for_elem(elem, parent_lang)
|
||||||
|
self.parent_lang = parent_lang
|
||||||
q = elem.get('data-calibre-tts', '')
|
q = elem.get('data-calibre-tts', '')
|
||||||
self.voice = parent_voice
|
self.voice = parent_voice
|
||||||
if q.startswith('{'): # }
|
if q.startswith('{'): # }
|
||||||
@ -96,6 +97,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
self.texts.append(Chunk(None, elem.text, self.pos))
|
self.texts.append(Chunk(None, elem.text, self.pos))
|
||||||
self.pos += len(elem.text)
|
self.pos += len(elem.text)
|
||||||
self.children = tuple(elem.iterchildren())
|
self.children = tuple(elem.iterchildren())
|
||||||
|
self.has_tail = bool((elem.tail or '').strip())
|
||||||
|
|
||||||
def add_simple_child(self, elem):
|
def add_simple_child(self, elem):
|
||||||
if text := elem.text:
|
if text := elem.text:
|
||||||
@ -107,15 +109,29 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
self.pos += len(text)
|
self.pos += len(text)
|
||||||
|
|
||||||
def commit(self) -> None:
|
def commit(self) -> None:
|
||||||
if not self.texts:
|
if self.texts:
|
||||||
return
|
text = ''.join(c.text for c in self.texts)
|
||||||
text = ''.join(c.text for c in self.texts)
|
self.pos = 0
|
||||||
self.pos = 0
|
for start, length in sentence_positions(text, self.lang):
|
||||||
for start, length in sentence_positions(text, self.lang):
|
elem_id = self.wrap_sentence(start, length)
|
||||||
elem_id = self.wrap_sentence(start, length)
|
ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
|
||||||
ans.append(Sentence(elem_id, text[start:start+length], self.lang, self.voice))
|
if self.has_tail:
|
||||||
self.texts = []
|
p = self.elem.getparent()
|
||||||
self.pos = 0
|
spans = []
|
||||||
|
before = after = None
|
||||||
|
for start, length in sentence_positions(self.elem.tail, self.parent_lang):
|
||||||
|
end = start + length
|
||||||
|
text = self.elem.tail[start:end]
|
||||||
|
if before is None:
|
||||||
|
before = self.elem.tail[:start]
|
||||||
|
span = self.make_wrapper(text, p)
|
||||||
|
spans.append(span)
|
||||||
|
after = self.elem.tail[end:]
|
||||||
|
self.elem.tail = before
|
||||||
|
if after and spans:
|
||||||
|
spans[-1].tail = after
|
||||||
|
idx = p.index(self.elem)
|
||||||
|
p[idx+1:idx+1] = spans
|
||||||
|
|
||||||
def make_into_wrapper(self, elem: Element) -> str:
|
def make_into_wrapper(self, elem: Element) -> str:
|
||||||
nonlocal id_counter
|
nonlocal id_counter
|
||||||
@ -127,9 +143,11 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
return q
|
return q
|
||||||
id_counter += 1
|
id_counter += 1
|
||||||
|
|
||||||
def make_wrapper(self, text: str | None) -> Element:
|
def make_wrapper(self, text: str | None, elem: Element | None = None) -> Element:
|
||||||
ns, sep, _ = self.elem.tag.partition('}')
|
if elem is None:
|
||||||
ans = self.elem.makeelement(ns + sep + 'span')
|
elem = self.elem
|
||||||
|
ns, sep, _ = elem.tag.partition('}')
|
||||||
|
ans = elem.makeelement(ns + sep + 'span')
|
||||||
ans.text = text
|
ans.text = text
|
||||||
self.make_into_wrapper(ans)
|
self.make_into_wrapper(ans)
|
||||||
return ans
|
return ans
|
||||||
@ -335,7 +353,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
elif child_tag_name not in ignored_tag_names:
|
elif child_tag_name not in ignored_tag_names:
|
||||||
simple_allowed = False
|
simple_allowed = False
|
||||||
children_to_process.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
|
children_to_process.append(Parent(child, child_tag_name, p.lang, p.voice, child_lang=child_lang))
|
||||||
p.commit()
|
|
||||||
if simple_allowed and (text := child.tail):
|
if simple_allowed and (text := child.tail):
|
||||||
p.add_tail(child, text)
|
p.add_tail(child, text)
|
||||||
p.commit()
|
p.commit()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user