mirror of https://github.com/kovidgoyal/calibre.git
Code to mark sentences in HTML
parent ee88003c01
commit 5a63ba851f
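Summary: the new mark_sentences_in_html() walks the <body> of a parsed HTML tree, runs ICU sentence segmentation over the accumulated text of each element, and wraps every detected sentence in a <span> with a generated id of the form cttsw-N, cloning and splitting inline tags (<b>, <i>, <a> and friends) that straddle a sentence boundary so each sentence ends up with exactly one wrapper. It returns a list of Sentence(elem_id, text, lang) tuples mapping wrapper ids to sentence text; a usage sketch follows the first diff below.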
@@ -4,7 +4,9 @@
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
 
+from collections import defaultdict
 from threading import Lock
+from typing import NamedTuple
 
 from calibre.utils.icu import _icu
 from calibre.utils.localization import lang_as_iso639_1
@@ -125,3 +127,318 @@ def split_into_sentences_for_tts(
         yield start, sentence
     if pending_sentence:
         yield pending_start, pending_sentence
+
+
+class Sentence(NamedTuple):
+    elem_id: str
+    text: str
+    lang: str
+
+
+def mark_sentences_in_html(root, lang: str = '') -> list[Sentence]:
+    from lxml.etree import ElementBase as Element
+    from lxml.etree import tostring as _tostring
+
+    from calibre.ebooks.oeb.base import barename
+    from calibre.utils.localization import canonicalize_lang, get_lang
+    continued_tag_names = frozenset({
+        'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
+    })
+    ignored_tag_names = frozenset({
+        'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
+    })
+
+    def tostring(x) -> str:
+        return _tostring(x, encoding='unicode')
+
+    def lang_for_elem(elem, parent_lang):
+        return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
+
+    def has_text(elem):
+        if elem.text and elem.text.strip():
+            return True
+        for child in elem:
+            if child.tail and child.tail.strip():
+                return True
+        return False
+
+    root_lang = lang_for_elem(root, canonicalize_lang(lang or get_lang()))
+    seen_ids = set(root.xpath('//*/@id'))
+    id_counter = 1
+    ans = []
+    clones_map = defaultdict(list)
+
+    class Chunk(NamedTuple):
+        child: Element | None
+        text: str
+        start_at: int
+        is_tail: bool = False
+
+
+    class Parent:
+
+        def __init__(self, elem, tag_name, parent_lang, child_lang=''):
+            self.elem = elem
+            self.tag_name = tag_name
+            self.lang = child_lang or lang_for_elem(elem, parent_lang)
+            self.pos = 0
+            self.texts = []
+            if elem.text and elem.text.strip():
+                self.texts.append(Chunk(None, elem.text, self.pos))
+                self.pos += len(elem.text)
+            self.children = tuple(elem.iterchildren())
+            self.child_pos = 0
+
+        def add_simple_child(self, elem):
+            if text := elem.text:
+                self.texts.append(Chunk(elem, text, self.pos))
+                self.pos += len(text)
+
+        def add_tail(self, elem, text):
+            self.texts.append(Chunk(elem, text, self.pos, is_tail=True))
+            self.pos += len(text)
+
+        def commit(self) -> None:
+            if not self.texts:
+                return
+            text = ''.join(c.text for c in self.texts)
+            self.pos = 0
+            for start, length in sentence_positions(text, self.lang):
+                elem_id = self.wrap_sentence(start, length)
+                ans.append(Sentence(elem_id, text[start:start+length], lang))
+            self.texts = []
+            self.pos = 0
+
+        def make_into_wrapper(self, elem: Element) -> str:
+            nonlocal id_counter
+            while True:
+                q = f'cttsw-{id_counter}'
+                if q not in seen_ids:
+                    elem.set('id', q)
+                    seen_ids.add(q)
+                    return q
+                id_counter += 1
+
+        def make_wrapper(self, text: str | None) -> Element:
+            ns, sep, _ = self.elem.tag.partition('}')
+            ans = self.elem.makeelement(ns + sep + 'span')
+            ans.text = text
+            self.make_into_wrapper(ans)
+            return ans
+
+        def replace_reference_to_child(self, elem: Element, replacement: Element) -> None:
+            for i in range(self.pos + 1, len(self.texts)):
+                if self.texts[i].child is elem:
+                    self.texts[i] = self.texts[i]._replace(child=replacement)
+                else:
+                    break
+
+        def wrap_contents(self, first_child: Element | None, last_child: Element) -> Element:
+            w = self.make_wrapper(self.elem.text if first_child is None else None)
+            in_range = False
+            for c in self.elem.iterchildren('*'):
+                if not in_range and (first_child is None or first_child is c):
+                    in_range = True
+                    pos = self.elem.index(c)
+                    self.elem.insert(pos, w)
+                    w.append(c)
+                    first_child = c
+                if in_range:
+                    if last_child is not first_child:
+                        w.append(last_child)
+                    if c is last_child:
+                        break
+            self.replace_reference_to_child(last_child, w)
+            return w
+
+        def clone_simple_element(self, elem: Element) -> Element:
+            ans = elem.makeelement(elem.tag)
+            ans.attrib.update(elem.attrib)
+            ans.attrib.pop('id', None)
+            ans.attrib.pop('name', None)
+            ans.text, ans.tail = elem.text, elem.tail
+            p = elem.getparent()
+            idx = p.index(elem)
+            p.insert(idx + 1, ans)
+            self.replace_reference_to_child(elem, ans)
+            clones_map[elem].append(ans)
+            return ans
+
+        def wrap_sentence(self, start: int, length: int) -> str:
+            end = start + length
+            start_chunk = end_chunk = -1
+            start_offset = end_offset = 0
+            for i in range(self.pos, len(self.texts)):
+                c = self.texts[i]
+                if c.start_at <= start:
+                    start_chunk = i
+                    start_offset = start - c.start_at
+                if end <= c.start_at + len(c.text):
+                    end_chunk = i
+                    self.pos = i
+                    end_offset = end - c.start_at
+                    break
+            else:
+                self.pos = end_chunk = len(self.texts) - 1
+                end_offset = len(self.texts[-1].text)
+            assert start_chunk > -1
+            s, e = self.texts[start_chunk], self.texts[end_chunk]
+            if s.child is None:  # start in leading text of parent element
+                if e is s:  # end also in leading text of parent element
+                    before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
+                    self.elem.text = before
+                    w = self.make_wrapper(sentence)
+                    self.elem.insert(0, w)
+                    w.tail = after
+                    if after:
+                        self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
+                    else:
+                        self.pos += 1
+                    return w.get('id')
+                if e.is_tail:  # ending in the tail of a child
+                    before_start, after_start = s.text[:start_offset], s.text[start_offset:]
+                    included, after = e.text[:end_offset], e.text[end_offset:]
+                    e.child.tail = included
+                    self.elem.text = after_start
+                    w = self.wrap_contents(None, e.child)
+                    w.tail = after
+                    self.elem.text = before_start
+                    if after:
+                        self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
+                    else:
+                        self.pos += 1
+                    return w.get('id')
+                # ending inside a child
+                before_start, after_start = s.text[:start_offset], s.text[start_offset:]
+                included, after = e.text[:end_offset], e.text[end_offset:]
+                e.child.text = included
+                c = self.clone_simple_element(e.child)
+                c.text = after
+                e.child.tail = None
+                self.elem.text = after_start
+                w = self.wrap_contents(None, e.child)
+                self.elem.text = before_start
+                if after:
+                    self.texts[self.pos] = Chunk(c, c.text, end)
+                else:
+                    self.pos += 1
+                return w.get('id')
+            # starting in a child text or tail
+            if s.is_tail:
+                if e.is_tail:
+                    if s is e:  # end in tail of same element
+                        before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
+                        s.child.tail = before
+                        w = self.make_wrapper(sentence)
+                        w.tail = after
+                        idx = self.elem.index(s.child)
+                        self.elem.insert(idx + 1, w)
+                        if after:
+                            self.texts[self.pos] = Chunk(w, after, end, is_tail=True)
+                        else:
+                            self.pos += 1
+                        return w.get('id')
+                    s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
+                    e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
+                    idx = self.elem.index(s.child)
+                    w = self.wrap_contents(self.elem[idx+1], e.child)
+                    w.text, w.tail = after_start, after_end
+                    if after_end:
+                        self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
+                    else:
+                        self.pos += 1
+                    return w.get('id')
+                # end inside some subsequent simple element
+                s.child.tail, after_start = s.text[:start_offset], s.text[start_offset:]
+                e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
+                c = self.clone_simple_element(e.child)
+                c.text = after_end
+                e.child.tail = None
+                w = self.wrap_contents(self.elem[self.elem.index(s.child) + 1], e.child)
+                w.text = after_start
+                if after_end:
+                    self.texts[self.pos] = Chunk(c, after_end, end)
+                else:
+                    self.pos += 1
+                return w.get('id')
+            # start is in the text of a simple child
+            if s.child is e.child:
+                if e.is_tail:  # ending in tail of element we start in
+                    before_start, after_start = s.text[:start_offset], s.text[start_offset:]
+                    c = self.clone_simple_element(s.child)
+                    s.child.text, s.child.tail = before_start, None
+                    before_end, after_end = e.text[:end_offset], e.text[end_offset:]
+                    c.text, c.tail = after_start, before_end
+                    w = self.wrap_contents(c, c)
+                    w.tail = after_end
+                    if after_end:
+                        self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
+                    else:
+                        self.pos += 1
+                    return w.get('id')
+                # start and end in text of element
+                before, sentence, after = s.text[:start_offset], s.text[start_offset:end_offset], s.text[end_offset:]
+                c = self.clone_simple_element(s.child)
+                s.child.text, s.child.tail = before, None
+                c.text, c.tail = sentence, None
+                c2 = self.clone_simple_element(c)
+                c2.text = after
+                self.make_into_wrapper(c)
+                if after:
+                    self.texts[self.pos] = Chunk(c2, after, end)
+                else:
+                    self.pos += 1
+                return c.get('id')
+            # end is in a subsequent simple child or tail of one
+            s.child.text, after_start = s.text[:start_offset], s.text[start_offset:]
+            c = self.clone_simple_element(s.child)
+            c.text, s.child.tail = after_start, None
+            if e.is_tail:
+                e.child.tail, after_end = e.text[:end_offset], e.text[end_offset:]
+                w = self.wrap_contents(c, e.child)
+                w.tail = after_end
+                if after_end:
+                    self.texts[self.pos] = Chunk(w, after_end, end, is_tail=True)
+                else:
+                    self.pos += 1
+                return w.get('id')
+            # end is in text of subsequent simple child
+            e.child.text, after_end = e.text[:end_offset], e.text[end_offset:]
+            c2 = self.clone_simple_element(e.child)
+            c2.text, e.child.tail = after_end, None
+            w = self.wrap_contents(c, e.child)
+            if after_end:
+                self.texts[self.pos] = Chunk(c2, after_end, end)
+            else:
+                self.pos += 1
+            return w.get('id')
+
+    stack_of_parents = [Parent(elem, 'body', root_lang) for elem in root.iterchildren('*') if barename(elem.tag).lower() == 'body']
+    while stack_of_parents:
+        p = stack_of_parents.pop()
+        if len(p.elem) == 1 and not has_text(p.elem):  # wrapper
+            c = p.elem[0]
+            if isinstance(c.tag, str):
+                stack_of_parents.append(Parent(c, barename(c.tag).lower(), p.lang))
+                continue
+        for i in range(p.child_pos, len(p.children)):
+            child = p.children[i]
+            child_lang = lang_for_elem(child, p.lang)
+            child_tag_name = barename(child.tag).lower() if isinstance(child.tag, str) else ''
+            if child_lang == p.lang and child_tag_name in continued_tag_names and len(child) == 0:
+                p.add_simple_child(child)
+            elif child_tag_name not in ignored_tag_names:
+                stack_of_parents.append(Parent(child, child_tag_name, p.lang, child_lang))
+                p.commit()
+                p.child_pos = i + 1
+                stack_of_parents.append(p)
+                break
+            if text := child.tail:
+                p.add_tail(child, text)
+        p.commit()
+    for src_elem, clones in clones_map.items():
+        for clone in clones + [src_elem]:
+            if not clone.text and not clone.tail and not clone.get('id') and not clone.get('name'):
+                if (p := clone.getparent()) is not None:
+                    p.remove(clone)
+    return ans
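A minimal usage sketch of the new function (a sketch only: the sample markup and the html5_parser invocation are borrowed from the tests added below; mark_sentences_in_html mutates the parsed tree in place):

from html5_parser import parse
from lxml import html

from calibre.spell.break_iterator import mark_sentences_in_html

root = parse('<p>Hello, world! Good day to you</p>', namespace_elements=True)
for sentence in mark_sentences_in_html(root, lang='en'):
    # each Sentence records the generated id of its wrapper <span> (cttsw-1, cttsw-2, ...)
    print(sentence.elem_id, repr(sentence.text))
# the tree itself now contains the wrapper spans
print(html.tostring(root, encoding='unicode'))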
@@ -267,6 +267,49 @@ class TestICU(unittest.TestCase):
         }.items():
             self.ae(expected, list(split_into_sentences_for_tts(sentence, max_sentence_length=40)))
+
+    def test_mark_sentences(self):
+        from html5_parser import parse
+        from lxml import html
+
+        from calibre.spell.break_iterator import mark_sentences_in_html
+        for text, expected in reversed({
+            '<p id=1>hello cruel world': '<body><p id="1"><span id="1">hello cruel world</span></p>',
+
+            '<p>hello <b>cruel</b> world': '<body><p><span id="1">hello <b>cruel</b> world</span></p>',
+
+            '<p>Yes, please. Hello <b>cruel</b> world.':
+            '<body><p><span id="1">Yes, please. </span><span id="2">Hello <b>cruel</b> world.</span></p>',
+
+            '<p>Hello <b>cruel</b> <i>world. </i>':
+            '<body><p><span id="1">Hello <b>cruel</b> <i>world. </i></span></p>',
+
+            '<p>Yes, <b>please.</b> Well done! Bravissima! ':
+            '<body><p><span id="1">Yes, <b>please.</b> </span><span id="2">Well done! </span><span id="3">Bravissima! </span></p>',
+
+            '<p>Yes, <b>please.</b> Well <i>done! </i>Bravissima! ':
+            '<body><p><span id="1">Yes, <b>please.</b> </span><span id="2">Well <i>done! </i></span><span id="3">Bravissima! </span></p>',
+
+            '<p><i>Hello</i>, world! Good day to you':
+            '<body><p><span id="1"><i>Hello</i>, world! </span><span id="2">Good day to you</span></p>',
+
+            '<p><i>Hello, world! </i>Good day to you':
+            '<body><p><i id="1">Hello, world! </i><span id="2">Good day to you</span></p>',
+
+            '<p><i>Hello, </i><b>world!</b>Good day to you':
+            '<body><p><span id="1"><i>Hello, </i><b>world!</b></span><span id="2">Good day to you</span></p>',
+
+            '<p><i>Hello, </i><b>world</b>! Good day to you':
+            '<body><p><span id="1"><i>Hello, </i><b>world</b>! </span><span id="2">Good day to you</span></p>',
+        }.items()):
+            root = parse(text, namespace_elements=True)
+            mark_sentences_in_html(root)
+            actual = html.tostring(root, encoding='unicode')
+            actual = actual[actual.find('<body'):]
+            actual = actual[:actual.rfind('</body>')]
+            actual = actual.replace('cttsw-', '')
+            self.ae(expected, actual)
 
 
 def find_tests():
     return unittest.defaultTestLoader.loadTestsFromTestCase(TestICU)
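To run just these tests, something along these lines should work (a sketch; it assumes the TestICU module above is importable as calibre.utils.icu_test from a calibre development environment):

import unittest

# assumption: find_tests() shown above lives in calibre.utils.icu_test
from calibre.utils.icu_test import find_tests

unittest.TextTestRunner(verbosity=2).run(find_tests())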