This commit is contained in:
Kovid Goyal 2024-10-15 16:19:14 +05:30
parent 690988033e
commit 84aa726181
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,11 +1,17 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import json
from collections import defaultdict from collections import defaultdict
from contextlib import suppress from contextlib import suppress
from typing import NamedTuple from typing import NamedTuple
from lxml.etree import ElementBase as Element
from lxml.etree import tostring as _tostring
from calibre.ebooks.oeb.base import barename
from calibre.spell.break_iterator import sentence_positions from calibre.spell.break_iterator import sentence_positions
from calibre.utils.localization import canonicalize_lang, get_lang
class Sentence(NamedTuple): class Sentence(NamedTuple):
@ -15,36 +21,39 @@ class Sentence(NamedTuple):
voice : str voice : str
def tostring(x) -> str:
    """Serialize *x* by delegating to lxml's ``tostring`` with ``encoding='unicode'``."""
    serialized = _tostring(x, encoding='unicode')
    return serialized
def lang_for_elem(elem, parent_lang):
    """Return the canonicalized language declared on *elem*, else *parent_lang*.

    The attributes are consulted in order: ``lang``, ``xml_lang`` and the
    XML-namespaced ``lang``. The first truthy value is handed to
    ``canonicalize_lang``; when that yields a falsy result, *parent_lang*
    is returned unchanged.
    """
    declared = elem.get('lang')
    if not declared:
        declared = elem.get('xml_lang')
    if not declared:
        declared = elem.get('{http://www.w3.org/XML/1998/namespace}lang')
    canonical = canonicalize_lang(declared)
    if canonical:
        return canonical
    return parent_lang
def has_text(elem):
    """Report whether *elem* directly holds any non-whitespace text.

    Only the element's own leading ``text`` and the ``tail`` of each direct
    child are inspected; text nested inside the children is not considered.
    """
    def is_non_blank(value):
        # None, '' and whitespace-only strings all count as blank.
        return bool(value and value.strip())

    return is_non_blank(elem.text) or any(is_non_blank(kid.tail) for kid in elem)
class Chunk(NamedTuple):
    """Immutable record pairing a piece of text with an optional element.

    NOTE(review): the fields are presumably filled in by
    ``mark_sentences_in_html`` while it walks an element's text and child
    tails -- confirm against the usage there.
    """

    # Associated element, or None when there is none.
    child: Element | None
    # The text content carried by this chunk.
    text: str
    # Integer offset for this chunk's text (presumably within the combined
    # text of the parent -- TODO confirm).
    start_at: int
    # Defaults to False; presumably True when ``text`` is the tail of
    # ``child`` rather than its own text -- TODO confirm.
    is_tail: bool = False
# Names of inline-level tags. NOTE(review): presumably elements whose text is
# treated as a continuation of the surrounding text -- confirm against usage.
continued_tag_names = frozenset((
    'a', 'span',
    'em', 'strong', 'b', 'i', 'u',
    'code', 'kbd',
    'sub', 'sup',
    'cite', 'q',
))
# Names of tags set apart from the text-bearing ones. NOTE(review): presumably
# elements that are skipped entirely when collecting text -- confirm against usage.
ignored_tag_names = frozenset((
    'head', 'title', 'script', 'style',
    'img', 'object', 'map', 'svg', 'math',
    'form', 'input', 'textarea',
    'br', 'hr',
    'rp', 'rt', 'rtc',
))
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]: def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
import json
from lxml.etree import ElementBase as Element
from lxml.etree import tostring as _tostring
from calibre.ebooks.oeb.base import barename
from calibre.utils.localization import canonicalize_lang, get_lang
continued_tag_names = frozenset({
'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
})
ignored_tag_names = frozenset({
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
})
def tostring(x) -> str:
return _tostring(x, encoding='unicode')
def lang_for_elem(elem, parent_lang):
return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
def has_text(elem):
if elem.text and elem.text.strip():
return True
for child in elem:
if child.tail and child.tail.strip():
return True
return False
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en') root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
root_voice = voice root_voice = voice
seen_ids = set(root.xpath('//*/@id')) seen_ids = set(root.xpath('//*/@id'))
@ -52,13 +61,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
ans = [] ans = []
clones_map = defaultdict(list) clones_map = defaultdict(list)
class Chunk(NamedTuple):
child: Element | None
text: str
start_at: int
is_tail: bool = False
class Parent: class Parent:
def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''): def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):