This commit is contained in:
Kovid Goyal 2024-10-15 16:19:14 +05:30
parent 690988033e
commit 84aa726181
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,11 +1,17 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import json
from collections import defaultdict from collections import defaultdict
from contextlib import suppress from contextlib import suppress
from typing import NamedTuple from typing import NamedTuple
from lxml.etree import ElementBase as Element
from lxml.etree import tostring as _tostring
from calibre.ebooks.oeb.base import barename
from calibre.spell.break_iterator import sentence_positions from calibre.spell.break_iterator import sentence_positions
from calibre.utils.localization import canonicalize_lang, get_lang
class Sentence(NamedTuple): class Sentence(NamedTuple):
@ -15,28 +21,14 @@ class Sentence(NamedTuple):
voice : str voice : str
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
import json
from lxml.etree import ElementBase as Element
from lxml.etree import tostring as _tostring
from calibre.ebooks.oeb.base import barename
from calibre.utils.localization import canonicalize_lang, get_lang
continued_tag_names = frozenset({
'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
})
ignored_tag_names = frozenset({
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
})
def tostring(x) -> str:
    """Serialize *x* to text using lxml's ``tostring``.

    ``encoding='unicode'`` asks lxml for a ``str`` result rather than
    encoded bytes.
    """
    serialized = _tostring(x, encoding='unicode')
    return serialized
def lang_for_elem(elem, parent_lang):
    """Return the canonical language for *elem*, falling back to *parent_lang*.

    The attributes are consulted in a fixed order and the first non-empty
    value wins. That value is passed through ``canonicalize_lang``; if the
    result is falsy, *parent_lang* is returned unchanged.
    """
    # NOTE(review): 'xml_lang' is presumably a parser-sanitized spelling of
    # ``xml:lang`` -- confirm against the HTML parser in use.
    attribute_names = (
        'lang',
        'xml_lang',
        '{http://www.w3.org/XML/1998/namespace}lang',
    )
    declared = None
    for attribute in attribute_names:
        declared = elem.get(attribute)
        if declared:
            break
    return canonicalize_lang(declared) or parent_lang
def has_text(elem): def has_text(elem):
if elem.text and elem.text.strip(): if elem.text and elem.text.strip():
return True return True
@ -45,12 +37,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
return True return True
return False return False
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
root_voice = voice
seen_ids = set(root.xpath('//*/@id'))
id_counter = 1
ans = []
clones_map = defaultdict(list)
class Chunk(NamedTuple): class Chunk(NamedTuple):
child: Element | None child: Element | None
@ -59,6 +45,22 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
is_tail: bool = False is_tail: bool = False
# Tag-name sets used by the sentence-marking code (the code that reads them
# is outside this chunk).

# Inline-level tag names.
# NOTE(review): presumably elements that may sit inside a sentence without
# ending it -- confirm against usage.
continued_tag_names = frozenset((
    'a', 'span', 'em', 'strong', 'b', 'i', 'u',
    'code', 'sub', 'sup', 'cite', 'q', 'kbd',
))

# NOTE(review): presumably elements whose content is skipped entirely when
# marking sentences -- confirm against usage.
ignored_tag_names = frozenset((
    'img', 'object', 'script', 'style', 'head', 'title',
    'form', 'input', 'br', 'hr', 'map', 'textarea',
    'svg', 'math', 'rp', 'rt', 'rtc',
))
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
root_voice = voice
seen_ids = set(root.xpath('//*/@id'))
id_counter = 1
ans = []
clones_map = defaultdict(list)
class Parent: class Parent:
def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''): def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):