This commit is contained in:
Kovid Goyal 2024-10-15 16:19:14 +05:30
parent 690988033e
commit 84aa726181
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,11 +1,17 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import json
from collections import defaultdict
from contextlib import suppress
from typing import NamedTuple
from lxml.etree import ElementBase as Element
from lxml.etree import tostring as _tostring
from calibre.ebooks.oeb.base import barename
from calibre.spell.break_iterator import sentence_positions
from calibre.utils.localization import canonicalize_lang, get_lang
class Sentence(NamedTuple):
@ -15,36 +21,39 @@ class Sentence(NamedTuple):
voice : str
def tostring(x) -> str:
return _tostring(x, encoding='unicode')
def lang_for_elem(elem, parent_lang):
return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
def has_text(elem):
if elem.text and elem.text.strip():
return True
for child in elem:
if child.tail and child.tail.strip():
return True
return False
class Chunk(NamedTuple):
child: Element | None
text: str
start_at: int
is_tail: bool = False
continued_tag_names = frozenset({
'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
})
ignored_tag_names = frozenset({
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
})
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
import json
from lxml.etree import ElementBase as Element
from lxml.etree import tostring as _tostring
from calibre.ebooks.oeb.base import barename
from calibre.utils.localization import canonicalize_lang, get_lang
continued_tag_names = frozenset({
'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
})
ignored_tag_names = frozenset({
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
})
def tostring(x) -> str:
return _tostring(x, encoding='unicode')
def lang_for_elem(elem, parent_lang):
return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
def has_text(elem):
if elem.text and elem.text.strip():
return True
for child in elem:
if child.tail and child.tail.strip():
return True
return False
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
root_voice = voice
seen_ids = set(root.xpath('//*/@id'))
@ -52,13 +61,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
ans = []
clones_map = defaultdict(list)
class Chunk(NamedTuple):
child: Element | None
text: str
start_at: int
is_tail: bool = False
class Parent:
def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):