mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
...
This commit is contained in:
parent
690988033e
commit
84aa726181
@ -1,11 +1,17 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
import json
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
from typing import NamedTuple
|
from typing import NamedTuple
|
||||||
|
|
||||||
|
from lxml.etree import ElementBase as Element
|
||||||
|
from lxml.etree import tostring as _tostring
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import barename
|
||||||
from calibre.spell.break_iterator import sentence_positions
|
from calibre.spell.break_iterator import sentence_positions
|
||||||
|
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||||
|
|
||||||
|
|
||||||
class Sentence(NamedTuple):
|
class Sentence(NamedTuple):
|
||||||
@ -15,29 +21,15 @@ class Sentence(NamedTuple):
|
|||||||
voice : str
|
voice : str
|
||||||
|
|
||||||
|
|
||||||
|
def tostring(x) -> str:
|
||||||
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
|
|
||||||
import json
|
|
||||||
|
|
||||||
from lxml.etree import ElementBase as Element
|
|
||||||
from lxml.etree import tostring as _tostring
|
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import barename
|
|
||||||
from calibre.utils.localization import canonicalize_lang, get_lang
|
|
||||||
continued_tag_names = frozenset({
|
|
||||||
'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
|
|
||||||
})
|
|
||||||
ignored_tag_names = frozenset({
|
|
||||||
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
|
|
||||||
})
|
|
||||||
|
|
||||||
def tostring(x) -> str:
|
|
||||||
return _tostring(x, encoding='unicode')
|
return _tostring(x, encoding='unicode')
|
||||||
|
|
||||||
def lang_for_elem(elem, parent_lang):
|
|
||||||
|
def lang_for_elem(elem, parent_lang):
|
||||||
return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
|
return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
|
||||||
|
|
||||||
def has_text(elem):
|
|
||||||
|
def has_text(elem):
|
||||||
if elem.text and elem.text.strip():
|
if elem.text and elem.text.strip():
|
||||||
return True
|
return True
|
||||||
for child in elem:
|
for child in elem:
|
||||||
@ -45,6 +37,23 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class Chunk(NamedTuple):
|
||||||
|
child: Element | None
|
||||||
|
text: str
|
||||||
|
start_at: int
|
||||||
|
is_tail: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
continued_tag_names = frozenset({
|
||||||
|
'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
|
||||||
|
})
|
||||||
|
ignored_tag_names = frozenset({
|
||||||
|
'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
|
||||||
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
|
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
|
||||||
root_voice = voice
|
root_voice = voice
|
||||||
seen_ids = set(root.xpath('//*/@id'))
|
seen_ids = set(root.xpath('//*/@id'))
|
||||||
@ -52,13 +61,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
|
|||||||
ans = []
|
ans = []
|
||||||
clones_map = defaultdict(list)
|
clones_map = defaultdict(list)
|
||||||
|
|
||||||
class Chunk(NamedTuple):
|
|
||||||
child: Element | None
|
|
||||||
text: str
|
|
||||||
start_at: int
|
|
||||||
is_tail: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
class Parent:
|
class Parent:
|
||||||
|
|
||||||
def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):
|
def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user