mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Make sentence splitting code testable
This commit is contained in:
parent
ab0b387dc7
commit
36d9b620c6
@ -32,7 +32,7 @@ from qt.core import (
|
|||||||
from calibre.constants import cache_dir, is_debugging
|
from calibre.constants import cache_dir, is_debugging
|
||||||
from calibre.gui2 import error_dialog
|
from calibre.gui2 import error_dialog
|
||||||
from calibre.gui2.tts2.types import EngineSpecificSettings, Quality, TTSBackend, Voice, piper_cmdline, widget_parent
|
from calibre.gui2.tts2.types import EngineSpecificSettings, Quality, TTSBackend, Voice, piper_cmdline, widget_parent
|
||||||
from calibre.spell.break_iterator import sentence_positions, split_into_words_and_positions
|
from calibre.spell.break_iterator import split_into_sentences_for_tts
|
||||||
from calibre.utils.localization import canonicalize_lang, get_lang
|
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||||
from calibre.utils.resources import get_path as P
|
from calibre.utils.resources import get_path as P
|
||||||
|
|
||||||
@ -58,7 +58,6 @@ class Utterance:
|
|||||||
synthesized: bool = False
|
synthesized: bool = False
|
||||||
|
|
||||||
|
|
||||||
PARAGRAPH_SEPARATOR = '\u2029'
|
|
||||||
UTTERANCE_SEPARATOR = b'\n'
|
UTTERANCE_SEPARATOR = b'\n'
|
||||||
|
|
||||||
|
|
||||||
@ -141,44 +140,17 @@ class UtteranceAudioQueue(QIODevice):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: int = 2048):
    '''
    Break an over-long sentence into smaller pieces at word boundaries.

    Yields ``(position, text)`` pairs. A sentence of at most ``limit``
    characters is yielded unchanged with ``position == offset``. Otherwise the
    words reported by split_into_words_and_positions() are accumulated and a
    piece is emitted as soon as the combined length of its words (the joining
    spaces are not counted) reaches ``limit``. Each piece is its words joined
    by single spaces and its position is ``offset`` plus the index of its
    first word inside ``sentence``.
    '''
    if len(sentence) <= limit:
        yield offset, sentence
        return

    words, char_count, first_word_at = [], 0, 0
    for pos, size in split_into_words_and_positions(sentence, lang):
        if not words:
            # Remember where the piece currently being built begins
            first_word_at = pos
        word = sentence[pos:pos + size]
        words.append(word)
        char_count += len(word)
        if char_count >= limit:
            yield offset + first_word_at, ' '.join(words)
            words, char_count = [], 0
    if words:
        # Emit whatever is left over after the last full piece
        yield offset + first_word_at, ' '.join(words)
|
|
||||||
|
|
||||||
|
|
||||||
def split_into_utterances(text: str, counter: count, lang: str = 'en'):
|
def split_into_utterances(text: str, counter: count, lang: str = 'en'):
|
||||||
text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
|
for start, sentence in split_into_sentences_for_tts(text, lang):
|
||||||
for start, length in sentence_positions(text, lang):
|
payload = json.dumps({'text': sentence}).encode('utf-8')
|
||||||
sentence = text[start:start+length].rstrip().replace('\n', ' ').strip()
|
ba = QByteArray()
|
||||||
if sentence:
|
ba.reserve(len(payload) + 1)
|
||||||
for start, sentence in split_long_sentences(sentence, start, lang):
|
ba.append(payload)
|
||||||
payload = json.dumps({'text': sentence}).encode('utf-8')
|
ba.append(UTTERANCE_SEPARATOR)
|
||||||
ba = QByteArray()
|
u = Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(),
|
||||||
ba.reserve(len(payload) + 1)
|
left_to_write=ba, start=start, length=len(sentence))
|
||||||
ba.append(payload)
|
debug(f'Utterance created {u.id} {start=}: {sentence!r}')
|
||||||
ba.append(UTTERANCE_SEPARATOR)
|
yield u
|
||||||
u = Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(),
|
|
||||||
left_to_write=ba, start=start, length=len(sentence))
|
|
||||||
debug(f'Utterance created {u.id}: {sentence}')
|
|
||||||
yield u
|
|
||||||
|
|
||||||
|
|
||||||
class Piper(TTSBackend):
|
class Piper(TTSBackend):
|
||||||
|
@ -68,3 +68,38 @@ def count_words(text, lang='en'):
|
|||||||
it = get_iterator(lang)
|
it = get_iterator(lang)
|
||||||
it.set_text(text)
|
it.set_text(text)
|
||||||
return it.count_words()
|
return it.count_words()
|
||||||
|
|
||||||
|
|
||||||
|
def split_long_sentences(sentence: str, offset: int, lang: str = 'en', limit: int = 2048):
    '''
    Break an over-long sentence into smaller pieces at word boundaries.

    Yields ``(position, text)`` pairs. A sentence of at most ``limit``
    characters is yielded unchanged with ``position == offset``. Otherwise the
    words reported by split_into_words_and_positions() are accumulated and a
    piece is emitted as soon as the combined length of its words (the joining
    spaces are not counted) reaches ``limit``. Each piece is its words joined
    by single spaces and its position is ``offset`` plus the index of its
    first word inside ``sentence``.
    '''
    if len(sentence) <= limit:
        yield offset, sentence
        return

    words, char_count, first_word_at = [], 0, 0
    for pos, size in split_into_words_and_positions(sentence, lang):
        if not words:
            # Remember where the piece currently being built begins
            first_word_at = pos
        word = sentence[pos:pos + size]
        words.append(word)
        char_count += len(word)
        if char_count >= limit:
            yield offset + first_word_at, ' '.join(words)
            words, char_count = [], 0
    if words:
        # Emit whatever is left over after the last full piece
        yield offset + first_word_at, ' '.join(words)
|
||||||
|
|
||||||
|
|
||||||
|
def split_into_sentences_for_tts(text: str, lang: str = 'en', PARAGRAPH_SEPARATOR: str = '\u2029'):
    '''
    Split text into sentences suitable for feeding to a TTS engine.

    Carriage returns and single newlines are turned into spaces, and every run
    of two or more newlines becomes PARAGRAPH_SEPARATOR padded with spaces to
    the length of the run. With a one character separator the normalized text
    therefore has the same length as the input, so offsets into one are valid
    offsets into the other.

    Yields ``(start, sentence)`` pairs. Sentence boundaries come from
    sentence_positions(); each sentence has its surrounding whitespace removed,
    empty sentences are skipped and the remainder is passed through
    split_long_sentences(), whose ``(start, sentence)`` pairs are yielded
    as-is. ``start`` accounts for any leading whitespace that was removed.
    '''
    import re

    def pad_paragraph_break(m):
        # Same-length replacement so that positions after it are not shifted
        return PARAGRAPH_SEPARATOR + ' ' * (len(m.group()) - 1)

    text = re.sub(r'\n{2,}', pad_paragraph_break, text.replace('\r', ' ')).replace('\n', ' ')
    # No newlines remain in text at this point, so the individual sentences
    # need no further newline handling.
    for pos, length in sentence_positions(text, lang):
        chunk = text[pos:pos + length]
        without_leading = chunk.lstrip()
        # Whitespace removed from the front has to be added to the offset,
        # otherwise the reported start points at the padding that precedes the
        # sentence (start of text, or the spaces that follow a paragraph break).
        pos += len(chunk) - len(without_leading)
        sentence = without_leading.rstrip()
        if sentence:
            yield from split_long_sentences(sentence, pos, lang)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user