Read Aloud: Allow configuring an extra pause at the end of every sentence when using the Piper TTS engine. Fixes #2083058 [Bug with The Piper TTS Engine](https://bugs.launchpad.net/calibre/+bug/2083058)

This commit is contained in:
Kovid Goyal 2024-09-28 12:12:19 +05:30
parent ed9f133e86
commit 0db769b01f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 39 additions and 3 deletions

View File

@ -3,6 +3,7 @@
from qt.core import (
QCheckBox,
QDoubleSpinBox,
QFont,
QFormLayout,
QHBoxLayout,
@ -71,6 +72,26 @@ class EngineChoice(QWidget):
self.engine_description.setText(metadata.description)
class SentenceDelay(QDoubleSpinBox):
    '''
    Spin box for choosing the pause, in seconds, inserted at the end of
    every sentence.

    The widget accepts values from 0 to 2 seconds in steps of 0.05 with two
    decimals of precision. The current setting is read and written through
    the ``val`` property.
    '''

    def __init__(self, parent=None):
        super().__init__(parent)
        self.setRange(0., 2.)
        self.setDecimals(2)
        self.setSuffix(_(' seconds'))
        self.setToolTip(_('The number of seconds to pause for at the end of a sentence.'))
        # Qt displays the special value text in place of the number when the
        # value equals the minimum (0 here).
        self.setSpecialValueText(_('no pause'))
        self.setSingleStep(0.05)

    @property
    def val(self) -> float:
        '''The configured pause in seconds, never negative.'''
        return max(0.0, self.value())

    @val.setter
    def val(self, v: float) -> None:
        # float() lets callers pass ints or numeric strings as well.
        self.setValue(float(v))
class FloatSlider(QSlider):
def __init__(self, minimum: float = -1, maximum: float = 1, factor: int = 10, parent=None):
@ -220,6 +241,8 @@ class EngineSpecificConfig(QWidget):
self.engine_specific_settings = {}
self.rate = r = FloatSlider(parent=self)
l.addRow(_('&Speed of speech:'), r)
self.sentence_delay = d = SentenceDelay(parent=self)
l.addRow(_('&Pause after sentence:'), d)
self.pitch = p = FloatSlider(parent=self)
l.addRow(_('&Pitch of speech:'), p)
self.volume = v = Volume(self)
@ -256,6 +279,7 @@ class EngineSpecificConfig(QWidget):
else:
self.layout().setRowVisible(self.output_module, False)
self.output_module.blockSignals(False)
self.layout().setRowVisible(self.sentence_delay, metadata.has_sentence_delay)
try:
s = self.engine_specific_settings[self.engine_name]
except KeyError:
@ -274,6 +298,8 @@ class EngineSpecificConfig(QWidget):
else:
self.layout().setRowVisible(self.volume, False)
self.volume.val = None
if metadata.has_sentence_delay:
self.sentence_delay.val = s.sentence_delay
self.audio_device.clear()
if metadata.allows_choosing_audio_device:
self.audio_device.addItem(_('System default (currently {})').format(self.default_audio_device.description), '')
@ -305,6 +331,8 @@ class EngineSpecificConfig(QWidget):
engine_name=self.engine_name,
rate=self.rate.val, voice_name=self.voices.val, pitch=self.pitch.val, volume=self.volume.val)
metadata = available_engines()[self.engine_name]
if metadata.has_sentence_delay:
ans = ans._replace(sentence_delay=self.sentence_delay.val)
if metadata.has_multiple_output_modules and self.output_module.currentIndex() > 0:
ans = ans._replace(output_module=self.output_module.currentData())
if metadata.allows_choosing_audio_device and self.audio_device.currentIndex() > 0:

View File

@ -286,7 +286,7 @@ class Piper(TTSBackend):
cmdline = list(piper_cmdline()) + [
'--model', model_path, '--config', config_path, '--output-raw', '--json-input',
'--sentence-silence', '0', '--length_scale', str(length_scale)]
'--sentence-silence', str(s.sentence_delay), '--length_scale', str(length_scale)]
if is_debugging():
cmdline.append('--debug')
self._process.setProgram(cmdline[0])

View File

@ -39,6 +39,7 @@ class EngineMetadata(NamedTuple):
can_change_volume: bool = True
voices_have_quality_metadata: bool = False
has_managed_voices: bool = False
has_sentence_delay: bool = False
class Quality(Enum):
@ -122,6 +123,7 @@ class EngineSpecificSettings(NamedTuple):
volume: float | None = None # 0 to 1, None is platform default volume
output_module: str = ''
engine_name: str = ''
sentence_delay: float = 0 # seconds >= 0
@classmethod
def create_from_prefs(cls, engine_name: str, prefs: dict[str, object]) -> 'EngineSpecificSettings':
@ -142,8 +144,11 @@ class EngineSpecificSettings(NamedTuple):
with suppress(Exception):
volume = max(0, min(float(prefs.get('volume')), 1))
om = str(prefs.get('output_module', ''))
sentence_delay = 0.
with suppress(Exception):
sentence_delay = max(0, float(prefs.get('sentence_delay')))
return EngineSpecificSettings(
voice_name=str(prefs.get('voice', '')), output_module=om,
voice_name=str(prefs.get('voice', '')), output_module=om, sentence_delay=sentence_delay,
audio_device_id=audio_device_id, rate=rate, pitch=pitch, volume=volume, engine_name=engine_name)
@classmethod
@ -166,6 +171,8 @@ class EngineSpecificSettings(NamedTuple):
ans['volume'] = self.volume
if self.output_module:
ans['output_module'] = self.output_module
if self.sentence_delay:
ans['sentence_delay'] = self.sentence_delay
return ans
def save_to_config(self, prefs:JSONConfig | None = None):
@ -219,7 +226,8 @@ def available_engines() -> dict[str, EngineMetadata]:
ans['piper'] = EngineMetadata('piper', _('The Piper Neural Engine'), _(
'The "piper" engine can track the currently spoken sentence on screen. It uses a neural network '
'for natural sounding voices. The neural network is run locally on your computer, it is fairly resource intensive to run.'
), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True)
), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True,
has_sentence_delay=True)
if islinux:
try:
from speechd.paths import SPD_SPAWN_CMD