Read Aloud: Allow configuring an extra pause at the end of every sentence when using the Piper TTS engine. Fixes #2083058 [Bug with The Piper TTS Engine](https://bugs.launchpad.net/calibre/+bug/2083058)

This commit is contained in:
Kovid Goyal 2024-09-28 12:12:19 +05:30
parent ed9f133e86
commit 0db769b01f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 39 additions and 3 deletions

View File

@ -3,6 +3,7 @@
from qt.core import ( from qt.core import (
QCheckBox, QCheckBox,
QDoubleSpinBox,
QFont, QFont,
QFormLayout, QFormLayout,
QHBoxLayout, QHBoxLayout,
@ -71,6 +72,26 @@ class EngineChoice(QWidget):
self.engine_description.setText(metadata.description) self.engine_description.setText(metadata.description)
class SentenceDelay(QDoubleSpinBox):
    '''Spin box for choosing the pause, in seconds, added at the end of every sentence.

    The widget accepts values from 0 to 2 seconds, displayed with two decimals
    and adjusted in steps of 0.05. Use the ``val`` property to read or set the
    delay as a float.
    '''

    def __init__(self, parent=None):
        super().__init__(parent)
        self.setRange(0., 2.)
        self.setDecimals(2)
        self.setSuffix(_(' seconds'))
        self.setToolTip(_('The number of seconds to pause for at the end of a sentence.'))
        # Special value text is displayed instead of the number when the
        # value equals the minimum of the range, which is 0 here.
        self.setSpecialValueText(_('no pause'))
        self.setSingleStep(0.05)

    @property
    def val(self) -> float:
        '''The currently selected delay in seconds, never negative.'''
        # The value is a float, not a str; clamp defensively at zero.
        return max(0.0, self.value())

    @val.setter
    def val(self, v) -> None:
        '''Set the delay from anything float() can convert (number or numeric string).'''
        self.setValue(float(v))
class FloatSlider(QSlider): class FloatSlider(QSlider):
def __init__(self, minimum: float = -1, maximum: float = 1, factor: int = 10, parent=None): def __init__(self, minimum: float = -1, maximum: float = 1, factor: int = 10, parent=None):
@ -220,6 +241,8 @@ class EngineSpecificConfig(QWidget):
self.engine_specific_settings = {} self.engine_specific_settings = {}
self.rate = r = FloatSlider(parent=self) self.rate = r = FloatSlider(parent=self)
l.addRow(_('&Speed of speech:'), r) l.addRow(_('&Speed of speech:'), r)
self.sentence_delay = d = SentenceDelay(parent=self)
l.addRow(_('&Pause after sentence:'), d)
self.pitch = p = FloatSlider(parent=self) self.pitch = p = FloatSlider(parent=self)
l.addRow(_('&Pitch of speech:'), p) l.addRow(_('&Pitch of speech:'), p)
self.volume = v = Volume(self) self.volume = v = Volume(self)
@ -256,6 +279,7 @@ class EngineSpecificConfig(QWidget):
else: else:
self.layout().setRowVisible(self.output_module, False) self.layout().setRowVisible(self.output_module, False)
self.output_module.blockSignals(False) self.output_module.blockSignals(False)
self.layout().setRowVisible(self.sentence_delay, metadata.has_sentence_delay)
try: try:
s = self.engine_specific_settings[self.engine_name] s = self.engine_specific_settings[self.engine_name]
except KeyError: except KeyError:
@ -274,6 +298,8 @@ class EngineSpecificConfig(QWidget):
else: else:
self.layout().setRowVisible(self.volume, False) self.layout().setRowVisible(self.volume, False)
self.volume.val = None self.volume.val = None
if metadata.has_sentence_delay:
self.sentence_delay.val = s.sentence_delay
self.audio_device.clear() self.audio_device.clear()
if metadata.allows_choosing_audio_device: if metadata.allows_choosing_audio_device:
self.audio_device.addItem(_('System default (currently {})').format(self.default_audio_device.description), '') self.audio_device.addItem(_('System default (currently {})').format(self.default_audio_device.description), '')
@ -305,6 +331,8 @@ class EngineSpecificConfig(QWidget):
engine_name=self.engine_name, engine_name=self.engine_name,
rate=self.rate.val, voice_name=self.voices.val, pitch=self.pitch.val, volume=self.volume.val) rate=self.rate.val, voice_name=self.voices.val, pitch=self.pitch.val, volume=self.volume.val)
metadata = available_engines()[self.engine_name] metadata = available_engines()[self.engine_name]
if metadata.has_sentence_delay:
ans = ans._replace(sentence_delay=self.sentence_delay.val)
if metadata.has_multiple_output_modules and self.output_module.currentIndex() > 0: if metadata.has_multiple_output_modules and self.output_module.currentIndex() > 0:
ans = ans._replace(output_module=self.output_module.currentData()) ans = ans._replace(output_module=self.output_module.currentData())
if metadata.allows_choosing_audio_device and self.audio_device.currentIndex() > 0: if metadata.allows_choosing_audio_device and self.audio_device.currentIndex() > 0:

View File

@ -286,7 +286,7 @@ class Piper(TTSBackend):
cmdline = list(piper_cmdline()) + [ cmdline = list(piper_cmdline()) + [
'--model', model_path, '--config', config_path, '--output-raw', '--json-input', '--model', model_path, '--config', config_path, '--output-raw', '--json-input',
'--sentence-silence', '0', '--length_scale', str(length_scale)] '--sentence-silence', str(s.sentence_delay), '--length_scale', str(length_scale)]
if is_debugging(): if is_debugging():
cmdline.append('--debug') cmdline.append('--debug')
self._process.setProgram(cmdline[0]) self._process.setProgram(cmdline[0])

View File

@ -39,6 +39,7 @@ class EngineMetadata(NamedTuple):
can_change_volume: bool = True can_change_volume: bool = True
voices_have_quality_metadata: bool = False voices_have_quality_metadata: bool = False
has_managed_voices: bool = False has_managed_voices: bool = False
has_sentence_delay: bool = False
class Quality(Enum): class Quality(Enum):
@ -122,6 +123,7 @@ class EngineSpecificSettings(NamedTuple):
volume: float | None = None # 0 to 1, None is platform default volume volume: float | None = None # 0 to 1, None is platform default volume
output_module: str = '' output_module: str = ''
engine_name: str = '' engine_name: str = ''
sentence_delay: float = 0 # seconds >= 0
@classmethod @classmethod
def create_from_prefs(cls, engine_name: str, prefs: dict[str, object]) -> 'EngineSpecificSettings': def create_from_prefs(cls, engine_name: str, prefs: dict[str, object]) -> 'EngineSpecificSettings':
@ -142,8 +144,11 @@ class EngineSpecificSettings(NamedTuple):
with suppress(Exception): with suppress(Exception):
volume = max(0, min(float(prefs.get('volume')), 1)) volume = max(0, min(float(prefs.get('volume')), 1))
om = str(prefs.get('output_module', '')) om = str(prefs.get('output_module', ''))
sentence_delay = 0.
with suppress(Exception):
sentence_delay = max(0, float(prefs.get('sentence_delay')))
return EngineSpecificSettings( return EngineSpecificSettings(
voice_name=str(prefs.get('voice', '')), output_module=om, voice_name=str(prefs.get('voice', '')), output_module=om, sentence_delay=sentence_delay,
audio_device_id=audio_device_id, rate=rate, pitch=pitch, volume=volume, engine_name=engine_name) audio_device_id=audio_device_id, rate=rate, pitch=pitch, volume=volume, engine_name=engine_name)
@classmethod @classmethod
@ -166,6 +171,8 @@ class EngineSpecificSettings(NamedTuple):
ans['volume'] = self.volume ans['volume'] = self.volume
if self.output_module: if self.output_module:
ans['output_module'] = self.output_module ans['output_module'] = self.output_module
if self.sentence_delay:
ans['sentence_delay'] = self.sentence_delay
return ans return ans
def save_to_config(self, prefs:JSONConfig | None = None): def save_to_config(self, prefs:JSONConfig | None = None):
@ -219,7 +226,8 @@ def available_engines() -> dict[str, EngineMetadata]:
ans['piper'] = EngineMetadata('piper', _('The Piper Neural Engine'), _( ans['piper'] = EngineMetadata('piper', _('The Piper Neural Engine'), _(
'The "piper" engine can track the currently spoken sentence on screen. It uses a neural network ' 'The "piper" engine can track the currently spoken sentence on screen. It uses a neural network '
'for natural sounding voices. The neural network is run locally on your computer, it is fairly resource intensive to run.' 'for natural sounding voices. The neural network is run locally on your computer, it is fairly resource intensive to run.'
), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True) ), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True,
has_sentence_delay=True)
if islinux: if islinux:
try: try:
from speechd.paths import SPD_SPAWN_CMD from speechd.paths import SPD_SPAWN_CMD