Read Aloud: Allow configuring an extra pause at the end of every sentence when using the Piper TTS engine. Fixes #2083058 [Bug with The Piper TTS Engine](https://bugs.launchpad.net/calibre/+bug/2083058)

2025-07-09 03:04:10 -04:00 · 2024-09-28 12:12:19 +05:30 · 2024-09-28 12:12:19 +05:30 · 0db769b01f
commit 0db769b01f
parent ed9f133e86
3 changed files with 39 additions and 3 deletions
--- a/src/calibre/gui2/tts/config.py
+++ b/src/calibre/gui2/tts/config.py
@ -3,6 +3,7 @@

 from qt.core import (
    QCheckBox,
+    QDoubleSpinBox,
    QFont,
    QFormLayout,
    QHBoxLayout,
@ -71,6 +72,26 @@ class EngineChoice(QWidget):
        self.engine_description.setText(metadata.description)


+class SentenceDelay(QDoubleSpinBox):
+    ''' Spin box for choosing the pause, in seconds, added at the end of each sentence. '''
+    def __init__(self, parent=None):
+        super().__init__(parent)
+        self.setRange(0., 2.)  # allowed pause: 0 to 2 seconds
+        self.setDecimals(2)
+        self.setSuffix(_(' seconds'))
+        self.setToolTip(_('The number of seconds to pause for at the end of a sentence.'))
+        self.setSpecialValueText(_('no pause'))  # Qt displays this text when the value equals the minimum (0)
+        self.setSingleStep(0.05)
+
+    @property
+    def val(self) -> float:  # current pause in seconds, clamped so it is never negative
+        return max(0.0, self.value())
+
+    @val.setter
+    def val(self, v) -> None:  # v is coerced with float() before being applied to the spin box
+        self.setValue(float(v))
+
+
 class FloatSlider(QSlider):

    def __init__(self, minimum: float = -1, maximum: float = 1, factor: int = 10, parent=None):
@ -220,6 +241,8 @@ class EngineSpecificConfig(QWidget):
        self.engine_specific_settings = {}
        self.rate = r = FloatSlider(parent=self)
        l.addRow(_('&Speed of speech:'), r)
+        self.sentence_delay = d = SentenceDelay(parent=self)
+        l.addRow(_('&Pause after sentence:'), d)
        self.pitch = p = FloatSlider(parent=self)
        l.addRow(_('&Pitch of speech:'), p)
        self.volume = v = Volume(self)
@ -256,6 +279,7 @@ class EngineSpecificConfig(QWidget):
        else:
            self.layout().setRowVisible(self.output_module, False)
        self.output_module.blockSignals(False)
+        self.layout().setRowVisible(self.sentence_delay, metadata.has_sentence_delay)
        try:
            s = self.engine_specific_settings[self.engine_name]
        except KeyError:
@ -274,6 +298,8 @@ class EngineSpecificConfig(QWidget):
        else:
            self.layout().setRowVisible(self.volume, False)
            self.volume.val = None
+        if metadata.has_sentence_delay:
+            self.sentence_delay.val = s.sentence_delay
        self.audio_device.clear()
        if metadata.allows_choosing_audio_device:
            self.audio_device.addItem(_('System default (currently {})').format(self.default_audio_device.description), '')
@ -305,6 +331,8 @@ class EngineSpecificConfig(QWidget):
            engine_name=self.engine_name,
            rate=self.rate.val, voice_name=self.voices.val, pitch=self.pitch.val, volume=self.volume.val)
        metadata = available_engines()[self.engine_name]
+        if metadata.has_sentence_delay:
+            ans = ans._replace(sentence_delay=self.sentence_delay.val)
        if metadata.has_multiple_output_modules and self.output_module.currentIndex() > 0:
            ans = ans._replace(output_module=self.output_module.currentData())
        if metadata.allows_choosing_audio_device and self.audio_device.currentIndex() > 0:
--- a/src/calibre/gui2/tts/piper.py
+++ b/src/calibre/gui2/tts/piper.py
@ -286,7 +286,7 @@ class Piper(TTSBackend):

            cmdline = list(piper_cmdline()) + [
                '--model', model_path, '--config', config_path, '--output-raw', '--json-input',
-                '--sentence-silence', '0', '--length_scale', str(length_scale)]
+                '--sentence-silence', str(s.sentence_delay), '--length_scale', str(length_scale)]
            if is_debugging():
                cmdline.append('--debug')
            self._process.setProgram(cmdline[0])
--- a/src/calibre/gui2/tts/types.py
+++ b/src/calibre/gui2/tts/types.py
@ -39,6 +39,7 @@ class EngineMetadata(NamedTuple):
    can_change_volume: bool = True
    voices_have_quality_metadata: bool = False
    has_managed_voices: bool = False
+    has_sentence_delay: bool = False


 class Quality(Enum):
@ -122,6 +123,7 @@ class EngineSpecificSettings(NamedTuple):
    volume: float | None = None  # 0 to 1, None is platform default volume
    output_module: str = ''
    engine_name: str = ''
+    sentence_delay: float = 0  # seconds >= 0

    @classmethod
    def create_from_prefs(cls, engine_name: str, prefs: dict[str, object]) -> 'EngineSpecificSettings':
@ -142,8 +144,11 @@ class EngineSpecificSettings(NamedTuple):
        with suppress(Exception):
            volume = max(0, min(float(prefs.get('volume')), 1))
        om = str(prefs.get('output_module', ''))
+        sentence_delay = 0.
+        with suppress(Exception):
+            sentence_delay = max(0, float(prefs.get('sentence_delay')))
        return EngineSpecificSettings(
-            voice_name=str(prefs.get('voice', '')), output_module=om,
+            voice_name=str(prefs.get('voice', '')), output_module=om, sentence_delay=sentence_delay,
            audio_device_id=audio_device_id, rate=rate, pitch=pitch, volume=volume, engine_name=engine_name)

    @classmethod
@ -166,6 +171,8 @@ class EngineSpecificSettings(NamedTuple):
            ans['volume'] = self.volume
        if self.output_module:
            ans['output_module'] = self.output_module
+        if self.sentence_delay:
+            ans['sentence_delay'] = self.sentence_delay
        return ans

    def save_to_config(self, prefs:JSONConfig | None = None):
@ -219,7 +226,8 @@ def available_engines() -> dict[str, EngineMetadata]:
        ans['piper'] = EngineMetadata('piper', _('The Piper Neural Engine'), _(
            'The "piper" engine can track the currently spoken sentence on screen. It uses a neural network '
            'for natural sounding voices. The neural network is run locally on your computer, it is fairly resource intensive to run.'
-        ), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True)
+        ), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True,
+        has_sentence_delay=True)
    if islinux:
        try:
            from speechd.paths import SPD_SPAWN_CMD