Wire up the piper backend fully

This commit is contained in:
Kovid Goyal 2024-09-03 09:18:44 +05:30
parent 83e877b9bc
commit c4e4661e21
3 changed files with 67 additions and 9 deletions

View File

@@ -132,8 +132,9 @@ class Voices(QTreeWidget):
current_item = None
def qv(parent, voice):
nonlocal current_item
ans = QTreeWidgetItem(parent, [voice.short_text])
ans = QTreeWidgetItem(parent, [voice.short_text(engine_metadata)])
ans.setData(0, Qt.ItemDataRole.UserRole, voice)
ans.setToolTip(0, voice.tooltip(engine_metadata))
if current_voice == voice.name:
current_item = ans
return ans
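
In the config dialog's voice tree, the display label and tooltip now come from the Voice itself, with the engine metadata passed through so quality tags are only shown for engines that report them. Below is a minimal, self-contained sketch of that population pattern; the simplified Voice/EngineMetadata stand-ins and the direct PyQt6 imports (instead of calibre's qt.core wrapper) are illustrative only.

    # Sketch of the tree-population pattern above, with simplified stand-in types;
    # the real Voice and EngineMetadata live in calibre.gui2.tts2.types.
    import sys
    from typing import NamedTuple

    from PyQt6.QtCore import Qt
    from PyQt6.QtWidgets import QApplication, QTreeWidget, QTreeWidgetItem


    class EngineMetadata(NamedTuple):
        voices_have_quality_metadata: bool = False


    class Voice(NamedTuple):
        name: str
        quality: str = 'medium'
        notes: str = ''

        def short_text(self, m: EngineMetadata) -> str:
            # Only engines that publish quality metadata get the "[...]" suffix
            return f'{self.name} [{self.quality}]' if m.voices_have_quality_metadata else self.name

        def tooltip(self, m: EngineMetadata) -> str:
            return self.notes


    def populate(tree: QTreeWidget, voices: list[Voice], m: EngineMetadata, current: str) -> None:
        for v in voices:
            item = QTreeWidgetItem(tree, [v.short_text(m)])
            item.setData(0, Qt.ItemDataRole.UserRole, v)   # keep the Voice for later lookup
            item.setToolTip(0, v.tooltip(m))
            if v.name == current:
                tree.setCurrentItem(item)


    if __name__ == '__main__':
        app = QApplication(sys.argv)
        t = QTreeWidget()
        populate(t, [Voice('alice', 'high', 'A test voice'), Voice('bob')], EngineMetadata(True), 'alice')
        t.show()
        app.exec()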

View File

@@ -19,6 +19,7 @@ from qt.core import (
QDialog,
QIODevice,
QIODeviceBase,
QMediaDevices,
QObject,
QProcess,
Qt,
@@ -188,6 +189,7 @@ class Piper(TTSBackend):
self._process: QProcess | None = None
self._audio_sink: QAudioSink | None = None
self._current_voice: Voice | None = None
self._utterances_being_synthesized: deque[Utterance] = deque()
self._utterance_counter = count(start=1)
self._utterances_being_spoken = UtteranceAudioQueue()
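
The backend now tracks two stages explicitly: a deque of utterances still waiting to be written to the piper process, and the UtteranceAudioQueue holding synthesized audio for the QAudioSink. The following is a rough sketch of that producer/consumer split with simplified stand-in types; the real Utterance and queue classes in piper.py carry more state than this.

    # Two-stage pipeline sketch: text is split into utterances, queued for
    # synthesis, and each finished utterance moves on to the audio queue.
    from collections import deque
    from dataclasses import dataclass, field
    from itertools import count


    @dataclass
    class Utterance:            # simplified stand-in
        id: int
        text: str
        audio: bytes = b''


    @dataclass
    class Pipeline:
        counter: count = field(default_factory=lambda: count(start=1))
        to_synthesize: deque = field(default_factory=deque)   # waiting to be sent to piper
        to_speak: deque = field(default_factory=deque)        # synthesized, waiting for the audio sink

        def queue_text(self, sentences: list[str]) -> None:
            self.to_synthesize.extend(Utterance(next(self.counter), s) for s in sentences)

        def on_synthesized(self, audio: bytes) -> None:
            u = self.to_synthesize.popleft()
            u.audio = audio
            self.to_speak.append(u)


    p = Pipeline()
    p.queue_text(['Hello there.', 'How are you?'])
    p.on_synthesized(b'\x00\x01')
    print([u.id for u in p.to_speak], [u.id for u in p.to_synthesize])   # [1] [2]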
@@ -219,7 +221,10 @@
else:
self._set_error(f'Failed to start piper process: {cmdline}')
return
self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter)) # TODO: Use voice language
lang = 'en'
if self._current_voice and self._current_voice.language_code:
lang = self._current_voice.language_code
self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter, lang))
self._write_current_utterance()
def pause(self) -> None:
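
speak() no longer hard-codes English when splitting text into utterances: it uses the selected voice's language code and falls back to 'en' only when the voice does not declare one. A small illustration of that fallback, with a naive splitter standing in for split_into_utterances():

    # Hypothetical splitter; the real split_into_utterances() uses
    # language-aware rules and returns Utterance objects, not tuples.
    import re
    from itertools import count


    def split_sentences(text: str, counter: count, lang: str) -> list[tuple[int, str, str]]:
        parts = [p.strip() for p in re.split(r'(?<=[.!?])\s+', text) if p.strip()]
        return [(next(counter), lang, p) for p in parts]


    language_code = None            # e.g. Voice.language_code, may be empty
    lang = language_code or 'en'    # same fallback as in speak() above
    print(split_sentences('Hello there. How are you?', count(start=1), lang))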
@@ -284,9 +289,11 @@
raise Exception(str(e)) from e
if not model_path:
raise Exception('Could not download voice data')
with open(config_path) as f:
voice_metadata = json.load(f)
audio_rate = voice_metadata['audio']['sample_rate']
if 'metadata' not in voice.engine_data:
with open(config_path) as f:
voice.engine_data['metadata'] = json.load(f)
audio_rate = voice.engine_data['metadata']['audio']['sample_rate']
self._current_voice = voice
self._utterances_being_spoken.clear()
self._utterances_being_synthesized.clear()
self._errors_from_piper.clear()
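
Each piper voice ships a JSON config alongside its model; that config is now parsed once and cached in the voice's engine_data, and its audio sample rate is what the audio format below is built from. A hedged sketch of the caching step; the helper name is mine, only the engine_data['metadata'] layout mirrors the code above.

    import json


    def cached_voice_metadata(engine_data: dict, config_path: str) -> dict:
        # Parse the piper voice config only once and keep it on the voice's
        # engine_data, so repeated speak() calls do not reread the file.
        if 'metadata' not in engine_data:
            with open(config_path) as f:
                engine_data['metadata'] = json.load(f)
        return engine_data['metadata']


    # Usage sketch: the sample rate then drives the QAudioFormat below.
    # audio_rate = cached_voice_metadata(voice.engine_data, config_path)['audio']['sample_rate']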
@@ -309,7 +316,18 @@
fmt.setSampleFormat(QAudioFormat.SampleFormat.Int16)
fmt.setSampleRate(audio_rate)
fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
self._audio_sink = QAudioSink(fmt, self) # TODO: Make audio device configurable
dev = None
if s.audio_device_id:
for q in QMediaDevices.audioOutputs():
if bytes(q.id()) == s.audio_device_id.id:
dev = q
break
if dev:
self._audio_sink = QAudioSink(dev, fmt, self)
else:
self._audio_sink = QAudioSink(fmt, self)
if s.volume is not None:
self._audio_sink.setVolume(s.volume)
self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
self._process.start()
self._audio_sink.start(self._utterances_being_spoken)
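
The audio sink is now created on the output device selected in the TTS settings, matched by its raw id against QMediaDevices.audioOutputs(), with the default device as fallback, and the saved volume is applied. A standalone sketch of the same lookup using PyQt6's QtMultimedia directly; saved_device_id and saved_volume are placeholders for whatever the settings object (s) provides.

    import sys

    from PyQt6.QtCore import QCoreApplication
    from PyQt6.QtMultimedia import QAudioFormat, QAudioSink, QMediaDevices

    app = QCoreApplication(sys.argv)

    # Mono 16-bit PCM at the voice's sample rate, matching piper's raw output.
    fmt = QAudioFormat()
    fmt.setSampleFormat(QAudioFormat.SampleFormat.Int16)
    fmt.setSampleRate(22050)
    fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)

    saved_device_id = None   # bytes id of the configured output device, if any
    saved_volume = None      # configured volume in the 0.0-1.0 range, if any

    dev = None
    if saved_device_id is not None:
        for q in QMediaDevices.audioOutputs():
            if bytes(q.id()) == saved_device_id:   # QAudioDevice.id() is a QByteArray
                dev = q
                break

    sink = QAudioSink(dev, fmt) if dev is not None else QAudioSink(fmt)
    if saved_volume is not None:
        sink.setVolume(saved_volume)
    print(sink.format().sampleRate())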

View File

@@ -62,6 +62,16 @@ class Quality(Enum):
def from_piper_quality(self, x: str) -> 'Quality':
return {'x_low': Quality.ExtraLow, 'low': Quality.Low, 'medium': Quality.Medium, 'high': Quality.High}[x]
@property
def localized_name(self) -> str:
if self is Quality.Medium:
return _('Medium quality')
if self is Quality.Low:
return _('Low quality')
if self is Quality.ExtraLow:
return _('Extra low quality')
return _('High quality')
class Voice(NamedTuple):
name: str = ''
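
localized_name gives each piper quality tier a translatable label that short_text() can append to the voice name. A quick stand-alone illustration of the mapping; the member order and plain strings (no _() translation) are simplifications.

    from enum import Enum, auto


    class Quality(Enum):
        # Stand-in enum; member order here is illustrative, not calibre's.
        ExtraLow = auto()
        Low = auto()
        Medium = auto()
        High = auto()

        @property
        def localized_name(self) -> str:
            # calibre wraps these strings in _() so they can be translated.
            return {
                Quality.ExtraLow: 'Extra low quality',
                Quality.Low: 'Low quality',
                Quality.Medium: 'Medium quality',
                Quality.High: 'High quality',
            }[self]


    print(Quality.Medium.localized_name)     # Medium quality
    print(Quality.ExtraLow.localized_name)   # Extra low quality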
@@ -77,11 +87,30 @@
engine_data: dict[str, str] | None = None
@property
def short_text(self) -> str:
def basic_name(self) -> str:
return self.human_name or self.name or _('System default voice')
def short_text(self, m: EngineMetadata) -> str:
ans = self.basic_name
if self.country_code:
territory = QLocale.codeToTerritory(self.country_code)
ans += f' ({QLocale.territoryToString(territory)})'
if m.voices_have_quality_metadata:
ans += f' [{self.quality.localized_name}]'
return ans
def tooltip(self, m: EngineMetadata) -> str:
ans = []
if self.notes:
ans.append(self.notes)
if self.age is not QVoice.Age.Other:
ans.append(_('Age: {}').format(QVoice.ageName(self.age)))
if self.gender is not QVoice.Gender.Unknown:
ans.append(_('Gender: {}').format(QVoice.genderName(self.gender)))
return '\n'.join(ans)
def sort_key(self) -> tuple[Quality, str]:
return (self.quality, self.short_text.lower())
return (self.quality.value, self.basic_name.lower())
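
short_text() now composes the visible label from the voice name, the territory resolved from its country code, and, for engines with quality metadata, the quality tier, while tooltip() gathers optional notes, age and gender lines. A small sketch of the territory lookup with PyQt6's QLocale; the name and country code are example values.

    from PyQt6.QtCore import QLocale

    name = 'alice'          # example voice name
    country_code = 'GB'     # example ISO 3166 code stored on the voice

    label = name
    if country_code:
        territory = QLocale.codeToTerritory(country_code)
        label += f' ({QLocale.territoryToString(territory)})'
    print(label)            # e.g. "alice (United Kingdom)"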
@@ -190,10 +219,13 @@ def available_engines() -> dict[str, EngineMetadata]:
elif x == 'speechd':
continue
if islinux:
if piper_cmdline():
ans['piper'] = EngineMetadata('piper', TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True)
from speechd.paths import SPD_SPAWN_CMD
cmd = os.getenv("SPEECHD_CMD", SPD_SPAWN_CMD)
if cmd and os.access(cmd, os.X_OK) and os.path.isfile(cmd):
ans['speechd'] = EngineMetadata('speechd', TrackingCapability.WordByWord, allows_choosing_audio_device=False, has_multiple_output_modules=True)
return ans
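
On Linux, piper is registered ahead of speech-dispatcher whenever its command line can be resolved, with metadata advertising sentence-level tracking and per-voice quality information. A rough stand-in for that availability probe; the binary name and PATH-only check are assumptions, since the real piper_cmdline() helper is calibre-internal.

    import shutil
    import sys


    def piper_available() -> bool:
        # Stand-in probe: 'piper' on PATH is an assumed binary name;
        # piper_cmdline() may also resolve bundled binaries or overrides.
        return sys.platform.startswith('linux') and shutil.which('piper') is not None


    print(piper_available())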
@@ -202,6 +234,8 @@ def default_engine_name() -> str:
return 'sapi' if tweaks.get('prefer_winsapi') else 'winrt'
if ismacos:
return 'darwin'
if 'piper' in available_engines():
return 'piper'
if 'speechd' in available_engines():
return 'speechd'
return 'flite'
@@ -256,7 +290,12 @@ def create_tts_backend(force_engine: str | None = None) -> TTSBackend:
engine_name = engine_name or default_engine_name()
if engine_name not in available_engines():
engine_name = default_engine_name()
if engine_name == 'speechd':
if engine_name == 'piper':
if engine_name not in engine_instances:
from calibre.gui2.tts2.piper import Piper
engine_instances[engine_name] = Piper(engine_name, QApplication.instance())
ans = engine_instances[engine_name]
elif engine_name == 'speechd':
if engine_name not in engine_instances:
from calibre.gui2.tts2.speechd import SpeechdTTSBackend
engine_instances[engine_name] = SpeechdTTSBackend(engine_name, QApplication.instance())
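
The new piper branch mirrors the speechd one: the backend is constructed lazily, parented to the QApplication, and cached in engine_instances so later calls reuse the same object. A generic sketch of that per-engine singleton cache with a placeholder backend class:

    # Generic per-engine singleton cache; the real code keys real backend
    # classes (Piper, SpeechdTTSBackend, ...) and parents them to the QApplication.
    from typing import Callable

    engine_instances: dict[str, object] = {}


    def get_backend(name: str, factories: dict[str, Callable[[], object]]) -> object:
        if name not in engine_instances:
            engine_instances[name] = factories[name]()   # construct lazily, exactly once
        return engine_instances[name]


    class FakePiper:    # placeholder backend class
        pass


    a = get_backend('piper', {'piper': FakePiper})
    b = get_backend('piper', {'piper': FakePiper})
    print(a is b)       # True: the same cached instance is reused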