From ac923e630a86602560b0ad221ca5bd0ddc772efe Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 2 Sep 2024 21:14:36 +0530
Subject: [PATCH] Wire up auto-download of piper voice

---
 setup/piper.py                   |   6 +-
 src/calibre/gui2/tts2/manager.py |  13 ++--
 src/calibre/gui2/tts2/piper.py   | 110 +++++++++++++++++++++++--------
 src/calibre/gui2/tts2/types.py   |   8 ++-
 4 files changed, 100 insertions(+), 37 deletions(-)

diff --git a/setup/piper.py b/setup/piper.py
index 873b540c91..b392fef343 100644
--- a/setup/piper.py
+++ b/setup/piper.py
@@ -32,13 +32,13 @@ class PiperVoices(ReVendor):
             src = self.download_securely(url).decode('utf-8')
         lang_map = {}
         current_lang = current_voice = ''
-        lang_pat = re.compile(r'`(.+?)`')
+        lang_pat = re.compile(r'\((.+?)\)')
         model_pat = re.compile(r'\[model\]\((.+?)\)')
         config_pat = re.compile(r'\[config\]\((.+?)\)')
         for line in src.splitlines():
             if line.startswith('* '):
                 if m := lang_pat.search(line):
-                    current_lang = m.group(1)
+                    current_lang = m.group(1).partition(',')[0].replace('`', '')
                     lang_map[current_lang] = {}
                     current_voice = ''
             else:
@@ -62,6 +62,8 @@ class PiperVoices(ReVendor):
                     lang_map[current_lang][current_voice] = {}
         if not lang_map:
             raise SystemExit(f'Failed to read any piper voices from: {url}')
+        if 'en_US' not in lang_map:
+            raise SystemExit(f'Failed to read en_US piper voices from: {url}')
         with open(self.output_file_path, 'w') as f:
             json.dump({'version': 1, 'lang_map': lang_map}, f, indent=2, sort_keys=False)
 
diff --git a/src/calibre/gui2/tts2/manager.py b/src/calibre/gui2/tts2/manager.py
index e25d07b145..260dca7183 100644
--- a/src/calibre/gui2/tts2/manager.py
+++ b/src/calibre/gui2/tts2/manager.py
@@ -6,7 +6,7 @@ from collections import deque
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, NamedTuple
 
-from qt.core import QApplication, QDialog, QObject, QTextToSpeech, QWidget, pyqtSignal
+from qt.core import QApplication, QDialog, QObject, QTextToSpeech, pyqtSignal
 
 from calibre.gui2 import error_dialog
 from calibre.gui2.widgets import BusyCursor
@@ -125,7 +125,10 @@ class TTSManager(QObject):
         if self._tts is None:
             with BusyCursor():
                 from calibre.gui2.tts2.types import create_tts_backend
-                self._tts = create_tts_backend()
+                try:
+                    self._tts = create_tts_backend()
+                except AttributeError as e:
+                    raise Exception(str(e)) from e
                 self._tts.state_changed.connect(self._state_changed)
                 self._tts.saying.connect(self._saying)
         return self._tts
@@ -185,11 +188,9 @@ class TTSManager(QObject):
 
     def configure(self) -> None:
         from calibre.gui2.tts2.config import ConfigDialog
-        p = self
-        while p is not None and not isinstance(p, QWidget):
-            p = p.parent()
+        from calibre.gui2.tts2.types import widget_parent
         with self.resume_after() as rd:
-            d = ConfigDialog(parent=p)
+            d = ConfigDialog(parent=widget_parent(self))
             if d.exec() == QDialog.DialogCode.Accepted and self._tts is not None:
                 rd.needs_full_resume = True
                 if d.engine_changed:
diff --git a/src/calibre/gui2/tts2/piper.py b/src/calibre/gui2/tts2/piper.py
index deae62fef2..a545e13c7c 100644
--- a/src/calibre/gui2/tts2/piper.py
+++ b/src/calibre/gui2/tts2/piper.py
@@ -11,12 +11,12 @@ from dataclasses import dataclass
 from itertools import count
 from time import monotonic
 
-from qt.core import QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
+from qt.core import QAudio, QAudioFormat, QAudioSink, QByteArray, QDialog, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
 
-from calibre.constants import is_debugging
-from calibre.gui2.tts2.types import Quality, TTSBackend, Voice, piper_cmdline
+from calibre.constants import cache_dir, is_debugging
+from calibre.gui2.tts2.types import EngineSpecificSettings, Quality, TTSBackend, Voice, piper_cmdline, widget_parent
 from calibre.spell.break_iterator import sentence_positions, split_into_words_and_positions
-from calibre.utils.localization import canonicalize_lang
+from calibre.utils.localization import canonicalize_lang, get_lang
 from calibre.utils.resources import get_path as P
 
 
@@ -178,7 +178,7 @@ class Piper(TTSBackend):
         self._utterances_being_spoken.saying.connect(self.saying)
         self._utterances_being_spoken.update_status.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
         self._state = QTextToSpeech.State.Ready
-        self._voices = None
+        self._voices = self._voice_for_lang = None
         self._last_error = ''
         self._errors_from_piper: list[str] = []
         self._pending_stderr_data = b''
@@ -189,25 +189,7 @@ class Piper(TTSBackend):
 
     @property
     def available_voices(self) -> dict[str, tuple[Voice, ...]]:
-        if self._voices is None:
-            d = json.loads(P('piper-voices.json', data=True))
-            ans = []
-            for bcp_code, voice_map in d['lang_map'].items():
-                lang, sep, country = bcp_code.partition('_')
-                lang = canonicalize_lang(lang) or lang
-                for voice_name, qual_map in voice_map.items():
-                    best_qual = voice = None
-                    for qual, e in qual_map.items():
-                        q = Quality.from_piper_quality(qual)
-                        if best_qual is None or q.value < best_qual.value:
-                            best_qual = q
-                            voice = Voice(voice_name, lang, country, quality=q, engine_data={
-                                'model_url': e['model'], 'config_url': e['config'],
-                                'model_filename': f'{bcp_code}-{voice_name}-{qual}.onnx',
-                            })
-                    if voice:
-                        ans.append(voice)
-            self._voices = tuple(ans)
+        self._load_voice_metadata()
         return {'': self._voices}
 
     def say(self, text: str) -> None:
@@ -275,6 +257,20 @@ class Piper(TTSBackend):
     @property
     def process(self) -> QProcess:
         if self._process is None:
+            model_path = config_path = ''
+            try:
+                self._load_voice_metadata()
+                s = EngineSpecificSettings.create_from_config(self.engine_name)
+                rate = max(0.1, 1 + s.rate)  # maps -1 to 1 to 0.1 to 2
+                voice = self._voice_name_map.get(s.voice_name) or self._default_voice
+                model_path, config_path = self._ensure_voice_is_downloaded(voice)
+            except AttributeError as e:
+                raise Exception(str(e)) from e
+            if not model_path:
+                raise Exception('Could not download voice data')
+            with open(config_path) as f:
+                voice_metadata = json.load(f)
+                audio_rate = voice_metadata['audio']['sample_rate']
             self._utterances_being_spoken.clear()
             self._utterances_being_synthesized.clear()
             self._errors_from_piper.clear()
@@ -282,10 +278,9 @@ class Piper(TTSBackend):
             self._pending_stderr_data = b''
             self._set_state(QTextToSpeech.State.Ready)
 
-            model_path =  '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
-            rate = 1.0  # TODO: Make rate configurable
             cmdline = list(piper_cmdline()) + [
-                '--model', model_path, '--output-raw', '--json-input', '--sentence-silence', '0', '--length_scale', str(rate)]
+                '--model', model_path, '--config', config_path, '--output-raw', '--json-input',
+                '--sentence-silence', '0', '--length_scale', str(rate)]
             if is_debugging():
                 cmdline.append('--debug')
             self._process.setProgram(cmdline[0])
@@ -296,7 +291,7 @@ class Piper(TTSBackend):
             self._process.stateChanged.connect(self._update_status)
             fmt = QAudioFormat()
             fmt.setSampleFormat(QAudioFormat.SampleFormat.Int16)
-            fmt.setSampleRate(22050)  # TODO: Read this from voice JSON
+            fmt.setSampleRate(audio_rate)
             fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
             self._audio_sink = QAudioSink(fmt, self)  # TODO: Make audio device configurable
             self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
@@ -383,6 +378,65 @@ class Piper(TTSBackend):
     def audio_sink_state_changed(self, state: QAudio.State) -> None:
         self._update_status()
 
+    def _load_voice_metadata(self) -> None:  # parse piper-voices.json once and fill self._voices, self._voice_name_map and self._voice_for_lang
+        if self._voices is not None:  # metadata already loaded on this instance, nothing to do
+            return
+        d = json.loads(P('piper-voices.json', data=True))
+        ans = []  # every selected Voice, in the iteration order of the JSON
+        lang_voices_map = {}  # language code (after canonicalize_lang) -> list of Voice for that language
+        self._voice_name_map = {}  # voice.name -> Voice; looked up with the configured voice name elsewhere in this class
+        for bcp_code, voice_map in d['lang_map'].items():
+            lang, sep, country = bcp_code.partition('_')  # split the key on its first '_' into language and country parts
+            lang = canonicalize_lang(lang) or lang  # keep the raw code when canonicalize_lang() returns something falsy
+            voices_for_lang = lang_voices_map.setdefault(lang, [])  # several bcp codes can share one language entry
+            for voice_name, qual_map in voice_map.items():
+                best_qual = voice = None
+                for qual, e in qual_map.items():
+                    q = Quality.from_piper_quality(qual)
+                    if best_qual is None or q.value < best_qual.value:  # keep one Voice per voice name: the quality with the smallest Quality.value
+                        best_qual = q
+                        voice = Voice(bcp_code + ':' + voice_name, lang, country, human_name=voice_name, quality=q, engine_data={
+                            'model_url': e['model'], 'config_url': e['config'],
+                            'model_filename': f'{bcp_code}-{voice_name}-{qual}.onnx',
+                        })
+                if voice:  # skipped when qual_map was empty, leaving voice as None
+                    ans.append(voice)
+                    self._voice_name_map[voice.name] = voice
+                    voices_for_lang.append(voice)
+        self._voices = tuple(ans)
+        self._voice_for_lang = {}  # language code -> Voice used as the default for that language
+        for lang, voices in lang_voices_map.items():
+            voices.sort(key=lambda v: v.quality.value)  # smallest Quality.value first
+            self._voice_for_lang[lang] = voices[0]  # NOTE(review): IndexError if a language has no voices (empty voice_map) - confirm the JSON never contains one
+            if lang == 'eng':  # for English, the voice named 'libritts' overrides the sorted-first choice when present
+                for v in voices:
+                    if v.human_name == 'libritts':
+                        self._voice_for_lang[lang] = v
+                        break
+
+    @property
+    def _default_voice(self) -> Voice:  # Voice to use when the configured voice name is not in self._voice_name_map
+        self._load_voice_metadata()  # makes sure self._voice_for_lang has been built
+        lang = get_lang()  # presumably the current calibre interface language - confirm against calibre.utils.localization
+        lang = canonicalize_lang(lang) or lang  # same normalisation as applied to the keys of self._voice_for_lang
+        return self._voice_for_lang.get(lang) or self._voice_for_lang['eng']  # fall back to the English default; KeyError if there is no 'eng' entry
+
+    def _ensure_voice_is_downloaded(self, voice: Voice) -> tuple[str, str]:  # returns (model_path, config_path), or ('', '') when the download dialog is not accepted
+        fname = voice.engine_data['model_filename']
+        model_path = os.path.join(cache_dir(), 'piper-voices', fname)  # voice files live in a 'piper-voices' folder under cache_dir()
+        config_path = os.path.join(os.path.dirname(model_path), fname + '.json')  # config sits next to the model: same filename plus '.json'
+        if os.path.exists(model_path) and os.path.exists(config_path):  # both files already present, so no dialog is shown
+            return model_path, config_path
+        os.makedirs(os.path.dirname(model_path), exist_ok=True)
+        from calibre.gui2.tts2.download import DownloadResources
+        d = DownloadResources(_('Downloading voice data'), _('Downloading neural network for the {} voice').format(voice.human_name), {
+            voice.engine_data['model_url']: (model_path, _('Neural network data')),  # mapping of source URL -> (destination path, label)
+            voice.engine_data['config_url']: (config_path, _('Neural network metadata')),
+        }, parent=widget_parent(self))  # parent the dialog to the nearest QWidget ancestor of this backend, if any
+        if d.exec() == QDialog.DialogCode.Accepted:  # presumably Accepted means both files were fetched - confirm in DownloadResources
+            return model_path, config_path
+        return '', ''  # the caller treats an empty model path as a failed download
+
 
 def develop():  # {{{
     import tty
diff --git a/src/calibre/gui2/tts2/types.py b/src/calibre/gui2/tts2/types.py
index 181ac41b15..f12fa4da78 100644
--- a/src/calibre/gui2/tts2/types.py
+++ b/src/calibre/gui2/tts2/types.py
@@ -7,7 +7,7 @@ from enum import Enum, auto
 from functools import lru_cache
 from typing import Literal, NamedTuple
 
-from qt.core import QApplication, QLocale, QObject, QTextToSpeech, QVoice, pyqtSignal
+from qt.core import QApplication, QLocale, QObject, QTextToSpeech, QVoice, QWidget, pyqtSignal
 
 from calibre.constants import bundled_binaries_dir, islinux, ismacos, iswindows
 from calibre.utils.config import JSONConfig
@@ -207,6 +207,12 @@ def default_engine_name() -> str:
     return 'flite'
 
 
+def widget_parent(p: QObject) -> QWidget | None:  # nearest QWidget found by walking up from p (p itself included), or None
+    while p is not None and not isinstance(p, QWidget):  # stop at the first QWidget or when the parent chain runs out
+        p = p.parent()
+    return p
+
+
 class TTSBackend(QObject):
     saying = pyqtSignal(int, int)  # offset, length
     state_changed = pyqtSignal(QTextToSpeech.State)