More work on UI for TTS embed

This commit is contained in:
Kovid Goyal 2024-10-17 14:13:22 +05:30
parent 7cbfd2a0df
commit b4c2478948
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 156 additions and 15 deletions

View File

@ -7,6 +7,7 @@ import os
import sys
from collections import defaultdict
from contextlib import suppress
from functools import partial
from typing import NamedTuple
from lxml.etree import ElementBase as Element
@ -380,14 +381,15 @@ class ReportProgress:
def __init__(self):
# Name of the stage announced most recently; __call__ compares each incoming
# stage against it to detect when a new stage begins.
self.current_stage = ''
def __call__(self, stage: str, item: str, count: int, total: int) -> bool:
    """Print progress for a long running operation to stdout.

    The first call for a new ``stage`` prints the stage name on its own line
    and records it in ``self.current_stage``. Later calls for the same stage
    rewrite the current line with the completed percentage and ``item``.

    :param stage: Name of the stage being run.
    :param item: Description of the item just processed.
    :param count: Number of items processed so far.
    :param total: Total number of items in this stage; may be zero.
    :return: Always False. Callers such as embed_tts() abort when a progress
        reporter returns a true value, and this reporter never asks for that.
    """
    if stage != self.current_stage:
        self.current_stage = stage
        print()
        print(self.current_stage)
        return False
    # A stage can legitimately have nothing to do (total == 0); report 0%
    # rather than failing with ZeroDivisionError.
    frac = count / total if total else 0.0
    print(f'\r{frac:4.0%} {item}', end='')
    return False
def make_par(container, seq, html_href, audio_href, elem_id, pos, duration) -> None:
@ -436,12 +438,13 @@ def remove_embedded_tts(container):
container.remove_item(aname)
def embed_tts(container, report_progress=None, parent_widget=None):
def embed_tts(container, report_progress=None, callback_to_download_voices=None):
report_progress = report_progress or ReportProgress()
if container.book_type != 'epub':
raise UnsupportedContainerType(_('Only the EPUB format has support for embedding speech overlay audio'))
if container.opf_version_parsed[0] < 3:
report_progress(_('Updating book internals'), '', 0, 0)
if report_progress(_('Updating book internals'), '', 0, 0):
return False
upgrade_book(container, print)
remove_embedded_tts(container)
@ -455,7 +458,8 @@ def embed_tts(container, report_progress=None, parent_widget=None):
if container.mime_map.get(name) in OEB_DOCS:
name_map[name] = PerFileData(name)
stage = _('Processing HTML')
report_progress(stage, '', 0, len(name_map))
if report_progress(stage, '', 0, len(name_map)):
return False
all_voices = set()
total_num_sentences = 0
for i, (name, pfd) in enumerate(name_map.items()):
@ -467,10 +471,16 @@ def embed_tts(container, report_progress=None, parent_widget=None):
pfd.key_map[key].append(s)
all_voices.add(key)
container.dirty(name)
report_progress(stage, name, i+1, len(name_map))
piper.ensure_voices_downloaded(iter(all_voices), parent=parent_widget)
if report_progress(stage, name, i+1, len(name_map)):
return False
if callback_to_download_voices is None:
piper.ensure_voices_downloaded(iter(all_voices))
else:
if not callback_to_download_voices(partial(piper.ensure_voices_downloaded, iter(all_voices))):
return False
stage = _('Converting text to speech')
report_progress(stage, '', 0, total_num_sentences)
if report_progress(stage, '', 0, total_num_sentences):
return False
snum = 0
size_of_audio_data = 0
mmap = {container.href_to_name(item.get('href'), container.opf_name):item for item in container.manifest_items}
@ -484,7 +494,8 @@ def embed_tts(container, report_progress=None, parent_widget=None):
audio_map[s] = audio_data, duration
size_of_audio_data += len(audio_data)
snum += 1
report_progress(stage, _('Sentence number: {}').format(snum), snum, total_num_sentences)
if report_progress(stage, _('Sentence number: {}').format(snum), snum, total_num_sentences):
return False
wav = io.BytesIO()
wav.write(wav_header_for_pcm_data(size_of_audio_data, HIGH_QUALITY_SAMPLE_RATE))
afitem = container.generate_item(name + '.m4a', id_prefix='tts-')

View File

@ -628,10 +628,13 @@ class PiperEmbedded:
raw_data = resample_raw_audio_16bit(raw_data, self._current_audio_rate, sample_rate)
yield raw_data, duration_of_raw_audio_data(raw_data, sample_rate)
def ensure_voices_downloaded(self, specs: Iterable[tuple[str, str]], parent: QObject = None) -> bool:
    """Download every voice named in ``specs``, stopping at the first failure.

    :param specs: ``(language, voice name)`` pairs; each is resolved with
        ``self.resolve_voice()`` before being downloaded.
    :param parent: Optional Qt parent passed through to ``download_voice()``.
        When it is None the download is run with ``headless=True``.
    :return: True when ``download_voice()`` reported success for every voice,
        False as soon as one reports a false first result.
    """
    for lang, voice_name in specs:
        voice = self.resolve_voice(lang, voice_name)
        # download_voice() returns a pair; only the first element is used to
        # decide success (presumably the downloaded model - TODO confirm
        # against download_voice). Call it exactly once per voice.
        m, _unused = download_voice(voice, parent=parent, headless=parent is None)
        if not m:
            return False
    return True
def shutdown(self):
if self._process is not None:

View File

@ -1,14 +1,18 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import os
import sys
import traceback
from qt.core import QHBoxLayout, QStackedLayout, QTextBrowser, QVBoxLayout, QWidget
from qt.core import QDialogButtonBox, QHBoxLayout, QIcon, QStackedLayout, Qt, QTextBrowser, QVBoxLayout, QWidget, pyqtSignal
from calibre.gui2 import error_dialog
from calibre.gui2.tweak_book.widgets import Dialog
from calibre.gui2.widgets import BusyCursor
class ConfigWidget(QWidget):
class EngineSettingsWidget(QWidget):
def __init__(self, parent=None):
from calibre.gui2.tts.config import EmbeddingConfig
@ -18,20 +22,138 @@ class ConfigWidget(QWidget):
self.conf = c = EmbeddingConfig(self)
h.addWidget(c)
self.help = q = QTextBrowser(self)
h.addWidget(q)
h.addWidget(q, 10)
q.setHtml(_('''
<h2>Add Text-to-speech narration</h2>
<p>Add an audio overlay to this book using Text-to-speech technology. Then users reading this book in a reader that supports
audio overlays, such as the calibre viewer, will be able to hear the text read to them, if they wish.
<p>You can mark different passages to be spoken by different voices as shown in the example below:
<div><code>&lt;p data-calibre-tts="{0}"&gt;This will be voiced by "{0}"&lt;/p&gt;</code></div>
<div><code>&lt;p data-calibre-tts="{1}"&gt;This will be voiced by "{1}"&lt;/p&gt;</code></div>
<p style="font-size: small">Note that generating the Text-to-speech audio will be quite slow,
at the rate of approximately one sentence per couple of seconds, depending on your computer's hardware,
so consider leaving it running overnight.
''').format('cory', 'ryan'))
self.save_settings = c.save_settings
class Progress(QWidget):
    """Widget shown while narration is being generated.

    Instances are callable with the same signature as a progress reporter and
    answer with the current value of ``cancel_requested``.
    """

    # Flipped to True by TTSEmbed.reject() to ask the running job to stop.
    cancel_requested: bool = False
    # Stage name placeholder; not yet read or written by this class.
    current_stage: str = ''

    def __init__(self, parent: QWidget = None):
        super().__init__(parent)
        layout = QVBoxLayout(self)
        layout.setAlignment(Qt.AlignmentFlag.AlignCenter)
        layout.setContentsMargins(0, 0, 0, 0)
        self.v = layout

    def __call__(self, stage: str, item: str, count: int, total: int) -> bool:
        """Return True when cancellation has been requested; arguments are currently ignored."""
        return self.cancel_requested
class TTSEmbed(Dialog):
    """Dialog that configures and runs embedding of Text-to-speech narration.

    Page 0 of the stacked layout shows the engine settings, page 1 shows the
    progress widget. Accepting page 0 saves the settings and starts a worker
    thread running embed_tts(); the worker talks back to the dialog only via
    queued signals.
    """

    # (args tuple, kwargs dict) of a progress report made by the worker.
    report_progress = pyqtSignal(object, object)
    # Result of embed_tts(), or the exception it raised.
    worker_done = pyqtSignal(object)
    # (callback, queue): run callback in the dialog and put its result on queue.
    ensure_voices_downloaded_signal = pyqtSignal(object, object)

    def __init__(self, container, parent=None):
        # Plain Python attributes are assigned before super().__init__() so they
        # already exist while the base class constructor runs.
        self.container = container
        from threading import Thread
        self.worker_thread = Thread(target=self.worker, daemon=True)
        super().__init__(_('Add Text-to-speech narration'), 'tts-overlay-dialog', parent=parent)
        # Bound signals can only be used once the Qt base class is initialised,
        # so connect them after super().__init__(). Nothing emits them before
        # accept() starts the worker thread.
        self.worker_done.connect(self.on_worker_done, type=Qt.ConnectionType.QueuedConnection)
        self.ensure_voices_downloaded_signal.connect(self.do_ensure_voices_downloaded, type=Qt.ConnectionType.QueuedConnection)

    def setup_ui(self):
        """Build the settings page, the progress page and the button box."""
        self.v = v = QVBoxLayout(self)
        self.engine_settings_widget = e = EngineSettingsWidget(self)
        # Created without a parent: the dialog already owns the layout v, and
        # the stack is nested inside it via addLayout().
        self.stack = s = QStackedLayout()
        s.addWidget(e)
        s.setCurrentIndex(0)
        v.addLayout(s)

        self.progress = p = Progress(self)
        self.report_progress.connect(self.do_report_progress, type=Qt.ConnectionType.QueuedConnection)
        s.addWidget(p)

        self.remove_media_button = b = self.bb.addButton(_('&Remove existing audio'), QDialogButtonBox.ButtonRole.ActionRole)
        b.setToolTip(_('Remove any existing audio overlays, such as a previously created Text-to-speech narration from this book'))
        b.setIcon(QIcon.ic('trash.png'))
        b.clicked.connect(self.remove_media)
        v.addWidget(self.bb)
        self.update_button_box()
        self.stack.currentChanged.connect(self.update_button_box)

    def update_button_box(self):
        """Show OK/Cancel plus the remove button on page 0, only Cancel otherwise."""
        if self.stack.currentIndex() == 0:
            self.bb.setStandardButtons(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel)
            self.remove_media_button.setVisible(True)
        else:
            self.bb.setStandardButtons(QDialogButtonBox.StandardButton.Cancel)
            self.remove_media_button.setVisible(False)

    def remove_media(self):
        """Strip embedded narration from the container and close the dialog as accepted."""
        from calibre.ebooks.oeb.polish.tts import remove_embedded_tts
        remove_embedded_tts(self.container)
        super().accept()

    def accept(self):
        """On the settings page: save settings, switch to progress and start the worker."""
        # Only acts on page 0, so the worker thread is started at most once.
        if self.stack.currentIndex() == 0:
            self.engine_settings_widget.save_settings()
            self.stack.setCurrentIndex(1)
            self.worker_thread.start()

    def do_report_progress(self, a, kw):
        """Slot for report_progress: forward the report to the progress widget."""
        self.progress(*a, **kw)

    def worker(self):
        """Thread target: run embed_tts() and emit worker_done with its result or exception."""
        from calibre.ebooks.oeb.polish.tts import embed_tts

        def report_progress(*a, **kw):
            # Hand the report to the dialog via a queued signal and tell
            # embed_tts() whether the user has asked to cancel.
            self.report_progress.emit(a, kw)
            return self.progress.cancel_requested

        try:
            err = embed_tts(self.container, report_progress, self.ensure_voices_downloaded)
        except Exception as e:
            err = e
            # Captured here so on_worker_done() can show the traceback.
            err.det_msg = traceback.format_exc()
        self.worker_done.emit(err)

    def ensure_voices_downloaded(self, callback):
        """Run ``callback`` via the dialog's queued slot and wait for its result.

        Passed to embed_tts() as the voice download callback. Blocks on a queue
        until do_ensure_voices_downloaded() has produced a value.

        :return: Whatever ``callback`` returned.
        :raises Exception: Re-raises any exception ``callback`` raised.
        """
        from queue import Queue
        queue = Queue()
        self.ensure_voices_downloaded_signal.emit(callback, queue)
        e = queue.get()
        if isinstance(e, Exception):
            raise e
        return e

    def do_ensure_voices_downloaded(self, callback, queue):
        """Slot: call ``callback(self)`` and put its result, or its exception, on ``queue``."""
        try:
            queue.put(callback(self))
        except Exception as e:
            e.det_msg = traceback.format_exc()
            queue.put(e)

    def on_worker_done(self, err_or_ok):
        """Slot for worker_done: report a failure, then accept or reject the dialog."""
        if isinstance(err_or_ok, Exception):
            error_dialog(self, _('Text-to-speech narration failed'), str(err_or_ok), det_msg=getattr(err_or_ok, 'det_msg', ''), show=True)
            return super().reject()
        return super().accept() if err_or_ok else super().reject()

    def reject(self):
        """Close the dialog; on the progress page, first ask the worker to cancel."""
        if self.stack.currentIndex() == 0:
            return super().reject()
        with BusyCursor():
            # The worker sees this flag the next time it reports progress.
            self.progress.cancel_requested = True
            self.bb.setEnabled(False)
            return super().reject()
def develop():
@ -44,6 +166,11 @@ def develop():
d.exec()
del d
del app
b, e = os.path.splitext(path)
outpath = b + '-tts' + e
container.commit(outpath)
print('Output saved to:', outpath)
if __name__ == '__main__':