From f0ee3506d785945f4e130ae119a2bb2863312ef7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 27 Jan 2023 21:18:19 +0530 Subject: [PATCH] Keep all the speak related functions together in a block --- src/calibre/utils/windows/winspeech.cpp | 135 ++++++++++++------------ 1 file changed, 70 insertions(+), 65 deletions(-) diff --git a/src/calibre/utils/windows/winspeech.cpp b/src/calibre/utils/windows/winspeech.cpp index 17c933f777..9f550a7122 100644 --- a/src/calibre/utils/windows/winspeech.cpp +++ b/src/calibre/utils/windows/winspeech.cpp @@ -754,48 +754,15 @@ class Synthesizer { Revokers revoker; std::recursive_mutex recursive_lock; + public: + // Speak {{{ void register_metadata_handler_for_track(uint32_t index, id_type cmd_id); void load_stream_for_playback(SpeechSynthesisStream const &stream, id_type cmd_id, bool is_cued); - - public: - void register_metadata_handler_for_speech(id_type cmd_id, long index) { - std::scoped_lock sl(recursive_lock); - if (!cmd_id_is_current(cmd_id)) return; - if (index < 0) { - for (uint32_t i = 0; i < current_item.TimedMetadataTracks().Size(); i++) { - register_metadata_handler_for_track(i, cmd_id); - } - } else { - register_metadata_handler_for_track(index, cmd_id); - } - } - + winrt::fire_and_forget speak(id_type cmd_id, std::wstring_view const &text, bool is_ssml, bool is_cued, std::vector &&buf, Marks const && marks); + void register_metadata_handler_for_speech(id_type cmd_id, long index); bool cmd_id_is_current(id_type cmd_id) const noexcept { return current_cmd_id.load() == cmd_id; } - - void output(id_type cmd_id, std::string_view const& type, json_val const && x) { - std::scoped_lock sl(recursive_lock); - if (cmd_id_is_current(cmd_id)) ::output(cmd_id, type, std::move(x)); - } - - void on_cue_entered(id_type cmd_id, const winrt::hstring &label, const SpeechCue &cue) { - std::scoped_lock sl(recursive_lock); - if (!cmd_id_is_current(cmd_id)) return; - output(cmd_id, "cue_entered", json_val(label, cue)); - if (label != L"SpeechWord") return; - int32_t pos = cue.StartPositionInInput().Value(); - for (int32_t i = std::max(0, last_reported_mark_index); i < (int32_t)current_marks.size(); i++) { - int32_t idx = -1; - if (current_marks[i].pos_in_text > pos) { - idx = i-1; - if (idx == last_reported_mark_index && current_marks[i].pos_in_text - pos < 3) idx = i; - } else if (current_marks[i].pos_in_text == pos) idx = i; - if (idx > -1) { - output(cmd_id, "mark_reached", {{"id", current_marks[idx].id}}); - last_reported_mark_index = idx; - break; - } - } - } + void on_cue_entered(id_type cmd_id, const winrt::hstring &label, const SpeechCue &cue); + // }}} void initialize() { synth = SpeechSynthesizer(); @@ -804,6 +771,11 @@ class Synthesizer { player.AutoPlay(true); } + void output(id_type cmd_id, std::string_view const& type, json_val const && x) { + std::scoped_lock sl(recursive_lock); + if (cmd_id_is_current(cmd_id)) ::output(cmd_id, type, std::move(x)); + } + void stop_current_activity() { std::scoped_lock sl(recursive_lock); if (current_cmd_id.load()) { @@ -819,32 +791,6 @@ class Synthesizer { } } - winrt::fire_and_forget speak(id_type cmd_id, std::wstring_view const &text, bool is_ssml, bool is_cued, std::vector &&buf, Marks const && marks) { - SpeechSynthesisStream stream{nullptr}; - { std::scoped_lock sl(recursive_lock); - stop_current_activity(); - current_cmd_id.store(cmd_id); - current_text_storage = std::move(buf); - current_marks = std::move(marks); - synth.Options().IncludeSentenceBoundaryMetadata(true); - synth.Options().IncludeWordBoundaryMetadata(true); - } - output(cmd_id, "synthesizing", {{"ssml", is_ssml}, {"num_marks", current_marks.size()}, {"text_length", text.size()}}); - bool ok = false; - try { - if (is_ssml) stream = co_await synth.SynthesizeSsmlToStreamAsync(text); - else stream = co_await synth.SynthesizeTextToStreamAsync(text); - ok = true; - } CATCH_ALL_EXCEPTIONS("Failed to synthesize speech", cmd_id); - if (ok) { - if (main_loop_is_running.load()) { - try { - load_stream_for_playback(stream, cmd_id, is_cued); - } CATCH_ALL_EXCEPTIONS("Failed to load synthesized stream for playback", cmd_id); - } - } - } - double volume() const { return player.Volume(); } void volume(double val) { if (val < 0 || val > 1) throw std::out_of_range("Invalid volume value must be between 0 and 1"); @@ -855,6 +801,39 @@ class Synthesizer { static Synthesizer sx; +// Speak {{{ +void Synthesizer::on_cue_entered(id_type cmd_id, const winrt::hstring &label, const SpeechCue &cue) { + std::scoped_lock sl(recursive_lock); + if (!cmd_id_is_current(cmd_id)) return; + output(cmd_id, "cue_entered", json_val(label, cue)); + if (label != L"SpeechWord") return; + uint32_t pos = cue.StartPositionInInput().Value(); + for (int32_t i = std::max(0, last_reported_mark_index); i < (int32_t)current_marks.size(); i++) { + int32_t idx = -1; + if (current_marks[i].pos_in_text > pos) { + idx = i-1; + if (idx == last_reported_mark_index && current_marks[i].pos_in_text - pos < 3) idx = i; + } else if (current_marks[i].pos_in_text == pos) idx = i; + if (idx > -1) { + output(cmd_id, "mark_reached", {{"id", current_marks[idx].id}}); + last_reported_mark_index = idx; + break; + } + } +} + +void Synthesizer::register_metadata_handler_for_speech(id_type cmd_id, long index) { + std::scoped_lock sl(recursive_lock); + if (!cmd_id_is_current(cmd_id)) return; + if (index < 0) { + for (uint32_t i = 0; i < current_item.TimedMetadataTracks().Size(); i++) { + register_metadata_handler_for_track(i, cmd_id); + } + } else { + register_metadata_handler_for_track(index, cmd_id); + } +} + void Synthesizer::register_metadata_handler_for_track(uint32_t index, id_type cmd_id) { TimedMetadataTrack track = current_item.TimedMetadataTracks().GetAt(index); @@ -916,6 +895,31 @@ Synthesizer::load_stream_for_playback(SpeechSynthesisStream const &stream, id_ty player.Source(current_item); } +winrt::fire_and_forget Synthesizer::speak(id_type cmd_id, std::wstring_view const &text, bool is_ssml, bool is_cued, std::vector &&buf, Marks const && marks) { + SpeechSynthesisStream stream{nullptr}; + { std::scoped_lock sl(recursive_lock); + stop_current_activity(); + current_cmd_id.store(cmd_id); + current_text_storage = std::move(buf); + current_marks = std::move(marks); + synth.Options().IncludeSentenceBoundaryMetadata(true); + synth.Options().IncludeWordBoundaryMetadata(true); + } + output(cmd_id, "synthesizing", {{"ssml", is_ssml}, {"num_marks", current_marks.size()}, {"text_length", text.size()}}); + bool ok = false; + try { + if (is_ssml) stream = co_await synth.SynthesizeSsmlToStreamAsync(text); + else stream = co_await synth.SynthesizeTextToStreamAsync(text); + ok = true; + } CATCH_ALL_EXCEPTIONS("Failed to synthesize speech", cmd_id); + if (ok) { + if (main_loop_is_running.load()) { + try { + load_stream_for_playback(stream, cmd_id, is_cued); + } CATCH_ALL_EXCEPTIONS("Failed to load synthesized stream for playback", cmd_id); + } + } +} static size_t decode_into(std::string_view src, std::wstring_view dest) { @@ -1008,6 +1012,7 @@ handle_speak(id_type cmd_id, std::vector &parts) { *((wchar_t*)text.data() + text.size()) = 0; // ensure NULL termination sx.speak(cmd_id, text, is_ssml, is_cued, std::move(buf), std::move(marks)); } +// }}} static int64_t handle_stdin_message(winrt::hstring const &&msg) {