From 88e2331f634a93a2f1609c1054c4abcfeb126d6f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 27 Jan 2023 21:06:20 +0530 Subject: [PATCH] Hack to get mark reporting working Since Microsoft don't seem to have implemented support for SSML bookmarks, or at least I can't get it to work, use the word cue events. When one fires, report any surpassed or close-by mark. --- src/calibre/utils/windows/winspeech.cpp | 43 ++++++++++++++++--------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/src/calibre/utils/windows/winspeech.cpp b/src/calibre/utils/windows/winspeech.cpp index 3c5f6dcfd4..17c933f777 100644 --- a/src/calibre/utils/windows/winspeech.cpp +++ b/src/calibre/utils/windows/winspeech.cpp @@ -6,6 +6,7 @@ */ #include "common.h" +#include #include #include #include @@ -28,6 +29,9 @@ #include #include +#ifdef max +#undef max +#endif using namespace winrt::Windows::Foundation; using namespace winrt::Windows::Foundation::Collections; using namespace winrt::Windows::Media::SpeechSynthesis; @@ -744,6 +748,7 @@ class Synthesizer { MediaPlaybackItem current_item{nullptr}; std::vector current_text_storage; Marks current_marks; + int32_t last_reported_mark_index; std::atomic current_cmd_id; Revokers revoker; @@ -752,19 +757,6 @@ class Synthesizer { void register_metadata_handler_for_track(uint32_t index, id_type cmd_id); void load_stream_for_playback(SpeechSynthesisStream const &stream, id_type cmd_id, bool is_cued); - void add_cues() { - TimedMetadataTrack track(L"mark", L"en-us", TimedMetadataKind::Speech); - track.Label(L"mark"); - for (const Mark &mark : current_marks) { - SpeechCue cue; - cue.StartPositionInInput(IReference{(int)mark.pos_in_text}); - cue.EndPositionInInput(IReference{(int)mark.pos_in_text + 1}); - cue.Text(winrt::to_hstring(mark.id)); - track.AddCue(cue); - } - current_source.ExternalTimedMetadataTracks().Append(track); - } - public: void register_metadata_handler_for_speech(id_type cmd_id, long index) { std::scoped_lock sl(recursive_lock); 
@@ -785,6 +777,26 @@ class Synthesizer { if (cmd_id_is_current(cmd_id)) ::output(cmd_id, type, std::move(x)); } + void on_cue_entered(id_type cmd_id, const winrt::hstring &label, const SpeechCue &cue) { + std::scoped_lock sl(recursive_lock); + if (!cmd_id_is_current(cmd_id)) return; + output(cmd_id, "cue_entered", json_val(label, cue)); + if (label != L"SpeechWord") return; + int32_t pos = cue.StartPositionInInput().Value(); + for (int32_t i = std::max(0, last_reported_mark_index); i < (int32_t)current_marks.size(); i++) { + int32_t idx = -1; + if (current_marks[i].pos_in_text > pos) { + idx = i-1; + if (idx == last_reported_mark_index && current_marks[i].pos_in_text - pos < 3) idx = i; + } else if (current_marks[i].pos_in_text == pos) idx = i; + if (idx > -1) { + output(cmd_id, "mark_reached", {{"id", current_marks[idx].id}}); + last_reported_mark_index = idx; + break; + } + } + } + void initialize() { synth = SpeechSynthesizer(); player = MediaPlayer(); @@ -803,6 +815,7 @@ class Synthesizer { player.Pause(); current_text_storage = std::vector(); current_marks = Marks(); + last_reported_mark_index = -1; } } @@ -848,8 +861,7 @@ Synthesizer::register_metadata_handler_for_track(uint32_t index, id_type cmd_id) std::scoped_lock sl(recursive_lock); if (current_cmd_id.load() != cmd_id) return; revoker.cue_entered.push_back(track.CueEntered(winrt::auto_revoke, [cmd_id](auto track, const auto& args) { - if (main_loop_is_running.load()) sx.output( - cmd_id, "cue_entered", json_val(track.Label(), args.Cue().template as())); + if (main_loop_is_running.load()) sx.on_cue_entered(cmd_id, track.Label(), args.Cue().template as()); })); revoker.cue_exited.push_back(track.CueExited(winrt::auto_revoke, [cmd_id](auto track, const auto& args) { if (main_loop_is_running.load()) sx.output( @@ -868,7 +880,6 @@ Synthesizer::load_stream_for_playback(SpeechSynthesisStream const &stream, id_ty if (cmd_id != current_cmd_id.load()) return; current_stream = stream; current_source = 
MediaSource::CreateFromStream(current_stream, current_stream.ContentType()); - if (is_cued) add_cues(); revoker.playback_state_changed = player.PlaybackSession().PlaybackStateChanged( winrt::auto_revoke, [cmd_id](auto session, auto const&) {