Hack to get mark reporting working

Since Microsoft doesn't seem to have implemented support for SSML bookmarks, or at least I can't get it to work, use the word cue events instead. When one fires, report any surpassed or close-by mark.
2025-07-08 02:34:06 -04:00 · 2023-01-27 21:06:20 +05:30 · 2023-01-27 21:06:20 +05:30 · 88e2331f63
commit 88e2331f63
parent d2983fef22
1 changed files with 27 additions and 16 deletions
--- a/src/calibre/utils/windows/winspeech.cpp
+++ b/src/calibre/utils/windows/winspeech.cpp
@ -6,6 +6,7 @@
 */
 #include "common.h"

+#include <algorithm>
 #include <atomic>
 #include <array>
 #include <vector>
@ -28,6 +29,9 @@
 #include <winrt/windows.media.core.h>
 #include <winrt/windows.media.playback.h>

+#ifdef max
+#undef max
+#endif
 using namespace winrt::Windows::Foundation;
 using namespace winrt::Windows::Foundation::Collections;
 using namespace winrt::Windows::Media::SpeechSynthesis;
@ -744,6 +748,7 @@ class Synthesizer {
    MediaPlaybackItem current_item{nullptr};
    std::vector<wchar_t> current_text_storage;
    Marks current_marks;
+    int32_t last_reported_mark_index;
    std::atomic<id_type> current_cmd_id;

    Revokers revoker;
@ -752,19 +757,6 @@ class Synthesizer {
    void register_metadata_handler_for_track(uint32_t index, id_type cmd_id);
    void load_stream_for_playback(SpeechSynthesisStream const &stream, id_type cmd_id, bool is_cued);

-    void add_cues() {
-        TimedMetadataTrack track(L"mark", L"en-us", TimedMetadataKind::Speech);
-        track.Label(L"mark");
-        for (const Mark &mark : current_marks) {
-            SpeechCue cue;
-            cue.StartPositionInInput(IReference<int>{(int)mark.pos_in_text});
-            cue.EndPositionInInput(IReference<int>{(int)mark.pos_in_text + 1});
-            cue.Text(winrt::to_hstring(mark.id));
-            track.AddCue(cue);
-        }
-        current_source.ExternalTimedMetadataTracks().Append(track);
-    }
-
    public:
    void register_metadata_handler_for_speech(id_type cmd_id, long index) {
        std::scoped_lock sl(recursive_lock);
@ -785,6 +777,26 @@ class Synthesizer {
        if (cmd_id_is_current(cmd_id)) ::output(cmd_id, type, std::move(x));
    }

+    // Handle a CueEntered event raised by a timed metadata track.
+    //
+    // The raw cue is always forwarded as a "cue_entered" message. For word
+    // cues (track label "SpeechWord") the start position of the word in the
+    // input text is additionally matched against current_marks, and a
+    // "mark_reached" message is emitted for the newest mark that has been
+    // reached. This is a workaround for SSML bookmark cues not being usable.
+    //
+    // Each mark is reported at most once, and never before the spoken word
+    // has reached it (or is fewer than 3 characters short of it).
+    //
+    // NOTE(review): assumes current_marks is sorted by ascending pos_in_text;
+    // confirm against the code that fills it.
+    //
+    // cmd_id: id of the command that started playback; ignored if not current.
+    // label:  label of the track that raised the cue.
+    // cue:    the cue that was entered.
+    void on_cue_entered(id_type cmd_id, const winrt::hstring &label, const SpeechCue &cue) {
+        std::scoped_lock sl(recursive_lock);
+        if (!cmd_id_is_current(cmd_id)) return;
+        output(cmd_id, "cue_entered", json_val(label, cue));
+        if (label != L"SpeechWord") return;
+        // Positions are compared as int64_t so that mixing the signed cue
+        // position with a possibly unsigned pos_in_text cannot wrap around.
+        const int64_t pos = cue.StartPositionInInput().Value();
+        const int32_t num_marks = static_cast<int32_t>(current_marks.size());
+        // Clamp so that a bogus negative index can never be used for lookup.
+        const int32_t previous = std::max(-1, last_reported_mark_index);
+        int32_t idx = previous;
+        // Advance over every unreported mark located at or before this word,
+        // so that only the most recent surpassed mark is reported.
+        while (idx + 1 < num_marks && static_cast<int64_t>(current_marks[idx + 1].pos_in_text) <= pos) idx++;
+        // Nothing was surpassed: still accept the next mark if it lies fewer
+        // than 3 characters ahead, as word positions do not always line up
+        // exactly with mark positions.
+        if (idx == previous && idx + 1 < num_marks &&
+                static_cast<int64_t>(current_marks[idx + 1].pos_in_text) - pos < 3) idx++;
+        if (idx > previous) {
+            output(cmd_id, "mark_reached", {{"id", current_marks[idx].id}});
+            last_reported_mark_index = idx;
+        }
+    }
+
    void initialize() {
        synth = SpeechSynthesizer();
        player = MediaPlayer();
@ -803,6 +815,7 @@ class Synthesizer {
            player.Pause();
            current_text_storage = std::vector<wchar_t>();
            current_marks = Marks();
+            last_reported_mark_index = -1;
        }
    }

@ -848,8 +861,7 @@ Synthesizer::register_metadata_handler_for_track(uint32_t index, id_type cmd_id)
    std::scoped_lock sl(recursive_lock);
    if (current_cmd_id.load() != cmd_id) return;
    revoker.cue_entered.push_back(track.CueEntered(winrt::auto_revoke, [cmd_id](auto track, const auto& args) {
-        if (main_loop_is_running.load()) sx.output(
-            cmd_id, "cue_entered", json_val(track.Label(), args.Cue().template as<SpeechCue>()));
+        if (main_loop_is_running.load()) sx.on_cue_entered(cmd_id, track.Label(), args.Cue().template as<SpeechCue>());
    }));
    revoker.cue_exited.push_back(track.CueExited(winrt::auto_revoke, [cmd_id](auto track, const auto& args) {
        if (main_loop_is_running.load()) sx.output(
@ -868,7 +880,6 @@ Synthesizer::load_stream_for_playback(SpeechSynthesisStream const &stream, id_ty
    if (cmd_id != current_cmd_id.load()) return;
    current_stream = stream;
    current_source = MediaSource::CreateFromStream(current_stream, current_stream.ContentType());
-    if (is_cued) add_cues();

    revoker.playback_state_changed = player.PlaybackSession().PlaybackStateChanged(
            winrt::auto_revoke, [cmd_id](auto session, auto const&) {