diff --git a/src/calibre/utils/windows/winspeech.cpp b/src/calibre/utils/windows/winspeech.cpp index 17297f71fa..cdfeef5d86 100644 --- a/src/calibre/utils/windows/winspeech.cpp +++ b/src/calibre/utils/windows/winspeech.cpp @@ -33,7 +33,7 @@ using namespace winrt::Windows::Media::SpeechSynthesis; using namespace winrt::Windows::Media::Playback; using namespace winrt::Windows::Media::Core; using namespace winrt::Windows::Storage::Streams; -typedef unsigned long long id_type; +typedef uint64_t id_type; #define debug(format_string, ...) { \ std::scoped_lock _sl_(output_lock); \ @@ -130,7 +130,7 @@ private: enum { DT_INT, DT_STRING, DT_LIST, DT_OBJECT, DT_NONE, DT_BOOL } type; std::string s; bool b; - long long i; + int64_t i; std::vector list; std::map object; public: @@ -140,7 +140,8 @@ public: json_val(winrt::hstring const& text) : type(DT_STRING), s(winrt::to_string(text)) {} json_val(std::wstring const& text) : type(DT_STRING), s(winrt::to_string(text)) {} json_val(std::string_view text) : type(DT_STRING), s(text) {} - json_val(long long num) : type(DT_INT), i(num) {} + json_val(int32_t num) : type(DT_INT), i(num) {} + json_val(int64_t num) : type(DT_INT), i(num) {} json_val(std::vector &&items) : type(DT_LIST), list(items) {} json_val(std::map &&m) : type(DT_OBJECT), object(m) {} json_val(std::initializer_list> const& vals) : type(DT_OBJECT), object(vals) { } @@ -191,6 +192,45 @@ public: } } + json_val(winrt::Windows::Foundation::TimeSpan const &t) : type(DT_INT) { + i = std::chrono::nanoseconds(t).count(); + } + + json_val(winrt::hstring const &label, SpeechCue const &cue) : type(DT_OBJECT) { +#define common_fields \ + {"start_time", json_val(cue.StartTime())}, \ + {"start_pos_in_text", json_val(cue.StartPositionInInput().Value())}, \ + {"end_pos_in_text", json_val(cue.EndPositionInInput().Value())}, + + if (label == L"SpeechBookmark") { + object = { + {"type", json_val("bookmark")}, + {"id", json_val(cue.Id())}, + common_fields + }; + + } else if (label == L"SpeechWord") { + object = { + {"type", json_val("word")}, + {"text", json_val(cue.Text())}, + common_fields + }; + } else if (label == L"SpeechSentence") { + object = { + {"type", json_val("sentence")}, + {"text", json_val(cue.Text())}, + common_fields + }; + } else { + object = { + {"type", json_val(label)}, + {"text", json_val(cue.Text())}, + common_fields + }; + } +#undef common_fields + } + void serialize(std::ostream &out) const { switch(type) { @@ -242,9 +282,9 @@ output(id_type cmd_id, std::string_view const &msg_type, json_val const &&msg) { } static void -output_error(id_type cmd_id, std::string_view const &msg, std::string_view const &error, long long line, HRESULT hr=S_OK) { +output_error(id_type cmd_id, std::string_view const &msg, std::string_view const &error, int64_t line, HRESULT hr=S_OK) { std::map m = {{"msg", json_val(msg)}, {"error", json_val(error)}, {"file", json_val("winspeech.cpp")}, {"line", json_val(line)}}; - if (hr != S_OK) m["hr"] = json_val((long long)hr); + if (hr != S_OK) m["hr"] = json_val((int64_t)hr); output(cmd_id, "error", std::move(m)); } @@ -647,21 +687,23 @@ class Synthesizer { Revokers revoker; std::recursive_mutex recursive_lock; - void register_metadata_handler_for_track(TimedMetadataTrack const& track, id_type cmd_id) { + void register_metadata_handler_for_track(uint32_t index, id_type cmd_id) { + TimedMetadataTrack track = current_item.TimedMetadataTracks().GetAt(index); std::scoped_lock sl(recursive_lock); if (current_cmd_id.load() != cmd_id) return; - track.CueEntered([cmd_id](auto, const auto&) { + revoker.cue_entered.push_back(track.CueEntered(winrt::auto_revoke, [cmd_id](auto track, const auto& args) { if (main_loop_is_running.load()) sx.output( - cmd_id, "cue", {{"state", "entered"}}); - }); - track.CueExited([cmd_id](auto, const auto&) { + cmd_id, "cue_entered", json_val(track.Label(), args.Cue().as())); + })); + revoker.cue_exited.push_back(track.CueExited(winrt::auto_revoke, [cmd_id](auto track, const auto& args) { if (main_loop_is_running.load()) sx.output( - cmd_id, "cue", {{"state", "exited"}}); - }); - track.TrackFailed([cmd_id](auto, const auto&) { + cmd_id, "cue_exited", json_val(track.Label(), args.Cue().as())); + })); + revoker.track_failed.push_back(track.TrackFailed(winrt::auto_revoke, [cmd_id](auto, const auto& args) { if (main_loop_is_running.load()) sx.output( cmd_id, "track_failed", {}); - }); + })); + current_item.TimedMetadataTracks().SetPresentationMode((unsigned int)index, TimedMetadataTrackPresentationMode::Hidden); } void load_stream_for_playback(SpeechSynthesisStream const &stream, id_type cmd_id) { @@ -709,11 +751,11 @@ class Synthesizer { std::scoped_lock sl(recursive_lock); if (!cmd_id_is_current(cmd_id)) return; if (index < 0) { - for (auto const &track : current_item.TimedMetadataTracks()) { - register_metadata_handler_for_track(track, cmd_id); + for (uint32_t i = 0; i < current_item.TimedMetadataTracks().Size(); i++) { + register_metadata_handler_for_track(i, cmd_id); } } else { - register_metadata_handler_for_track(current_item.TimedMetadataTracks().GetAt(index), cmd_id); + register_metadata_handler_for_track(index, cmd_id); } } @@ -782,7 +824,7 @@ handle_speak(id_type cmd_id, std::vector &parts) { sx.speak(cmd_id, address, is_ssml); } -static long long +static int64_t handle_stdin_message(winrt::hstring const &&msg) { if (msg == L"exit") { return 0; @@ -830,7 +872,7 @@ run_main_loop(PyObject*, PyObject*) { winrt::init_apartment(); // MTA (multi-threaded apartment) main_thread_id = GetCurrentThreadId(); MSG msg; - long long exit_code = 0; + int64_t exit_code = 0; bool ok = false; try { new (&sx) Synthesizer();