Saving to WAV kinda works, though the audio is very distorted

Kovid Goyal 2023-01-28 12:32:40 +05:30
parent 467af44edb
commit 6e9c4e0bb9
2 changed files with 171 additions and 476 deletions


@@ -8,7 +8,10 @@
#include <algorithm>
#include <atomic>
#include <filesystem>
#include <array>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <deque>
@@ -44,15 +47,26 @@ typedef uint64_t id_type;
static std::mutex output_lock;
static DWORD main_thread_id;
template<typename T, typename... Args> static void
__debug_multiple(T x, Args... args) {
std::cerr << x << " ";
__debug_multiple(args...);
template<typename T> static void
__debug_multiple_impl(T x) {
if constexpr (std::is_same_v<T, wchar_t*> || std::is_same_v<T, std::wstring> || std::is_same_v<T, winrt::hstring> || std::is_same_v<T, std::wstring_view>) {
std::cerr << winrt::to_string(x);
} else {
std::cerr << x;
}
}
template<typename T> static void
__debug_multiple(T x) {
std::cerr << x << std::endl;
__debug_multiple_impl(x);
std::cerr << std::endl;
}
template<typename T, typename... Args> static void
__debug_multiple(T x, Args... args) {
__debug_multiple_impl(x);
std::cerr << " ";
__debug_multiple(args...);
}
template<typename... Args> static void
@@ -71,14 +85,6 @@ enum {
EXIT_REQUESTED
};
// trim from end (in place)
static inline void
rtrim(std::string &s) {
s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
return !std::isspace(ch);
}).base(), s.end());
}
static std::vector<std::wstring_view>
split(std::wstring_view const &src, std::wstring const &delim = L" ") {
size_t pos;
@@ -350,6 +356,8 @@ output_error(id_type cmd_id, std::string_view const &msg, std::string_view const
#define CATCH_ALL_EXCEPTIONS(msg, cmd_id) \
catch(winrt::hresult_error const& ex) { \
output_error(cmd_id, msg, winrt::to_string(ex.message()), __LINE__, ex.to_abi()); \
} catch(const std::system_error& ex) { \
output_error(cmd_id, msg, "system_error with code: " + std::to_string(ex.code().value()) + " and meaning: " + ex.what(), __LINE__); \
} catch (std::exception const &ex) { \
output_error(cmd_id, msg, ex.what(), __LINE__); \
} catch (std::string const &ex) { \
@@ -360,369 +368,6 @@ output_error(id_type cmd_id, std::string_view const &msg, std::string_view const
output_error(cmd_id, msg, "Unknown exception type was raised", __LINE__); \
}
/* Legacy code {{{
template<typename T>
class WeakRefs {
private:
std::mutex weak_ref_lock;
std::unordered_map<id_type, T*> refs;
id_type counter;
public:
id_type register_ref(T *self) {
std::scoped_lock lock(weak_ref_lock);
auto id = ++counter;
refs[id] = self;
return id;
}
void unregister_ref(T* self) {
std::scoped_lock lock(weak_ref_lock);
auto id = self->clear_id();
refs.erase(id);
self->~T();
}
void use_ref(id_type id, std::function<void(T*)> callback) {
std::scoped_lock lock(weak_ref_lock);
try {
callback(refs.at(id));
} catch (std::out_of_range) {
callback(NULL);
}
}
};
enum class EventType {
playback_state_changed = 1, media_opened, media_failed, media_ended, source_changed, cue_entered, cue_exited, track_failed
};
class Event {
private:
EventType type;
public:
Event(EventType type) : type(type) {}
Event(const Event &source) : type(source.type) {}
};
class SynthesizerImplementation {
private:
id_type id;
DWORD creation_thread_id;
SpeechSynthesizer synth{nullptr};
MediaPlayer player{nullptr};
MediaSource current_source{nullptr};
SpeechSynthesisStream current_stream{nullptr};
MediaPlaybackItem currently_playing{nullptr};
struct {
MediaPlaybackSession::PlaybackStateChanged_revoker playback_state_changed;
MediaPlayer::MediaEnded_revoker media_ended; MediaPlayer::MediaOpened_revoker media_opened;
MediaPlayer::MediaFailed_revoker media_failed; MediaPlayer::SourceChanged_revoker source_changed;
MediaPlaybackItem::TimedMetadataTracksChanged_revoker timed_metadata_tracks_changed;
std::vector<TimedMetadataTrack::CueEntered_revoker> cue_entered;
std::vector<TimedMetadataTrack::CueExited_revoker> cue_exited;
std::vector<TimedMetadataTrack::TrackFailed_revoker> track_failed;
} revoker;
std::vector<Event> events;
std::mutex events_lock;
public:
SynthesizerImplementation();
void add_simple_event(EventType type) {
try {
std::scoped_lock lock(events_lock);
events.emplace_back(type);
} catch(...) {}
}
SpeechSynthesisStream synthesize(const std::wstring_view &text, bool is_ssml = false) {
if (is_ssml) return synth.SynthesizeSsmlToStreamAsync(text).get();
return synth.SynthesizeTextToStreamAsync(text).get();
}
void speak(const std::wstring_view &text, bool is_ssml = false) {
revoker.cue_entered.clear();
revoker.cue_exited.clear();
revoker.track_failed.clear();
current_stream = synthesize(text, is_ssml);
current_source = MediaSource::CreateFromStream(current_stream, current_stream.ContentType());
currently_playing = MediaPlaybackItem(current_source);
auto self_id = id;
revoker.timed_metadata_tracks_changed = currently_playing.TimedMetadataTracksChanged(winrt::auto_revoke, [self_id](auto, auto const &args) {
auto change_type = args.CollectionChange();
auto index = args.Index();
synthesizer_weakrefs.use_ref(self_id, [change_type, index](auto s) {
if (!s) return;
switch (change_type) {
case CollectionChange::ItemInserted: {
s->register_metadata_handler_for_speech(s->currently_playing.TimedMetadataTracks().GetAt(index));
} break;
case CollectionChange::Reset:
for (auto const& track : s->currently_playing.TimedMetadataTracks()) {
s->register_metadata_handler_for_speech(track);
}
break;
}});
});
player.Source(currently_playing);
for (auto const &track : currently_playing.TimedMetadataTracks()) {
register_metadata_handler_for_speech(track);
}
}
bool is_creation_thread() const noexcept {
return creation_thread_id == GetCurrentThreadId();
}
id_type clear_id() noexcept {
auto ans = id;
id = 0;
return ans;
}
void register_metadata_handler_for_speech(TimedMetadataTrack const& track) {
fprintf(stderr, "99999999999 registering metadata handler\n");
auto self_id = id;
#define simple_event_listener(method, event_type) \
revoker.event_type.push_back(method(winrt::auto_revoke, [self_id](auto, const auto&) { \
fprintf(stderr, "111111111 %s %u\n", #event_type, GetCurrentThreadId()); fflush(stderr); \
synthesizer_weakrefs.use_ref(self_id, [](auto s) { \
if (!s) return; \
s->add_simple_event(EventType::event_type); \
fprintf(stderr, "2222222222 %d\n", s->player.PlaybackSession().PlaybackState()); \
}); \
}));
simple_event_listener(track.CueEntered, cue_entered);
simple_event_listener(track.CueExited, cue_exited);
simple_event_listener(track.TrackFailed, track_failed);
#undef simple_event_listener
track.CueEntered([](auto, const auto&) {
fprintf(stderr, "cue entered\n"); fflush(stderr);
});
}
};
struct Synthesizer {
PyObject_HEAD
SynthesizerImplementation impl;
};
static PyTypeObject SynthesizerType = {
PyVarObject_HEAD_INIT(NULL, 0)
};
static WeakRefs<SynthesizerImplementation> synthesizer_weakrefs;
SynthesizerImplementation::SynthesizerImplementation() {
events.reserve(128);
synth = SpeechSynthesizer();
synth.Options().IncludeSentenceBoundaryMetadata(true);
synth.Options().IncludeWordBoundaryMetadata(true);
player = MediaPlayer();
player.AudioCategory(MediaPlayerAudioCategory::Speech);
player.AutoPlay(true);
creation_thread_id = GetCurrentThreadId();
id = synthesizer_weakrefs.register_ref(this);
auto self_id = id;
#define simple_event_listener(method, event_type) \
revoker.event_type = method(winrt::auto_revoke, [self_id](auto, const auto&) { \
fprintf(stderr, "111111111 %s %u\n", #event_type, GetCurrentThreadId()); fflush(stderr); \
synthesizer_weakrefs.use_ref(self_id, [](auto s) { \
if (!s) return; \
s->add_simple_event(EventType::event_type); \
fprintf(stderr, "2222222222 %d\n", s->player.PlaybackSession().PlaybackState()); \
}); \
});
simple_event_listener(player.PlaybackSession().PlaybackStateChanged, playback_state_changed);
simple_event_listener(player.MediaOpened, media_opened);
simple_event_listener(player.MediaFailed, media_failed);
simple_event_listener(player.MediaEnded, media_ended);
simple_event_listener(player.SourceChanged, source_changed);
#undef simple_event_listener
player.PlaybackSession().PlaybackStateChanged([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "playback state changed", GetCurrentThreadId()); fflush(stderr); \
});
player.MediaOpened([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "media opened", GetCurrentThreadId()); fflush(stderr); \
});
player.MediaFailed([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "media failed", GetCurrentThreadId()); fflush(stderr); \
});
player.MediaEnded([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "media ended", GetCurrentThreadId()); fflush(stderr); \
});
}
static PyObject*
Synthesizer_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { INITIALIZE_COM_IN_FUNCTION
Synthesizer *self = (Synthesizer *) type->tp_alloc(type, 0);
if (self) {
auto i = &self->impl;
try {
new (i) SynthesizerImplementation();
} CATCH_ALL_EXCEPTIONS("Failed to create SynthesizerImplementation object");
if (PyErr_Occurred()) { Py_CLEAR(self); }
}
if (self) com.detach();
return (PyObject*)self;
}
static void
Synthesizer_dealloc(Synthesizer *self) {
auto *i = &self->impl;
try {
synthesizer_weakrefs.unregister_ref(i);
} CATCH_ALL_EXCEPTIONS("Failed to destruct SynthesizerImplementation");
if (PyErr_Occurred()) { PyErr_Print(); }
Py_TYPE(self)->tp_free((PyObject*)self);
CoUninitialize();
}
static void
ensure_current_thread_has_message_queue(void) {
MSG msg;
PeekMessage(&msg, NULL, WM_USER, WM_USER, PM_NOREMOVE);
}
#define PREPARE_METHOD_CALL ensure_current_thread_has_message_queue(); if (!self->impl.is_creation_thread()) { PyErr_SetString(PyExc_RuntimeError, "Cannot use a Synthesizer object from a thread other than the thread it was created in"); return NULL; }
static PyObject*
Synthesizer_speak(Synthesizer *self, PyObject *args) {
PREPARE_METHOD_CALL;
wchar_raii pytext;
int is_ssml = 0;
if (!PyArg_ParseTuple(args, "O&|p", py_to_wchar_no_none, &pytext, &is_ssml)) return NULL;
try {
self->impl.speak(pytext.as_view(), (bool)is_ssml);
} CATCH_ALL_EXCEPTIONS("Failed to start speaking text");
if (PyErr_Occurred()) return NULL;
Py_RETURN_NONE;
}
static PyObject*
Synthesizer_create_recording(Synthesizer *self, PyObject *args) {
PREPARE_METHOD_CALL;
wchar_raii pytext;
PyObject *callback;
int is_ssml = 0;
if (!PyArg_ParseTuple(args, "O&O|p", py_to_wchar_no_none, &pytext, &callback, &is_ssml)) return NULL;
if (!PyCallable_Check(callback)) { PyErr_SetString(PyExc_TypeError, "callback must be callable"); return NULL; }
SpeechSynthesisStream stream{nullptr};
try {
stream = self->impl.synthesize(pytext.as_view(), (bool)is_ssml);
} CATCH_ALL_EXCEPTIONS( "Failed to get SpeechSynthesisStream from text");
if (PyErr_Occurred()) return NULL;
unsigned long long stream_size = stream.Size(), bytes_read = 0;
DataReader reader(stream);
unsigned int n;
const static unsigned int chunk_size = 16 * 1024;
while (bytes_read < stream_size) {
try {
n = reader.LoadAsync(chunk_size).get();
} CATCH_ALL_EXCEPTIONS("Failed to load data from DataReader");
if (PyErr_Occurred()) return NULL;
if (n > 0) {
bytes_read += n;
pyobject_raii b(PyBytes_FromStringAndSize(NULL, n));
if (!b) return NULL;
unsigned char *p = reinterpret_cast<unsigned char*>(PyBytes_AS_STRING(b.ptr()));
reader.ReadBytes(winrt::array_view(p, p + n));
pyobject_raii ret(PyObject_CallFunctionObjArgs(callback, b.ptr(), NULL));
}
}
if (PyErr_Occurred()) return NULL;
Py_RETURN_NONE;
}
static PyObject*
voice_as_dict(VoiceInformation const& voice) {
try {
const char *gender = "";
switch (voice.Gender()) {
case VoiceGender::Male: gender = "male"; break;
case VoiceGender::Female: gender = "female"; break;
}
return Py_BuildValue("{su su su su ss}",
"display_name", voice.DisplayName().c_str(),
"description", voice.Description().c_str(),
"id", voice.Id().c_str(),
"language", voice.Language().c_str(),
"gender", gender
);
} CATCH_ALL_EXCEPTIONS("Could not convert Voice to dict");
return NULL;
}
static PyObject*
all_voices(PyObject*, PyObject*) { INITIALIZE_COM_IN_FUNCTION
try {
auto voices = SpeechSynthesizer::AllVoices();
pyobject_raii ans(PyTuple_New(voices.Size()));
if (!ans) return NULL;
Py_ssize_t i = 0;
for(auto const& voice : voices) {
PyObject *v = voice_as_dict(voice);
if (v) {
PyTuple_SET_ITEM(ans.ptr(), i++, v);
} else {
return NULL;
}
}
return ans.detach();
} CATCH_ALL_EXCEPTIONS("Could not get all voices");
return NULL;
}
static PyObject*
default_voice(PyObject*, PyObject*) { INITIALIZE_COM_IN_FUNCTION
try {
return voice_as_dict(SpeechSynthesizer::DefaultVoice());
} CATCH_ALL_EXCEPTIONS("Could not get default voice");
return NULL;
}
#define M(name, args) { #name, (PyCFunction)Synthesizer_##name, args, ""}
static PyMethodDef Synthesizer_methods[] = {
M(create_recording, METH_VARARGS),
M(speak, METH_VARARGS),
{NULL, NULL, 0, NULL}
};
#undef M
static PyObject*
pump_waiting_messages(PyObject*, PyObject*) {
UINT firstMsg = 0, lastMsg = 0;
MSG msg;
bool found = false;
// Read all of the messages in this next loop,
// removing each message as we read it.
while (PeekMessage(&msg, NULL, firstMsg, lastMsg, PM_REMOVE)) {
// If it's a quit message, we're out of here.
if (msg.message == WM_QUIT) {
Py_RETURN_NONE;
}
found = true;
// Otherwise, dispatch the message.
DispatchMessage(&msg);
} // End of PeekMessage while loop
if (found) Py_RETURN_TRUE;
Py_RETURN_FALSE;
}
}}} */
struct Revokers {
MediaPlaybackSession::PlaybackStateChanged_revoker playback_state_changed;
MediaPlayer::MediaEnded_revoker media_ended; MediaPlayer::MediaOpened_revoker media_opened;
@@ -767,12 +412,10 @@ class Synthesizer {
// }}}
winrt::fire_and_forget save(id_type cmd_id, std::wstring_view const &text, bool is_ssml, std::vector<wchar_t> &&buf, std::ofstream &&outfile);
void load_stream_for_save(SpeechSynthesisStream const &&stream, id_type cmd_id);
void start_save_stream(SpeechSynthesisStream const &&stream, id_type cmd_id);
void initialize() {
synth = SpeechSynthesizer();
synth.Options().IncludeSentenceBoundaryMetadata(true);
synth.Options().IncludeWordBoundaryMetadata(true);
player = MediaPlayer();
player.AudioCategory(MediaPlayerAudioCategory::Speech);
player.AutoPlay(true);
@@ -809,6 +452,59 @@ class Synthesizer {
static Synthesizer sx;
static size_t
decode_into(std::string_view src, std::wstring_view dest) {
int n = MultiByteToWideChar(CP_UTF8, 0, src.data(), (int)src.size(), (wchar_t*)dest.data(), (int)dest.size());
if (n == 0 && src.size() > 0) {
throw std::system_error(GetLastError(), std::system_category(), "Failed to decode cued text");
}
return n;
}
static std::wstring_view
parse_cued_text(std::string_view src, Marks &marks, std::wstring_view dest) {
size_t dest_pos = 0;
if (dest.size() < src.size()) throw std::exception("Destination buffer for parse_cued_text() too small");
while (src.size()) {
auto pos = src.find('\0');
size_t limit = pos == std::string_view::npos ? src.size() : pos;
if (limit) {
dest_pos += decode_into(src.substr(0, limit), dest.substr(dest_pos, dest.size() - dest_pos));
src = src.substr(limit, src.size() - limit);
}
if (pos != std::string_view::npos) {
src = src.substr(1, src.size() - 1);
if (src.size() >= 4) {
uint32_t mark = *((uint32_t*)src.data());
marks.emplace_back(mark, (uint32_t)dest_pos);
src = src.substr(4, src.size() - 4);
}
}
}
return dest.substr(0, dest_pos);
}
static std::wstring_view
read_from_shm(id_type cmd_id, const std::wstring_view size, const std::wstring &address, std::vector<wchar_t> &buf, Marks &marks, bool is_cued=false) {
id_type shm_size = parse_id(size);
handle_raii_null handle(OpenFileMappingW(FILE_MAP_READ, false, address.data()));
if (!handle) {
output_error(cmd_id, "Could not open shared memory at: " + winrt::to_string(address), winrt::to_string(get_last_error()), __LINE__);
return {};
}
mapping_raii mapping(MapViewOfFile(handle.ptr(), FILE_MAP_READ, 0, 0, (SIZE_T)shm_size));
if (!mapping) {
output_error(cmd_id, "Could not map shared memory", winrt::to_string(get_last_error()), __LINE__);
return {};
}
buf.reserve(shm_size + 2);
std::string_view src((const char*)mapping.ptr(), shm_size);
std::wstring_view dest(buf.data(), buf.capacity());
if (is_cued) return parse_cued_text(src, marks, dest);
return std::wstring_view(buf.data(), decode_into(src, dest));
}
// Speak {{{
void Synthesizer::on_cue_entered(id_type cmd_id, const winrt::hstring &label, const SpeechCue &cue) {
std::scoped_lock sl(recursive_lock);
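For reference, a minimal Python sketch of the byte layout that parse_cued_text() above expects (an illustration only, not code from this commit; on the Python side the buffer is produced by encode_to_file_object(), whose body is not shown here). Cued text is plain UTF-8 with each cue embedded inline as a NUL byte followed by a 4-byte native-endian (little-endian on Windows) unsigned mark id; the parser records each mark together with the decoded character offset at which the following text starts.

```python
import struct

def encode_cued_text(chunks):
    # Hypothetical helper mirroring what parse_cued_text() reads back:
    # UTF-8 text runs with each cue encoded as b"\0" + a 4-byte
    # little-endian unsigned mark id, placed just before the text it labels.
    # chunks: iterable of (mark_id, text) pairs.
    out = bytearray()
    for mark_id, text in chunks:
        out += b"\0" + struct.pack("<I", mark_id)
        out += text.encode("utf-8")
    return bytes(out)

# e.g. encode_cued_text([(1, "Lucca "), (2, "Brazzi "), (3, "sleeps")])
```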
@@ -910,6 +606,8 @@ winrt::fire_and_forget Synthesizer::speak(id_type cmd_id, std::wstring_view cons
current_cmd_id.store(cmd_id);
current_text_storage = std::move(buf);
current_marks = std::move(marks);
synth.Options().IncludeSentenceBoundaryMetadata(true);
synth.Options().IncludeWordBoundaryMetadata(true);
}
output(cmd_id, "synthesizing", {{"ssml", is_ssml}, {"num_marks", current_marks.size()}, {"text_length", text.size()}});
bool ok = false;
@@ -927,69 +625,6 @@ winrt::fire_and_forget Synthesizer::speak(id_type cmd_id, std::wstring_view cons
}
}
static size_t
decode_into(std::string_view src, std::wstring_view dest) {
int n = MultiByteToWideChar(CP_UTF8, 0, src.data(), (int)src.size(), (wchar_t*)dest.data(), (int)dest.size());
if (n == 0 && src.size() > 0) {
switch (GetLastError()) {
case ERROR_INSUFFICIENT_BUFFER:
throw std::exception("Output buffer too small while decoding cued text");
case ERROR_INVALID_FLAGS:
throw std::exception("Invalid flags while decoding cued text");
case ERROR_INVALID_PARAMETER:
throw std::exception("Invalid parameters while decoding cued text");
case ERROR_NO_UNICODE_TRANSLATION:
throw std::exception("Invalid UTF-8 found while decoding cued text");
default:
throw std::exception("Unknown error while decoding cued text");
}
}
return n;
}
static std::wstring_view
parse_cued_text(std::string_view src, Marks &marks, std::wstring_view dest) {
size_t dest_pos = 0;
if (dest.size() < src.size()) throw std::exception("Destination buffer for parse_cued_text() too small");
while (src.size()) {
auto pos = src.find('\0');
size_t limit = pos == std::string_view::npos ? src.size() : pos;
if (limit) {
dest_pos += decode_into(src.substr(0, limit), dest.substr(dest_pos, dest.size() - dest_pos));
src = src.substr(limit, src.size() - limit);
}
if (pos != std::string_view::npos) {
src = src.substr(1, src.size() - 1);
if (src.size() >= 4) {
uint32_t mark = *((uint32_t*)src.data());
marks.emplace_back(mark, (uint32_t)dest_pos);
src = src.substr(4, src.size() - 4);
}
}
}
return dest.substr(0, dest_pos);
}
static std::wstring_view
read_from_shm(id_type cmd_id, const std::wstring_view size, const std::wstring_view address, std::vector<wchar_t> &buf, Marks &marks, bool is_cued=false) {
id_type shm_size = parse_id(size);
handle_raii handle(OpenFileMappingW(FILE_MAP_READ, false, address.data()));
if (handle.ptr() == INVALID_HANDLE_VALUE) {
output_error(cmd_id, "Could not open shared memory at", winrt::to_string(address), __LINE__);
return {};
}
mapping_raii mapping(MapViewOfFile(handle.ptr(), FILE_MAP_READ, 0, 0, (SIZE_T)shm_size));
if (!mapping) {
output_error(cmd_id, "Could not map shared memory with error", std::to_string(GetLastError()), __LINE__);
return {};
}
buf.reserve(shm_size + 2);
std::string_view src((const char*)mapping.ptr(), shm_size);
std::wstring_view dest(buf.data(), buf.capacity());
if (is_cued) return parse_cued_text(src, marks, dest);
return std::wstring_view(buf.data(), decode_into(src, dest));
}
static void
handle_speak(id_type cmd_id, std::vector<std::wstring_view> &parts) {
bool is_ssml = false, is_shm = false, is_cued = false;
@@ -1006,7 +641,7 @@ handle_speak(id_type cmd_id, std::vector<std::wstring_view> &parts) {
std::vector<wchar_t> buf;
std::wstring_view text;
if (is_shm) {
text = read_from_shm(cmd_id, parts.at(0), parts.at(1), buf, marks, is_cued);
text = read_from_shm(cmd_id, parts.at(0), std::wstring(parts.at(1)), buf, marks, is_cued);
if (text.size() == 0) return;
} else {
address = join(parts);
@@ -1021,11 +656,43 @@ handle_speak(id_type cmd_id, std::vector<std::wstring_view> &parts) {
// }}}
// Save {{{
static winrt::fire_and_forget
save_stream(SpeechSynthesisStream const &&stream, std::ofstream &&outfile, id_type cmd_id) {
unsigned long long stream_size = stream.Size(), bytes_read = 0;
DataReader reader(stream);
unsigned int n;
const static unsigned int chunk_size = 16 * 1024;
uint8_t buf[chunk_size];
while (bytes_read < stream_size) {
bool ok = false;
try {
n = co_await reader.LoadAsync(chunk_size);
ok = true;
} CATCH_ALL_EXCEPTIONS("Failed to load data from DataReader", cmd_id);
if (!ok) break;
if (n > 0) {
bytes_read += n;
ok = false;
try {
reader.ReadBytes(winrt::array_view(buf, buf + n));
outfile.write((const char*)buf, n);
if (!outfile.good()) throw "Failed to write to output file";
ok = true;
} CATCH_ALL_EXCEPTIONS("Failed to save bytes from DataReader to file", cmd_id);
if (!ok) break;
}
}
outfile.close();
output(cmd_id, "saved", {{"size", bytes_read}});
}
void
Synthesizer::load_stream_for_save(SpeechSynthesisStream const &&stream, id_type cmd_id) {
Synthesizer::start_save_stream(SpeechSynthesisStream const &&stream, id_type cmd_id) {
std::scoped_lock sl(recursive_lock);
if (cmd_id != current_cmd_id.load()) return;
current_stream = stream;
try {
save_stream(std::move(stream), std::move(outfile), cmd_id);
} CATCH_ALL_EXCEPTIONS("Failed to save loaded stream", cmd_id);
stop_current_activity();
}
winrt::fire_and_forget Synthesizer::save(id_type cmd_id, std::wstring_view const &text, bool is_ssml, std::vector<wchar_t> &&buf, std::ofstream &&out) {
@@ -1035,8 +702,9 @@ winrt::fire_and_forget Synthesizer::save(id_type cmd_id, std::wstring_view const
current_cmd_id.store(cmd_id);
current_text_storage = std::move(buf);
outfile = std::move(out);
synth.Options().IncludeSentenceBoundaryMetadata(false);
synth.Options().IncludeWordBoundaryMetadata(false);
}
output(cmd_id, "saving", {{"ssml", is_ssml}});
bool ok = false;
try {
if (is_ssml) stream = co_await synth.SynthesizeSsmlToStreamAsync(text);
@@ -1046,7 +714,7 @@ winrt::fire_and_forget Synthesizer::save(id_type cmd_id, std::wstring_view const
if (ok) {
if (main_loop_is_running.load()) {
try {
load_stream_for_save(std::move(stream), cmd_id);
sx.start_save_stream(std::move(stream), cmd_id);
} CATCH_ALL_EXCEPTIONS("Failed to load synthesized stream for save", cmd_id);
}
}
@@ -1058,16 +726,20 @@ handle_save(id_type cmd_id, std::vector<std::wstring_view> &parts) {
try {
is_ssml = parts.at(0) == L"ssml";
} catch (std::exception const&) {
throw std::string("Not a well formed save command");
throw "Not a well formed save command"s;
}
std::vector<wchar_t> buf;
std::wstring_view text;
std::wstring address;
Marks marks;
text = read_from_shm(cmd_id, parts.at(2), parts.at(3), buf, marks);
std::wstring_view text = read_from_shm(cmd_id, parts.at(1), std::wstring(parts.at(2)), buf, marks);
if (text.size() == 0) return;
parts.erase(parts.begin(), parts.begin() + 3);
*((wchar_t*)text.data() + text.size()) = 0; // ensure NULL termination
std::ofstream outfile(parts.at(1), std::ios::out | std::ios::trunc);
auto filename = join(parts);
auto path = std::filesystem::absolute(filename);
std::ofstream outfile(path.string(), std::ios::out | std::ios::trunc);
if (!outfile.good()) throw "Failed to create: " + path.string();
output(cmd_id, "saving", {{"ssml", is_ssml}, {"output_path", path.string()}});
sx.save(cmd_id, text, is_ssml, std::move(buf), std::move(outfile));
}
// }}}
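Pieced together from handle_save() above and the develop_save() helper in the Python file below, the save command the worker accepts on stdin looks like: <cmd_id> save <text|ssml> <shm_size> <shm_name> <output path>, where everything after the shared-memory name is re-joined on the C++ side, so the output path may contain spaces. Below is a hedged Python sketch of driving it and waiting for the JSON reply (assuming text-mode pipes to the worker; the helper's own send()/develop_loop() are the real implementation, this is only an illustration).

```python
import json

def save_to_wav(worker, cmd_id, st, sz, shm_name, filename):
    # Sketch only: issue a save command and block until the worker reports
    # the result. Replies are JSON lines on stdout carrying "related_to"
    # (the command id) and a "payload_type" such as "saving", "saved" or "error".
    print(f'{cmd_id} save {st} {sz} {shm_name} {filename}', file=worker.stdin, flush=True)
    for line in worker.stdout:
        m = json.loads(line)
        if m.get('related_to') != cmd_id:
            continue
        if m['payload_type'] == 'saved':
            return m.get('size')   # number of bytes written to the WAV file
        if m['payload_type'] == 'error':
            raise RuntimeError(m)
```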
@@ -1132,6 +804,9 @@ run_main_loop(PyObject*, PyObject*) {
std::cout.imbue(std::locale("C"));
std::cin.imbue(std::locale("C"));
std::cerr.imbue(std::locale("C"));
std::wcin.imbue(std::locale("C"));
std::wcout.imbue(std::locale("C"));
std::wcerr.imbue(std::locale("C"));
} CATCH_ALL_EXCEPTIONS("Failed to set stdio locales to C", 0);
winrt::init_apartment(); // MTA (multi-threaded apartment)
main_thread_id = GetCurrentThreadId();


@@ -3,14 +3,15 @@
import json
import os
import struct
import sys
from contextlib import closing
from queue import Queue
from threading import Thread
from calibre.utils.shm import SharedMemory
from calibre.utils.ipc.simple_worker import start_pipe_worker
from calibre.utils.shm import SharedMemory
SSML_SAMPLE = '''
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
@@ -61,9 +62,8 @@ def encode_to_file_object(text, output) -> int:
return sz
def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True):
def develop_loop(wait_for, *commands):
p = start_worker()
print('\x1b[32mSpeaking', text, '\x1b[39m]]'[:-2], flush=True)
q = Queue()
def echo_output(p):
@@ -79,6 +79,36 @@ def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True)
Thread(name='Echo', target=echo_output, args=(p,), daemon=True).start()
exit_code = 0
with closing(p.stdin), closing(p.stdout):
try:
send('1 echo Synthesizer started')
send('1 volume 0.1')
for command in commands:
send(command)
while True:
m = q.get()
if m['related_to'] != wait_for:
continue
if m['payload_type'] == 'media_state_changed' and m['state'] == 'ended':
break
if m['payload_type'] == 'saved':
break
if m['payload_type'] == 'error':
exit_code = 1
break
send(f'333 echo Synthesizer exiting with exit code: {exit_code}')
send(f'334 exit {exit_code}')
ec = p.wait(1)
print(f'Worker exited with code: {os.waitstatus_to_exitcode(p.wait(1))}', file=sys.stderr, flush=True)
raise SystemExit(ec)
finally:
if p.poll() is None:
p.kill()
raise SystemExit(1)
def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True):
print('\x1b[32mSpeaking', text, '\x1b[39m]]'[:-2], flush=True)
st = 'ssml' if '<speak' in text else 'text'
if mark_words:
st = 'cued'
@@ -89,25 +119,15 @@ def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True)
text.append(w)
if w is not words[-1]:
text.append(' ')
with closing(p.stdin), closing(p.stdout), SharedMemory(size=max_buffer_size(text)) as shm:
with SharedMemory(size=max_buffer_size(text)) as shm:
sz = encode_to_file_object(text, shm)
try:
send('1 echo Synthesizer started')
send('1 volume 0.1')
send(f'2 speak {st} shm {sz} {shm.name}')
while True:
m = q.get()
if m['related_to'] != 2:
continue
if m['payload_type'] == 'media_state_changed' and m['state'] == 'ended':
break
if m['payload_type'] == 'error':
exit_code = 1
break
send(f'3 echo Synthesizer exiting with exit code: {exit_code}')
send(f'4 exit {exit_code}')
raise SystemExit(p.wait(1))
finally:
if p.poll() is None:
p.kill()
raise SystemExit(1)
develop_loop(2, f'2 speak {st} shm {sz} {shm.name}')
def develop_save(text='Lucca Brazzi sleeps with the fishes.', filename="speech.wav"):
print('\x1b[32mSaving', text, '\x1b[39m]]'[:-2], flush=True)
st = 'ssml' if '<speak' in text else 'text'
with SharedMemory(size=max_buffer_size(text)) as shm:
sz = encode_to_file_object(text, shm)
develop_loop(2, f'2 save {st} {sz} {shm.name} {filename}')
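A possible way to exercise the new save path from a calibre development environment (a usage sketch only; develop_save() already defaults to the sample sentence and speech.wav, SSML input is detected by the '<speak' substring check above, and the ssml-sample.wav name is just an example):

```python
develop_save()                                # sample sentence -> speech.wav
develop_save(SSML_SAMPLE, 'ssml-sample.wav')  # SSML_SAMPLE is defined at the top of this file
```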