Saving to WAV kinda works, though the audio is very distorted

Kovid Goyal 2023-01-28 12:32:40 +05:30
parent 467af44edb
commit 6e9c4e0bb9
2 changed files with 171 additions and 476 deletions


@@ -8,7 +8,10 @@
#include <algorithm>
#include <atomic>
#include <filesystem>
#include <array>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <deque>
@@ -44,15 +47,26 @@ typedef uint64_t id_type;
static std::mutex output_lock;
static DWORD main_thread_id;
template<typename T, typename... Args> static void
__debug_multiple(T x, Args... args) {
std::cerr << x << " ";
__debug_multiple(args...);
template<typename T> static void
__debug_multiple_impl(T x) {
if constexpr (std::is_same_v<T, wchar_t*> || std::is_same_v<T, std::wstring> || std::is_same_v<T, winrt::hstring> || std::is_same_v<T, std::wstring_view>) {
std::cerr << winrt::to_string(x);
} else {
std::cerr << x;
}
}
template<typename T> static void
__debug_multiple(T x) {
std::cerr << x << std::endl;
__debug_multiple_impl(x);
std::cerr << std::endl;
}
template<typename T, typename... Args> static void
__debug_multiple(T x, Args... args) {
__debug_multiple_impl(x);
std::cerr << " ";
__debug_multiple(args...);
}
template<typename... Args> static void
@@ -71,14 +85,6 @@ enum {
EXIT_REQUESTED
};
// trim from end (in place)
static inline void
rtrim(std::string &s) {
s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
return !std::isspace(ch);
}).base(), s.end());
}
static std::vector<std::wstring_view>
split(std::wstring_view const &src, std::wstring const &delim = L" ") {
size_t pos;
@@ -350,6 +356,8 @@ output_error(id_type cmd_id, std::string_view const &msg, std::string_view const
#define CATCH_ALL_EXCEPTIONS(msg, cmd_id) \
catch(winrt::hresult_error const& ex) { \
output_error(cmd_id, msg, winrt::to_string(ex.message()), __LINE__, ex.to_abi()); \
} catch(const std::system_error& ex) { \
output_error(cmd_id, msg, "system_error with code: " + std::to_string(ex.code().value()) + " and meaning: " + ex.what(), __LINE__); \
} catch (std::exception const &ex) { \
output_error(cmd_id, msg, ex.what(), __LINE__); \
} catch (std::string const &ex) { \
@@ -360,369 +368,6 @@ output_error(id_type cmd_id, std::string_view const &msg, std::string_view const
output_error(cmd_id, msg, "Unknown exception type was raised", __LINE__); \
}
/* Legacy code {{{
template<typename T>
class WeakRefs {
private:
std::mutex weak_ref_lock;
std::unordered_map<id_type, T*> refs;
id_type counter;
public:
id_type register_ref(T *self) {
std::scoped_lock lock(weak_ref_lock);
auto id = ++counter;
refs[id] = self;
return id;
}
void unregister_ref(T* self) {
std::scoped_lock lock(weak_ref_lock);
auto id = self->clear_id();
refs.erase(id);
self->~T();
}
void use_ref(id_type id, std::function<void(T*)> callback) {
std::scoped_lock lock(weak_ref_lock);
try {
callback(refs.at(id));
} catch (std::out_of_range) {
callback(NULL);
}
}
};
enum class EventType {
playback_state_changed = 1, media_opened, media_failed, media_ended, source_changed, cue_entered, cue_exited, track_failed
};
class Event {
private:
EventType type;
public:
Event(EventType type) : type(type) {}
Event(const Event &source) : type(source.type) {}
};
class SynthesizerImplementation {
private:
id_type id;
DWORD creation_thread_id;
SpeechSynthesizer synth{nullptr};
MediaPlayer player{nullptr};
MediaSource current_source{nullptr};
SpeechSynthesisStream current_stream{nullptr};
MediaPlaybackItem currently_playing{nullptr};
struct {
MediaPlaybackSession::PlaybackStateChanged_revoker playback_state_changed;
MediaPlayer::MediaEnded_revoker media_ended; MediaPlayer::MediaOpened_revoker media_opened;
MediaPlayer::MediaFailed_revoker media_failed; MediaPlayer::SourceChanged_revoker source_changed;
MediaPlaybackItem::TimedMetadataTracksChanged_revoker timed_metadata_tracks_changed;
std::vector<TimedMetadataTrack::CueEntered_revoker> cue_entered;
std::vector<TimedMetadataTrack::CueExited_revoker> cue_exited;
std::vector<TimedMetadataTrack::TrackFailed_revoker> track_failed;
} revoker;
std::vector<Event> events;
std::mutex events_lock;
public:
SynthesizerImplementation();
void add_simple_event(EventType type) {
try {
std::scoped_lock lock(events_lock);
events.emplace_back(type);
} catch(...) {}
}
SpeechSynthesisStream synthesize(const std::wstring_view &text, bool is_ssml = false) {
if (is_ssml) return synth.SynthesizeSsmlToStreamAsync(text).get();
return synth.SynthesizeTextToStreamAsync(text).get();
}
void speak(const std::wstring_view &text, bool is_ssml = false) {
revoker.cue_entered.clear();
revoker.cue_exited.clear();
revoker.track_failed.clear();
current_stream = synthesize(text, is_ssml);
current_source = MediaSource::CreateFromStream(current_stream, current_stream.ContentType());
currently_playing = MediaPlaybackItem(current_source);
auto self_id = id;
revoker.timed_metadata_tracks_changed = currently_playing.TimedMetadataTracksChanged(winrt::auto_revoke, [self_id](auto, auto const &args) {
auto change_type = args.CollectionChange();
auto index = args.Index();
synthesizer_weakrefs.use_ref(self_id, [change_type, index](auto s) {
if (!s) return;
switch (change_type) {
case CollectionChange::ItemInserted: {
s->register_metadata_handler_for_speech(s->currently_playing.TimedMetadataTracks().GetAt(index));
} break;
case CollectionChange::Reset:
for (auto const& track : s->currently_playing.TimedMetadataTracks()) {
s->register_metadata_handler_for_speech(track);
}
break;
}});
});
player.Source(currently_playing);
for (auto const &track : currently_playing.TimedMetadataTracks()) {
register_metadata_handler_for_speech(track);
}
}
bool is_creation_thread() const noexcept {
return creation_thread_id == GetCurrentThreadId();
}
id_type clear_id() noexcept {
auto ans = id;
id = 0;
return ans;
}
void register_metadata_handler_for_speech(TimedMetadataTrack const& track) {
fprintf(stderr, "99999999999 registering metadata handler\n");
auto self_id = id;
#define simple_event_listener(method, event_type) \
revoker.event_type.push_back(method(winrt::auto_revoke, [self_id](auto, const auto&) { \
fprintf(stderr, "111111111 %s %u\n", #event_type, GetCurrentThreadId()); fflush(stderr); \
synthesizer_weakrefs.use_ref(self_id, [](auto s) { \
if (!s) return; \
s->add_simple_event(EventType::event_type); \
fprintf(stderr, "2222222222 %d\n", s->player.PlaybackSession().PlaybackState()); \
}); \
}));
simple_event_listener(track.CueEntered, cue_entered);
simple_event_listener(track.CueExited, cue_exited);
simple_event_listener(track.TrackFailed, track_failed);
#undef simple_event_listener
track.CueEntered([](auto, const auto&) {
fprintf(stderr, "cue entered\n"); fflush(stderr);
});
}
};
struct Synthesizer {
PyObject_HEAD
SynthesizerImplementation impl;
};
static PyTypeObject SynthesizerType = {
PyVarObject_HEAD_INIT(NULL, 0)
};
static WeakRefs<SynthesizerImplementation> synthesizer_weakrefs;
SynthesizerImplementation::SynthesizerImplementation() {
events.reserve(128);
synth = SpeechSynthesizer();
synth.Options().IncludeSentenceBoundaryMetadata(true);
synth.Options().IncludeWordBoundaryMetadata(true);
player = MediaPlayer();
player.AudioCategory(MediaPlayerAudioCategory::Speech);
player.AutoPlay(true);
creation_thread_id = GetCurrentThreadId();
id = synthesizer_weakrefs.register_ref(this);
auto self_id = id;
#define simple_event_listener(method, event_type) \
revoker.event_type = method(winrt::auto_revoke, [self_id](auto, const auto&) { \
fprintf(stderr, "111111111 %s %u\n", #event_type, GetCurrentThreadId()); fflush(stderr); \
synthesizer_weakrefs.use_ref(self_id, [](auto s) { \
if (!s) return; \
s->add_simple_event(EventType::event_type); \
fprintf(stderr, "2222222222 %d\n", s->player.PlaybackSession().PlaybackState()); \
}); \
});
simple_event_listener(player.PlaybackSession().PlaybackStateChanged, playback_state_changed);
simple_event_listener(player.MediaOpened, media_opened);
simple_event_listener(player.MediaFailed, media_failed);
simple_event_listener(player.MediaEnded, media_ended);
simple_event_listener(player.SourceChanged, source_changed);
#undef simple_event_listener
player.PlaybackSession().PlaybackStateChanged([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "playback state changed", GetCurrentThreadId()); fflush(stderr); \
});
player.MediaOpened([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "media opened", GetCurrentThreadId()); fflush(stderr); \
});
player.MediaFailed([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "media failed", GetCurrentThreadId()); fflush(stderr); \
});
player.MediaEnded([](auto, const auto&) {
fprintf(stderr, "111111111 %s %u\n", "media ended", GetCurrentThreadId()); fflush(stderr); \
});
}
static PyObject*
Synthesizer_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { INITIALIZE_COM_IN_FUNCTION
Synthesizer *self = (Synthesizer *) type->tp_alloc(type, 0);
if (self) {
auto i = &self->impl;
try {
new (i) SynthesizerImplementation();
} CATCH_ALL_EXCEPTIONS("Failed to create SynthesizerImplementation object");
if (PyErr_Occurred()) { Py_CLEAR(self); }
}
if (self) com.detach();
return (PyObject*)self;
}
static void
Synthesizer_dealloc(Synthesizer *self) {
auto *i = &self->impl;
try {
synthesizer_weakrefs.unregister_ref(i);
} CATCH_ALL_EXCEPTIONS("Failed to destruct SynthesizerImplementation");
if (PyErr_Occurred()) { PyErr_Print(); }
Py_TYPE(self)->tp_free((PyObject*)self);
CoUninitialize();
}
static void
ensure_current_thread_has_message_queue(void) {
MSG msg;
PeekMessage(&msg, NULL, WM_USER, WM_USER, PM_NOREMOVE);
}
#define PREPARE_METHOD_CALL ensure_current_thread_has_message_queue(); if (!self->impl.is_creation_thread()) { PyErr_SetString(PyExc_RuntimeError, "Cannot use a Synthesizer object from a thread other than the thread it was created in"); return NULL; }
static PyObject*
Synthesizer_speak(Synthesizer *self, PyObject *args) {
PREPARE_METHOD_CALL;
wchar_raii pytext;
int is_ssml = 0;
if (!PyArg_ParseTuple(args, "O&|p", py_to_wchar_no_none, &pytext, &is_ssml)) return NULL;
try {
self->impl.speak(pytext.as_view(), (bool)is_ssml);
} CATCH_ALL_EXCEPTIONS("Failed to start speaking text");
if (PyErr_Occurred()) return NULL;
Py_RETURN_NONE;
}
static PyObject*
Synthesizer_create_recording(Synthesizer *self, PyObject *args) {
PREPARE_METHOD_CALL;
wchar_raii pytext;
PyObject *callback;
int is_ssml = 0;
if (!PyArg_ParseTuple(args, "O&O|p", py_to_wchar_no_none, &pytext, &callback, &is_ssml)) return NULL;
if (!PyCallable_Check(callback)) { PyErr_SetString(PyExc_TypeError, "callback must be callable"); return NULL; }
SpeechSynthesisStream stream{nullptr};
try {
stream = self->impl.synthesize(pytext.as_view(), (bool)is_ssml);
} CATCH_ALL_EXCEPTIONS( "Failed to get SpeechSynthesisStream from text");
if (PyErr_Occurred()) return NULL;
unsigned long long stream_size = stream.Size(), bytes_read = 0;
DataReader reader(stream);
unsigned int n;
const static unsigned int chunk_size = 16 * 1024;
while (bytes_read < stream_size) {
try {
n = reader.LoadAsync(chunk_size).get();
} CATCH_ALL_EXCEPTIONS("Failed to load data from DataReader");
if (PyErr_Occurred()) return NULL;
if (n > 0) {
bytes_read += n;
pyobject_raii b(PyBytes_FromStringAndSize(NULL, n));
if (!b) return NULL;
unsigned char *p = reinterpret_cast<unsigned char*>(PyBytes_AS_STRING(b.ptr()));
reader.ReadBytes(winrt::array_view(p, p + n));
pyobject_raii ret(PyObject_CallFunctionObjArgs(callback, b.ptr(), NULL));
}
}
if (PyErr_Occurred()) return NULL;
Py_RETURN_NONE;
}
static PyObject*
voice_as_dict(VoiceInformation const& voice) {
try {
const char *gender = "";
switch (voice.Gender()) {
case VoiceGender::Male: gender = "male"; break;
case VoiceGender::Female: gender = "female"; break;
}
return Py_BuildValue("{su su su su ss}",
"display_name", voice.DisplayName().c_str(),
"description", voice.Description().c_str(),
"id", voice.Id().c_str(),
"language", voice.Language().c_str(),
"gender", gender
);
} CATCH_ALL_EXCEPTIONS("Could not convert Voice to dict");
return NULL;
}
static PyObject*
all_voices(PyObject*, PyObject*) { INITIALIZE_COM_IN_FUNCTION
try {
auto voices = SpeechSynthesizer::AllVoices();
pyobject_raii ans(PyTuple_New(voices.Size()));
if (!ans) return NULL;
Py_ssize_t i = 0;
for(auto const& voice : voices) {
PyObject *v = voice_as_dict(voice);
if (v) {
PyTuple_SET_ITEM(ans.ptr(), i++, v);
} else {
return NULL;
}
}
return ans.detach();
} CATCH_ALL_EXCEPTIONS("Could not get all voices");
return NULL;
}
static PyObject*
default_voice(PyObject*, PyObject*) { INITIALIZE_COM_IN_FUNCTION
try {
return voice_as_dict(SpeechSynthesizer::DefaultVoice());
} CATCH_ALL_EXCEPTIONS("Could not get default voice");
return NULL;
}
#define M(name, args) { #name, (PyCFunction)Synthesizer_##name, args, ""}
static PyMethodDef Synthesizer_methods[] = {
M(create_recording, METH_VARARGS),
M(speak, METH_VARARGS),
{NULL, NULL, 0, NULL}
};
#undef M
static PyObject*
pump_waiting_messages(PyObject*, PyObject*) {
UINT firstMsg = 0, lastMsg = 0;
MSG msg;
bool found = false;
// Read all of the messages in this next loop,
// removing each message as we read it.
while (PeekMessage(&msg, NULL, firstMsg, lastMsg, PM_REMOVE)) {
// If it's a quit message, we're out of here.
if (msg.message == WM_QUIT) {
Py_RETURN_NONE;
}
found = true;
// Otherwise, dispatch the message.
DispatchMessage(&msg);
} // End of PeekMessage while loop
if (found) Py_RETURN_TRUE;
Py_RETURN_FALSE;
}
}}} */
struct Revokers {
MediaPlaybackSession::PlaybackStateChanged_revoker playback_state_changed;
MediaPlayer::MediaEnded_revoker media_ended; MediaPlayer::MediaOpened_revoker media_opened;
@@ -767,12 +412,10 @@ class Synthesizer {
// }}}
winrt::fire_and_forget save(id_type cmd_id, std::wstring_view const &text, bool is_ssml, std::vector<wchar_t> &&buf, std::ofstream &&outfile);
void load_stream_for_save(SpeechSynthesisStream const &&stream, id_type cmd_id);
void start_save_stream(SpeechSynthesisStream const &&stream, id_type cmd_id);
void initialize() {
synth = SpeechSynthesizer();
synth.Options().IncludeSentenceBoundaryMetadata(true);
synth.Options().IncludeWordBoundaryMetadata(true);
player = MediaPlayer();
player.AudioCategory(MediaPlayerAudioCategory::Speech);
player.AutoPlay(true);
@@ -809,6 +452,59 @@ class Synthesizer {
static Synthesizer sx;
static size_t
decode_into(std::string_view src, std::wstring_view dest) {
int n = MultiByteToWideChar(CP_UTF8, 0, src.data(), (int)src.size(), (wchar_t*)dest.data(), (int)dest.size());
if (n == 0 && src.size() > 0) {
throw std::system_error(GetLastError(), std::system_category(), "Failed to decode cued text");
}
return n;
}
static std::wstring_view
parse_cued_text(std::string_view src, Marks &marks, std::wstring_view dest) {
size_t dest_pos = 0;
if (dest.size() < src.size()) throw std::exception("Destination buffer for parse_cued_text() too small");
while (src.size()) {
auto pos = src.find('\0');
size_t limit = pos == std::string_view::npos ? src.size() : pos;
if (limit) {
dest_pos += decode_into(src.substr(0, limit), dest.substr(dest_pos, dest.size() - dest_pos));
src = src.substr(limit, src.size() - limit);
}
if (pos != std::string_view::npos) {
src = src.substr(1, src.size() - 1);
if (src.size() >= 4) {
uint32_t mark = *((uint32_t*)src.data());
marks.emplace_back(mark, (uint32_t)dest_pos);
src = src.substr(4, src.size() - 4);
}
}
}
return dest.substr(0, dest_pos);
}
static std::wstring_view
read_from_shm(id_type cmd_id, const std::wstring_view size, const std::wstring &address, std::vector<wchar_t> &buf, Marks &marks, bool is_cued=false) {
id_type shm_size = parse_id(size);
handle_raii_null handle(OpenFileMappingW(FILE_MAP_READ, false, address.data()));
if (!handle) {
output_error(cmd_id, "Could not open shared memory at: " + winrt::to_string(address), winrt::to_string(get_last_error()), __LINE__);
return {};
}
mapping_raii mapping(MapViewOfFile(handle.ptr(), FILE_MAP_READ, 0, 0, (SIZE_T)shm_size));
if (!mapping) {
output_error(cmd_id, "Could not map shared memory", winrt::to_string(get_last_error()), __LINE__);
return {};
}
buf.reserve(shm_size + 2);
std::string_view src((const char*)mapping.ptr(), shm_size);
std::wstring_view dest(buf.data(), buf.capacity());
if (is_cued) return parse_cued_text(src, marks, dest);
return std::wstring_view(buf.data(), decode_into(src, dest));
}
// Speak {{{
void Synthesizer::on_cue_entered(id_type cmd_id, const winrt::hstring &label, const SpeechCue &cue) {
std::scoped_lock sl(recursive_lock);
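For reference, a minimal Python sketch of the byte layout that parse_cued_text() above expects (an illustration only, not code from this commit; on the Python side the buffer is produced by encode_to_file_object(), whose body is not shown here). Cued text is plain UTF-8 with each cue embedded inline as a NUL byte followed by a 4-byte native-endian (little-endian on Windows) unsigned mark id; the parser records each mark together with the decoded character offset at which the following text starts.

```python
import struct

def encode_cued_text(chunks):
    # Hypothetical helper mirroring what parse_cued_text() reads back:
    # UTF-8 text runs with each cue encoded as b"\0" + a 4-byte
    # little-endian unsigned mark id, placed just before the text it labels.
    # chunks: iterable of (mark_id, text) pairs.
    out = bytearray()
    for mark_id, text in chunks:
        out += b"\0" + struct.pack("<I", mark_id)
        out += text.encode("utf-8")
    return bytes(out)

# e.g. encode_cued_text([(1, "Lucca "), (2, "Brazzi "), (3, "sleeps")])
```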
@@ -910,6 +606,8 @@ winrt::fire_and_forget Synthesizer::speak(id_type cmd_id, std::wstring_view cons
current_cmd_id.store(cmd_id);
current_text_storage = std::move(buf);
current_marks = std::move(marks);
synth.Options().IncludeSentenceBoundaryMetadata(true);
synth.Options().IncludeWordBoundaryMetadata(true);
}
output(cmd_id, "synthesizing", {{"ssml", is_ssml}, {"num_marks", current_marks.size()}, {"text_length", text.size()}});
bool ok = false;
@@ -927,69 +625,6 @@ winrt::fire_and_forget Synthesizer::speak(id_type cmd_id, std::wstring_view cons
}
}
static size_t
decode_into(std::string_view src, std::wstring_view dest) {
int n = MultiByteToWideChar(CP_UTF8, 0, src.data(), (int)src.size(), (wchar_t*)dest.data(), (int)dest.size());
if (n == 0 && src.size() > 0) {
switch (GetLastError()) {
case ERROR_INSUFFICIENT_BUFFER:
throw std::exception("Output buffer too small while decoding cued text");
case ERROR_INVALID_FLAGS:
throw std::exception("Invalid flags while decoding cued text");
case ERROR_INVALID_PARAMETER:
throw std::exception("Invalid parameters while decoding cued text");
case ERROR_NO_UNICODE_TRANSLATION:
throw std::exception("Invalid UTF-8 found while decoding cued text");
default:
throw std::exception("Unknown error while decoding cued text");
}
}
return n;
}
static std::wstring_view
parse_cued_text(std::string_view src, Marks &marks, std::wstring_view dest) {
size_t dest_pos = 0;
if (dest.size() < src.size()) throw std::exception("Destination buffer for parse_cued_text() too small");
while (src.size()) {
auto pos = src.find('\0');
size_t limit = pos == std::string_view::npos ? src.size() : pos;
if (limit) {
dest_pos += decode_into(src.substr(0, limit), dest.substr(dest_pos, dest.size() - dest_pos));
src = src.substr(limit, src.size() - limit);
}
if (pos != std::string_view::npos) {
src = src.substr(1, src.size() - 1);
if (src.size() >= 4) {
uint32_t mark = *((uint32_t*)src.data());
marks.emplace_back(mark, (uint32_t)dest_pos);
src = src.substr(4, src.size() - 4);
}
}
}
return dest.substr(0, dest_pos);
}
static std::wstring_view
read_from_shm(id_type cmd_id, const std::wstring_view size, const std::wstring_view address, std::vector<wchar_t> &buf, Marks &marks, bool is_cued=false) {
id_type shm_size = parse_id(size);
handle_raii handle(OpenFileMappingW(FILE_MAP_READ, false, address.data()));
if (handle.ptr() == INVALID_HANDLE_VALUE) {
output_error(cmd_id, "Could not open shared memory at", winrt::to_string(address), __LINE__);
return {};
}
mapping_raii mapping(MapViewOfFile(handle.ptr(), FILE_MAP_READ, 0, 0, (SIZE_T)shm_size));
if (!mapping) {
output_error(cmd_id, "Could not map shared memory with error", std::to_string(GetLastError()), __LINE__);
return {};
}
buf.reserve(shm_size + 2);
std::string_view src((const char*)mapping.ptr(), shm_size);
std::wstring_view dest(buf.data(), buf.capacity());
if (is_cued) return parse_cued_text(src, marks, dest);
return std::wstring_view(buf.data(), decode_into(src, dest));
}
static void
handle_speak(id_type cmd_id, std::vector<std::wstring_view> &parts) {
bool is_ssml = false, is_shm = false, is_cued = false;
@@ -1006,7 +641,7 @@ handle_speak(id_type cmd_id, std::vector<std::wstring_view> &parts) {
std::vector<wchar_t> buf;
std::wstring_view text;
if (is_shm) {
text = read_from_shm(cmd_id, parts.at(0), parts.at(1), buf, marks, is_cued);
text = read_from_shm(cmd_id, parts.at(0), std::wstring(parts.at(1)), buf, marks, is_cued);
if (text.size() == 0) return;
} else {
address = join(parts);
@@ -1021,11 +656,43 @@ handle_speak(id_type cmd_id, std::vector<std::wstring_view> &parts) {
// }}}
// Save {{{
static winrt::fire_and_forget
save_stream(SpeechSynthesisStream const &&stream, std::ofstream &&outfile, id_type cmd_id) {
unsigned long long stream_size = stream.Size(), bytes_read = 0;
DataReader reader(stream);
unsigned int n;
const static unsigned int chunk_size = 16 * 1024;
uint8_t buf[chunk_size];
while (bytes_read < stream_size) {
bool ok = false;
try {
n = co_await reader.LoadAsync(chunk_size);
ok = true;
} CATCH_ALL_EXCEPTIONS("Failed to load data from DataReader", cmd_id);
if (!ok) break;
if (n > 0) {
bytes_read += n;
ok = false;
try {
reader.ReadBytes(winrt::array_view(buf, buf + n));
outfile.write((const char*)buf, n);
if (!outfile.good()) throw "Failed to write to output file";
ok = true;
} CATCH_ALL_EXCEPTIONS("Failed to save bytes from DataReader to file", cmd_id);
if (!ok) break;
}
}
outfile.close();
output(cmd_id, "saved", {{"size", bytes_read}});
}
void
Synthesizer::load_stream_for_save(SpeechSynthesisStream const &&stream, id_type cmd_id) {
Synthesizer::start_save_stream(SpeechSynthesisStream const &&stream, id_type cmd_id) {
std::scoped_lock sl(recursive_lock);
if (cmd_id != current_cmd_id.load()) return;
current_stream = stream;
try {
save_stream(std::move(stream), std::move(outfile), cmd_id);
} CATCH_ALL_EXCEPTIONS("Failed to save loaded stream", cmd_id);
stop_current_activity();
}
winrt::fire_and_forget Synthesizer::save(id_type cmd_id, std::wstring_view const &text, bool is_ssml, std::vector<wchar_t> &&buf, std::ofstream &&out) {
@@ -1035,8 +702,9 @@ winrt::fire_and_forget Synthesizer::save(id_type cmd_id, std::wstring_view const
current_cmd_id.store(cmd_id);
current_text_storage = std::move(buf);
outfile = std::move(out);
synth.Options().IncludeSentenceBoundaryMetadata(false);
synth.Options().IncludeWordBoundaryMetadata(false);
}
output(cmd_id, "saving", {{"ssml", is_ssml}});
bool ok = false;
try {
if (is_ssml) stream = co_await synth.SynthesizeSsmlToStreamAsync(text);
@@ -1046,7 +714,7 @@ winrt::fire_and_forget Synthesizer::save(id_type cmd_id, std::wstring_view const
if (ok) {
if (main_loop_is_running.load()) {
try {
load_stream_for_save(std::move(stream), cmd_id);
sx.start_save_stream(std::move(stream), cmd_id);
} CATCH_ALL_EXCEPTIONS("Failed to load synthesized stream for save", cmd_id);
}
}
@@ -1058,16 +726,20 @@ handle_save(id_type cmd_id, std::vector<std::wstring_view> &parts) {
try {
is_ssml = parts.at(0) == L"ssml";
} catch (std::exception const&) {
throw std::string("Not a well formed save command");
throw "Not a well formed save command"s;
}
std::vector<wchar_t> buf;
std::wstring_view text;
std::wstring address;
Marks marks;
text = read_from_shm(cmd_id, parts.at(2), parts.at(3), buf, marks);
std::wstring_view text = read_from_shm(cmd_id, parts.at(1), std::wstring(parts.at(2)), buf, marks);
if (text.size() == 0) return;
parts.erase(parts.begin(), parts.begin() + 3);
*((wchar_t*)text.data() + text.size()) = 0; // ensure NULL termination
std::ofstream outfile(parts.at(1), std::ios::out | std::ios::trunc);
auto filename = join(parts);
auto path = std::filesystem::absolute(filename);
std::ofstream outfile(path.string(), std::ios::out | std::ios::trunc);
if (!outfile.good()) throw "Failed to create: " + path.string();
output(cmd_id, "saving", {{"ssml", is_ssml}, {"output_path", path.string()}});
sx.save(cmd_id, text, is_ssml, std::move(buf), std::move(outfile));
}
// }}}
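Pieced together from handle_save() above and the develop_save() helper in the Python file below, the save command the worker accepts on stdin looks like: <cmd_id> save <text|ssml> <shm_size> <shm_name> <output path>, where everything after the shared-memory name is re-joined on the C++ side, so the output path may contain spaces. Below is a hedged Python sketch of driving it and waiting for the JSON reply (assuming text-mode pipes to the worker; the helper's own send()/develop_loop() are the real implementation, this is only an illustration).

```python
import json

def save_to_wav(worker, cmd_id, st, sz, shm_name, filename):
    # Sketch only: issue a save command and block until the worker reports
    # the result. Replies are JSON lines on stdout carrying "related_to"
    # (the command id) and a "payload_type" such as "saving", "saved" or "error".
    print(f'{cmd_id} save {st} {sz} {shm_name} {filename}', file=worker.stdin, flush=True)
    for line in worker.stdout:
        m = json.loads(line)
        if m.get('related_to') != cmd_id:
            continue
        if m['payload_type'] == 'saved':
            return m.get('size')   # number of bytes written to the WAV file
        if m['payload_type'] == 'error':
            raise RuntimeError(m)
```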
@@ -1132,6 +804,9 @@ run_main_loop(PyObject*, PyObject*) {
std::cout.imbue(std::locale("C"));
std::cin.imbue(std::locale("C"));
std::cerr.imbue(std::locale("C"));
std::wcin.imbue(std::locale("C"));
std::wcout.imbue(std::locale("C"));
std::wcerr.imbue(std::locale("C"));
} CATCH_ALL_EXCEPTIONS("Failed to set stdio locales to C", 0);
winrt::init_apartment(); // MTA (multi-threaded apartment)
main_thread_id = GetCurrentThreadId();


@@ -3,14 +3,15 @@
import json
import os
import struct
import sys
from contextlib import closing
from queue import Queue
from threading import Thread
from calibre.utils.shm import SharedMemory
from calibre.utils.ipc.simple_worker import start_pipe_worker
from calibre.utils.shm import SharedMemory
SSML_SAMPLE = '''
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
@@ -61,9 +62,8 @@ def encode_to_file_object(text, output) -> int:
return sz
def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True):
def develop_loop(wait_for, *commands):
p = start_worker()
print('\x1b[32mSpeaking', text, '\x1b[39m]]'[:-2], flush=True)
q = Queue()
def echo_output(p):
@@ -79,6 +79,36 @@ def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True)
Thread(name='Echo', target=echo_output, args=(p,), daemon=True).start()
exit_code = 0
with closing(p.stdin), closing(p.stdout):
try:
send('1 echo Synthesizer started')
send('1 volume 0.1')
for command in commands:
send(command)
while True:
m = q.get()
if m['related_to'] != wait_for:
continue
if m['payload_type'] == 'media_state_changed' and m['state'] == 'ended':
break
if m['payload_type'] == 'saved':
break
if m['payload_type'] == 'error':
exit_code = 1
break
send(f'333 echo Synthesizer exiting with exit code: {exit_code}')
send(f'334 exit {exit_code}')
ec = p.wait(1)
print(f'Worker exited with code: {os.waitstatus_to_exitcode(p.wait(1))}', file=sys.stderr, flush=True)
raise SystemExit(ec)
finally:
if p.poll() is None:
p.kill()
raise SystemExit(1)
def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True):
print('\x1b[32mSpeaking', text, '\x1b[39m]]'[:-2], flush=True)
st = 'ssml' if '<speak' in text else 'text'
if mark_words:
st = 'cued'
@@ -89,25 +119,15 @@ def develop_speech(text='Lucca Brazzi sleeps with the fishes.', mark_words=True)
text.append(w)
if w is not words[-1]:
text.append(' ')
with closing(p.stdin), closing(p.stdout), SharedMemory(size=max_buffer_size(text)) as shm:
with SharedMemory(size=max_buffer_size(text)) as shm:
sz = encode_to_file_object(text, shm)
try:
send('1 echo Synthesizer started')
send('1 volume 0.1')
send(f'2 speak {st} shm {sz} {shm.name}')
while True:
m = q.get()
if m['related_to'] != 2:
continue
if m['payload_type'] == 'media_state_changed' and m['state'] == 'ended':
break
if m['payload_type'] == 'error':
exit_code = 1
break
send(f'3 echo Synthesizer exiting with exit code: {exit_code}')
send(f'4 exit {exit_code}')
raise SystemExit(p.wait(1))
finally:
if p.poll() is None:
p.kill()
raise SystemExit(1)
develop_loop(2, f'2 speak {st} shm {sz} {shm.name}')
def develop_save(text='Lucca Brazzi sleeps with the fishes.', filename="speech.wav"):
print('\x1b[32mSaving', text, '\x1b[39m]]'[:-2], flush=True)
st = 'ssml' if '<speak' in text else 'text'
with SharedMemory(size=max_buffer_size(text)) as shm:
sz = encode_to_file_object(text, shm)
develop_loop(2, f'2 save {st} {sz} {shm.name} {filename}')
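A possible way to exercise the new save path from a calibre development environment (a usage sketch only; develop_save() already defaults to the sample sentence and speech.wav, SSML input is detected by the '<speak' substring check above, and the ssml-sample.wav name is just an example):

```python
develop_save()                                # sample sentence -> speech.wav
develop_save(SSML_SAMPLE, 'ssml-sample.wav')  # SSML_SAMPLE is defined at the top of this file
```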