diff --git a/src/calibre/gui2/tts/qt.py b/src/calibre/gui2/tts/qt.py
index 249f97763c..bacb537afb 100644
--- a/src/calibre/gui2/tts/qt.py
+++ b/src/calibre/gui2/tts/qt.py
@@ -13,8 +13,9 @@ class QtTTSBackend(TTSBackend):
         super().__init__(parent)
         self.speaking_text = ''
         self.last_word_offset = 0
-        self._qt_reload_after_configure(engine_name)
         self.last_spoken_word = None
+        self.ignore_tracking_until_state_changes_to_speaking = False
+        self._qt_reload_after_configure(engine_name)
 
     @property
     def available_voices(self) -> dict[str, tuple[Voice, ...]]:
@@ -43,6 +44,8 @@ class QtTTSBackend(TTSBackend):
         self.last_word_offset = 0
         self.last_spoken_word = None
         self.speaking_text = text
+        if self.tts.engine() == 'sapi':
+            self.ignore_tracking_until_state_changes_to_speaking = True
         self.tts.say(text)
 
     def error_message(self) -> str:
@@ -96,18 +99,23 @@ class QtTTSBackend(TTSBackend):
         self._current_settings = settings
 
     def _saying_word(self, word: str, utterance_id: int, start: int, length: int) -> None:
+        # print(f'{repr(word)=} {start=} {length=}, {repr(self.speaking_text[start:start+length])=} {self.ignore_tracking_until_state_changes_to_speaking=}')
+        if self.ignore_tracking_until_state_changes_to_speaking:
+            return
         # Qt's word tracking is broken with non-BMP unicode chars, the
         # start and length values are totally wrong, so track manually
-        # print(f'{repr(word)=} {idx=} {start=} {length=}, {repr(self.speaking_text[start:start+length])=}')
         key = word, start, length
         # Qt sometimes repeats words with windows modern engine at least, for a test case see https://bugs.launchpad.net/calibre/+bug/2080708
         if self.last_spoken_word == key:
             return
         self.last_spoken_word = key
         idx = self.speaking_text.find(word, self.last_word_offset)
+        # print(f'{self.last_word_offset=} {idx=}')
         if idx > -1:
             self.saying.emit(idx, len(word))
             self.last_word_offset = idx + len(word)
 
     def _state_changed(self, state: QTextToSpeech.State) -> None:
+        if self.ignore_tracking_until_state_changes_to_speaking and state is QTextToSpeech.State.Speaking:
+            self.ignore_tracking_until_state_changes_to_speaking = False
         self.state_changed.emit(state)