feat(tts): add karaoke-style TTS progress bar to assistant UI

Add rendering and support for a karaoke-style text-to-speech progress bar in assistant messages so users can see the currently spoken sentence and highlighted word during playback.

- Append the TTS karaoke bar to AssistantMessageWidget when the message is the active TTS target and playback is speaking/paused/loading.
- Implement _buildKaraokeBar to render the active sentence with a highlighted word span, using ConduitCard and theme styles.
- Import conduit_components for shared UI primitives.
- Extend TextToSpeechState with sentence data: sentences, sentenceOffsets, activeSentenceIndex, and per-word progress (wordStartInSentence, wordEndInSentence).
- Add provider callback wiring: onSentenceIndex and onDeviceWordProgress handlers (hooked into the TTS backend).
- Prepare sentence splitting and word-progress plumbing in the TTS provider (prepares data used to drive the karaoke display).

This change improves UX by visually indicating the spoken sentence and current word during TTS playback, aiding comprehension and accessibility.
2025-10-23 17:05:35 +05:30
parent 8ec411d6aa
commit 56246507de
3 changed files with 176 additions and 1 deletion
@@ -15,6 +15,11 @@ class TextToSpeechState {
  final TtsPlaybackStatus status;
  final String? activeMessageId;
  final String? errorMessage;
+  final List<String> sentences;
+  final List<int> sentenceOffsets; // start indices in full text
+  final int activeSentenceIndex; // -1 when none
+  final int? wordStartInSentence; // nullable; only for on-device
+  final int? wordEndInSentence; // nullable; only for on-device

  const TextToSpeechState({
    this.initialized = false,
@@ -22,6 +27,11 @@ class TextToSpeechState {
    this.status = TtsPlaybackStatus.idle,
    this.activeMessageId,
    this.errorMessage,
+    this.sentences = const [],
+    this.sentenceOffsets = const [],
+    this.activeSentenceIndex = -1,
+    this.wordStartInSentence,
+    this.wordEndInSentence,
  });

  bool get isSpeaking => status == TtsPlaybackStatus.speaking;
@@ -37,6 +47,12 @@ class TextToSpeechState {
    bool clearActiveMessageId = false,
    String? errorMessage,
    bool clearErrorMessage = false,
+    List<String>? sentences,
+    List<int>? sentenceOffsets,
+    int? activeSentenceIndex,
+    bool clearWord = false,
+    int? wordStartInSentence,
+    int? wordEndInSentence,
  }) {
    return TextToSpeechState(
      initialized: initialized ?? this.initialized,
@@ -48,6 +64,15 @@ class TextToSpeechState {
      errorMessage: clearErrorMessage
          ? null
          : errorMessage ?? this.errorMessage,
+      sentences: sentences ?? this.sentences,
+      sentenceOffsets: sentenceOffsets ?? this.sentenceOffsets,
+      activeSentenceIndex: activeSentenceIndex ?? this.activeSentenceIndex,
+      wordStartInSentence: clearWord
+          ? null
+          : (wordStartInSentence ?? this.wordStartInSentence),
+      wordEndInSentence: clearWord
+          ? null
+          : (wordEndInSentence ?? this.wordEndInSentence),
    );
  }
 }
@@ -70,6 +95,8 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
        onPause: _handlePause,
        onContinue: _handleContinue,
        onError: _handleError,
+        onSentenceIndex: _handleSentenceIndex,
+        onDeviceWordProgress: _handleDeviceWordProgress,
      );

      ref.onDispose(() {
@@ -184,15 +211,23 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
      return;
    }

+    // Prepare sentence split for highlighting
+    final cleanText = MarkdownToText.convert(text);
+    final sentences = _splitForTts(cleanText);
+    final offsets = _computeOffsets(sentences);
+
    state = state.copyWith(
      status: TtsPlaybackStatus.loading,
      activeMessageId: messageId,
      clearErrorMessage: true,
+      sentences: sentences,
+      sentenceOffsets: offsets,
+      activeSentenceIndex: sentences.isEmpty ? -1 : 0,
+      clearWord: true,
    );

    try {
      // Convert markdown to clean text for TTS
-      final cleanText = MarkdownToText.convert(text);
      if (cleanText.isEmpty) {
        // No speakable content
        if (!ref.mounted) {
@@ -224,6 +259,34 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
    }
  }

+  List<String> _splitForTts(String text) { // Splits [text] into trimmed, sentence-like chunks.
+    final normalized = text.replaceAll(RegExp(r"\s+"), ' ').trim(); // collapse each whitespace run to one space
+    if (normalized.isEmpty) return const [];
+    final parts = <String>[];
+    final sentenceRegex = RegExp(r"(.+?[\.!?]+)(\s+|\$)"); // NOTE(review): in a raw string, \$ matches a literal dollar sign, not end of input
+    int index = 0; // end of the last regex match; text beyond it is the unterminated tail
+    for (final match in sentenceRegex.allMatches('$normalized ')) { // appended space lets a final terminated sentence match the \s+ branch
+      final s = match.group(1) ?? '';
+      if (s.trim().isNotEmpty) parts.add(s.trim());
+      index = match.end;
+    }
+    if (index < normalized.length) { // leftover text with no closing . ! or ?
+      final tail = normalized.substring(index).trim();
+      if (tail.isNotEmpty) parts.add(tail);
+    }
+    return parts;
+  }
+
+  List<int> _computeOffsets(List<String> sentences) { // Returns a running start offset for each entry of [sentences].
+    final offsets = <int>[];
+    int acc = 0; // start offset of the next sentence
+    for (final s in sentences) {
+      offsets.add(acc);
+      acc += s.length + 1; // assumes exactly one separator character between sentences; TODO confirm against the offsets the TTS engine reports
+    }
+    return offsets;
+  }
+
  Future<void> pause() async {
    if (!state.initialized || !state.available) {
      return;
@@ -294,6 +357,41 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
      clearActiveMessageId: true,
    );
  }
+
+  void _handleSentenceIndex(int index) { // Stores [index] as the active sentence and resets the word highlight.
+    if (!ref.mounted) return; // skip the state write when ref is no longer mounted
+    final clamped = index.clamp( // bounds: -1 up to the last sentence index, or exactly -1 when there are no sentences
+      -1,
+      state.sentences.isEmpty ? -1 : state.sentences.length - 1,
+    );
+    state = state.copyWith(
+      activeSentenceIndex: clamped,
+      // Reset the per-word highlight on every sentence-index report (server or device).
+      clearWord: true,
+    );
+  }
+
+  void _handleDeviceWordProgress(int start, int end) { // Converts a full-text word range into a sentence-relative range in state.
+    if (!ref.mounted) return;
+    // Find the sentence whose offset range contains `start`.
+    final offsets = state.sentenceOffsets;
+    if (offsets.isEmpty) return; // no sentence data to map against
+    int idx = 0; // stays 0 when no range below contains start
+    for (var i = 0; i < offsets.length; i++) {
+      final sStart = offsets[i];
+      final sEnd = i + 1 < offsets.length ? offsets[i + 1] : 1 << 30; // last sentence uses a large sentinel as its upper bound
+      if (start >= sStart && start < sEnd) {
+        idx = i;
+        break;
+      }
+    }
+    final sentenceStart = offsets[idx];
+    state = state.copyWith(
+      activeSentenceIndex: idx,
+      wordStartInSentence: (start - sentenceStart).clamp(0, 1 << 20), // never negative; capped at 1 << 20
+      wordEndInSentence: (end - sentenceStart).clamp(0, 1 << 20), // NOTE(review): not clamped to the sentence length
+    );
+  }
 }

 final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {