feat(tts): add karaoke-style TTS progress bar to assistant UI

Add rendering and support for a karaoke-style text-to-speech progress bar in assistant messages so users can see the currently
spoken sentence and highlighted word during playback.

- Append TTS karaoke bar to AssistantMessageWidget when the message is
  the active TTS target and playback is speaking/paused/loading.
- Implement _buildKaraokeBar to render the active sentence with a
  highlighted word span, using ConduitCard and theme styles.
- Import conduit_components for shared UI primitives.
- Extend TextToSpeechState with sentence data:
  sentences, sentenceOffsets, activeSentenceIndex, and per-word
  progress (wordStartInSentence, wordEndInSentence).
- Add provider callbacks wiring: onSentenceIndex and
  onDeviceWordProgress handlers (hooked into TTS backend).
- Prepare sentence splitting and word-progress plumbing in the TTS
  provider (prepares data used to drive the karaoke display).

This change improves UX by visually indicating the spoken sentence
and current word during TTS playback, aiding comprehension and
accessibility.
This commit is contained in:
cogwheel0
2025-10-23 17:05:35 +05:30
parent 8ec411d6aa
commit 56246507de
3 changed files with 176 additions and 1 deletions

View File

@@ -15,6 +15,11 @@ class TextToSpeechState {
final TtsPlaybackStatus status;
final String? activeMessageId;
final String? errorMessage;
final List<String> sentences;
final List<int> sentenceOffsets; // start indices in full text
final int activeSentenceIndex; // -1 when none
final int? wordStartInSentence; // nullable; only for on-device
final int? wordEndInSentence; // nullable; only for on-device
const TextToSpeechState({
this.initialized = false,
@@ -22,6 +27,11 @@ class TextToSpeechState {
this.status = TtsPlaybackStatus.idle,
this.activeMessageId,
this.errorMessage,
this.sentences = const [],
this.sentenceOffsets = const [],
this.activeSentenceIndex = -1,
this.wordStartInSentence,
this.wordEndInSentence,
});
bool get isSpeaking => status == TtsPlaybackStatus.speaking;
@@ -37,6 +47,12 @@ class TextToSpeechState {
bool clearActiveMessageId = false,
String? errorMessage,
bool clearErrorMessage = false,
List<String>? sentences,
List<int>? sentenceOffsets,
int? activeSentenceIndex,
bool clearWord = false,
int? wordStartInSentence,
int? wordEndInSentence,
}) {
return TextToSpeechState(
initialized: initialized ?? this.initialized,
@@ -48,6 +64,15 @@ class TextToSpeechState {
errorMessage: clearErrorMessage
? null
: errorMessage ?? this.errorMessage,
sentences: sentences ?? this.sentences,
sentenceOffsets: sentenceOffsets ?? this.sentenceOffsets,
activeSentenceIndex: activeSentenceIndex ?? this.activeSentenceIndex,
wordStartInSentence: clearWord
? null
: (wordStartInSentence ?? this.wordStartInSentence),
wordEndInSentence: clearWord
? null
: (wordEndInSentence ?? this.wordEndInSentence),
);
}
}
@@ -70,6 +95,8 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
onPause: _handlePause,
onContinue: _handleContinue,
onError: _handleError,
onSentenceIndex: _handleSentenceIndex,
onDeviceWordProgress: _handleDeviceWordProgress,
);
ref.onDispose(() {
@@ -184,15 +211,23 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
return;
}
// Prepare sentence split for highlighting
final cleanText = MarkdownToText.convert(text);
final sentences = _splitForTts(cleanText);
final offsets = _computeOffsets(sentences);
state = state.copyWith(
status: TtsPlaybackStatus.loading,
activeMessageId: messageId,
clearErrorMessage: true,
sentences: sentences,
sentenceOffsets: offsets,
activeSentenceIndex: sentences.isEmpty ? -1 : 0,
clearWord: true,
);
try {
// Convert markdown to clean text for TTS
final cleanText = MarkdownToText.convert(text);
if (cleanText.isEmpty) {
// No speakable content
if (!ref.mounted) {
@@ -224,6 +259,34 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
}
}
/// Splits [text] into sentence-like chunks used for TTS highlighting.
///
/// Runs of whitespace are first collapsed to single spaces and the result is
/// trimmed. A chunk ends at one or more of `.`, `!` or `?` that is followed
/// by whitespace or the end of the text. Any trailing text without terminal
/// punctuation becomes a final chunk of its own.
///
/// Returns an empty list when [text] contains no non-whitespace characters.
List<String> _splitForTts(String text) {
  final normalized = text.replaceAll(RegExp(r'\s+'), ' ').trim();
  if (normalized.isEmpty) return const [];

  // `$` here must be the end-of-input anchor. Escaping it (`\$`) inside a raw
  // string makes the pattern match a literal dollar sign instead, which would
  // split "5.$3" after the period and silently drop the `$` character.
  final sentenceRegex = RegExp(r'(.+?[.!?]+)(?:\s+|$)');

  final parts = <String>[];
  var consumed = 0;
  for (final match in sentenceRegex.allMatches(normalized)) {
    final sentence = (match.group(1) ?? '').trim();
    if (sentence.isNotEmpty) parts.add(sentence);
    consumed = match.end;
  }

  // Whatever follows the last terminated sentence has no closing punctuation;
  // keep it as the final chunk so no text is lost.
  if (consumed < normalized.length) {
    final tail = normalized.substring(consumed).trim();
    if (tail.isNotEmpty) parts.add(tail);
  }
  return parts;
}
/// Returns the start offset of each entry in [sentences].
///
/// The first offset is 0; every following offset is the previous offset plus
/// the previous sentence's length plus one, i.e. a single separator character
/// is assumed between consecutive sentences.
List<int> _computeOffsets(List<String> sentences) {
  final starts = List<int>.filled(sentences.length, 0, growable: true);
  for (var i = 1; i < sentences.length; i++) {
    final previous = sentences[i - 1];
    // +1 accounts for the assumed single separator after each sentence.
    starts[i] = starts[i - 1] + previous.length + 1;
  }
  return starts;
}
Future<void> pause() async {
if (!state.initialized || !state.available) {
return;
@@ -294,6 +357,41 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
clearActiveMessageId: true,
);
}
/// Updates the active sentence to [index].
///
/// The value is limited to the range from -1 up to the last index of
/// `state.sentences` (which is -1 when there are no sentences). The per-word
/// highlight is always cleared, since a sentence switch invalidates it.
/// Does nothing when `ref` is no longer mounted.
void _handleSentenceIndex(int index) {
  if (!ref.mounted) return;

  // For an empty list this is -1, so the allowed range collapses to {-1}.
  final lastIndex = state.sentences.length - 1;

  final int target;
  if (index < -1) {
    target = -1;
  } else if (index > lastIndex) {
    target = lastIndex;
  } else {
    target = index;
  }

  state = state.copyWith(activeSentenceIndex: target, clearWord: true);
}
/// Translates a word range given as [start]/[end] offsets into a sentence
/// index plus sentence-relative word bounds, and stores them in `state`.
///
/// The sentence is the first one whose span, from its own offset up to the
/// next sentence's offset, contains [start]; the last sentence is treated as
/// open-ended. When no span matches, the first sentence is used. Does nothing
/// when `ref` is no longer mounted or no sentence offsets are known.
void _handleDeviceWordProgress(int start, int end) {
  if (!ref.mounted) return;

  final starts = state.sentenceOffsets;
  if (starts.isEmpty) return;

  // Stand-in upper bound for the last sentence, which has no successor.
  const openEnd = 1 << 30;
  // Upper limit applied to the sentence-relative word bounds.
  const maxRelative = 1 << 20;

  var sentence = 0;
  var cursor = 0;
  while (cursor < starts.length) {
    final isLast = cursor == starts.length - 1;
    final upper = isLast ? openEnd : starts[cursor + 1];
    if (starts[cursor] <= start && start < upper) {
      sentence = cursor;
      break;
    }
    cursor++;
  }

  final base = starts[sentence];
  final relativeStart = (start - base).clamp(0, maxRelative);
  final relativeEnd = (end - base).clamp(0, maxRelative);
  state = state.copyWith(
    activeSentenceIndex: sentence,
    wordStartInSentence: relativeStart,
    wordEndInSentence: relativeEnd,
  );
}
}
final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {