feat(tts): add karaoke-style TTS progress bar to assistant UI
Add rendering and support for a karaoke-style text-to-speech progress bar in assistant messages so users can see the currently spoken sentence and highlighted word during playback. - Append TTS karaoke bar to AssistantMessageWidget when the message is the active TTS target and playback is speaking/paused/loading. - Implement _buildKaraokeBar to render the active sentence with a highlighted word span, using ConduitCard and theme styles. - Import conduit_components for shared UI primitives. - Extend TextToSpeechState with sentence data: sentences, sentenceOffsets, activeSentenceIndex, and per-word progress (wordStartInSentence, wordEndInSentence). - Add provider callbacks wiring: onSentenceIndex and onDeviceWordProgress handlers (hooked into TTS backend). - Prepare sentence splitting and word-progress plumbing in the TTS provider (prepares data used to drive the karaoke display). This change improves UX by visually indicating the spoken sentence and current word during TTS playback, aiding comprehension and accessibility.
This commit is contained in:
@@ -15,6 +15,11 @@ class TextToSpeechState {
|
|||||||
final TtsPlaybackStatus status;
|
final TtsPlaybackStatus status;
|
||||||
final String? activeMessageId;
|
final String? activeMessageId;
|
||||||
final String? errorMessage;
|
final String? errorMessage;
|
||||||
|
final List<String> sentences;
|
||||||
|
final List<int> sentenceOffsets; // start indices in full text
|
||||||
|
final int activeSentenceIndex; // -1 when none
|
||||||
|
final int? wordStartInSentence; // nullable; only for on-device
|
||||||
|
final int? wordEndInSentence; // nullable; only for on-device
|
||||||
|
|
||||||
const TextToSpeechState({
|
const TextToSpeechState({
|
||||||
this.initialized = false,
|
this.initialized = false,
|
||||||
@@ -22,6 +27,11 @@ class TextToSpeechState {
|
|||||||
this.status = TtsPlaybackStatus.idle,
|
this.status = TtsPlaybackStatus.idle,
|
||||||
this.activeMessageId,
|
this.activeMessageId,
|
||||||
this.errorMessage,
|
this.errorMessage,
|
||||||
|
this.sentences = const [],
|
||||||
|
this.sentenceOffsets = const [],
|
||||||
|
this.activeSentenceIndex = -1,
|
||||||
|
this.wordStartInSentence,
|
||||||
|
this.wordEndInSentence,
|
||||||
});
|
});
|
||||||
|
|
||||||
bool get isSpeaking => status == TtsPlaybackStatus.speaking;
|
bool get isSpeaking => status == TtsPlaybackStatus.speaking;
|
||||||
@@ -37,6 +47,12 @@ class TextToSpeechState {
|
|||||||
bool clearActiveMessageId = false,
|
bool clearActiveMessageId = false,
|
||||||
String? errorMessage,
|
String? errorMessage,
|
||||||
bool clearErrorMessage = false,
|
bool clearErrorMessage = false,
|
||||||
|
List<String>? sentences,
|
||||||
|
List<int>? sentenceOffsets,
|
||||||
|
int? activeSentenceIndex,
|
||||||
|
bool clearWord = false,
|
||||||
|
int? wordStartInSentence,
|
||||||
|
int? wordEndInSentence,
|
||||||
}) {
|
}) {
|
||||||
return TextToSpeechState(
|
return TextToSpeechState(
|
||||||
initialized: initialized ?? this.initialized,
|
initialized: initialized ?? this.initialized,
|
||||||
@@ -48,6 +64,15 @@ class TextToSpeechState {
|
|||||||
errorMessage: clearErrorMessage
|
errorMessage: clearErrorMessage
|
||||||
? null
|
? null
|
||||||
: errorMessage ?? this.errorMessage,
|
: errorMessage ?? this.errorMessage,
|
||||||
|
sentences: sentences ?? this.sentences,
|
||||||
|
sentenceOffsets: sentenceOffsets ?? this.sentenceOffsets,
|
||||||
|
activeSentenceIndex: activeSentenceIndex ?? this.activeSentenceIndex,
|
||||||
|
wordStartInSentence: clearWord
|
||||||
|
? null
|
||||||
|
: (wordStartInSentence ?? this.wordStartInSentence),
|
||||||
|
wordEndInSentence: clearWord
|
||||||
|
? null
|
||||||
|
: (wordEndInSentence ?? this.wordEndInSentence),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -70,6 +95,8 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
|
|||||||
onPause: _handlePause,
|
onPause: _handlePause,
|
||||||
onContinue: _handleContinue,
|
onContinue: _handleContinue,
|
||||||
onError: _handleError,
|
onError: _handleError,
|
||||||
|
onSentenceIndex: _handleSentenceIndex,
|
||||||
|
onDeviceWordProgress: _handleDeviceWordProgress,
|
||||||
);
|
);
|
||||||
|
|
||||||
ref.onDispose(() {
|
ref.onDispose(() {
|
||||||
@@ -184,15 +211,23 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prepare sentence split for highlighting
|
||||||
|
final cleanText = MarkdownToText.convert(text);
|
||||||
|
final sentences = _splitForTts(cleanText);
|
||||||
|
final offsets = _computeOffsets(sentences);
|
||||||
|
|
||||||
state = state.copyWith(
|
state = state.copyWith(
|
||||||
status: TtsPlaybackStatus.loading,
|
status: TtsPlaybackStatus.loading,
|
||||||
activeMessageId: messageId,
|
activeMessageId: messageId,
|
||||||
clearErrorMessage: true,
|
clearErrorMessage: true,
|
||||||
|
sentences: sentences,
|
||||||
|
sentenceOffsets: offsets,
|
||||||
|
activeSentenceIndex: sentences.isEmpty ? -1 : 0,
|
||||||
|
clearWord: true,
|
||||||
);
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Convert markdown to clean text for TTS
|
// Convert markdown to clean text for TTS
|
||||||
final cleanText = MarkdownToText.convert(text);
|
|
||||||
if (cleanText.isEmpty) {
|
if (cleanText.isEmpty) {
|
||||||
// No speakable content
|
// No speakable content
|
||||||
if (!ref.mounted) {
|
if (!ref.mounted) {
|
||||||
@@ -224,6 +259,34 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Splits [text] into sentence-sized chunks for the karaoke highlight.
///
/// Runs of whitespace are collapsed to a single space first, so every
/// returned sentence is trimmed. A sentence ends at a run of `.`, `!` or `?`
/// that is followed by whitespace or by the end of the text. Any trailing
/// text without terminal punctuation is returned as a final chunk.
///
/// Returns an empty list when [text] has no non-whitespace content.
List<String> _splitForTts(String text) {
  final normalized = text.replaceAll(RegExp(r'\s+'), ' ').trim();
  if (normalized.isEmpty) return const [];

  // `$` must stay unescaped here: inside a raw string `\$` matches a literal
  // dollar sign, which would split "5.$3" into two sentences and drop the `$`.
  final sentenceRegex = RegExp(r'(.+?[.!?]+)(\s+|$)');
  final parts = <String>[];
  var consumed = 0;
  for (final match in sentenceRegex.allMatches(normalized)) {
    final sentence = match.group(1)?.trim() ?? '';
    if (sentence.isNotEmpty) parts.add(sentence);
    consumed = match.end;
  }

  // Whatever follows the last terminator is kept as one final chunk.
  final tail = normalized.substring(consumed).trim();
  if (tail.isNotEmpty) parts.add(tail);
  return parts;
}
|
||||||
|
|
||||||
|
/// Returns the start index of each entry of [sentences] within the text
/// obtained by laying the sentences end to end with exactly one separator
/// character between neighbours.
///
/// The first offset is always 0; an empty input yields an empty list.
List<int> _computeOffsets(List<String> sentences) {
  var nextStart = 0;
  return sentences.map((sentence) {
    final start = nextStart;
    // One extra character accounts for the separator after each sentence.
    nextStart = start + sentence.length + 1;
    return start;
  }).toList();
}
|
||||||
|
|
||||||
Future<void> pause() async {
|
Future<void> pause() async {
|
||||||
if (!state.initialized || !state.available) {
|
if (!state.initialized || !state.available) {
|
||||||
return;
|
return;
|
||||||
@@ -294,6 +357,41 @@ class TextToSpeechController extends Notifier<TextToSpeechState> {
|
|||||||
clearActiveMessageId: true,
|
clearActiveMessageId: true,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Records which sentence is currently being spoken.
///
/// [index] is limited to the range `-1 .. sentences.length - 1`, where -1
/// means "no active sentence". Any per-word highlight is cleared because it
/// belonged to the previously active sentence.
void _handleSentenceIndex(int index) {
  if (!ref.mounted) return;

  // For an empty sentence list this is already -1, i.e. "none".
  final lastIndex = state.sentences.length - 1;
  state = state.copyWith(
    activeSentenceIndex: index.clamp(-1, lastIndex),
    clearWord: true,
  );
}
|
||||||
|
|
||||||
|
/// Translates a word-progress report into a sentence-relative highlight.
///
/// [start] and [end] are character offsets of the word as reported by the
/// progress callback. They are matched against the stored sentence start
/// offsets to pick the active sentence, then rebased to that sentence.
/// Does nothing when no sentence offsets are stored.
// NOTE(review): assumes the reported offsets index the same text that the
// sentence offsets were computed from — confirm against the speak path.
void _handleDeviceWordProgress(int start, int end) {
  if (!ref.mounted) return;

  final starts = state.sentenceOffsets;
  if (starts.isEmpty) return;

  // First sentence whose [start, nextStart) range contains the word; the last
  // sentence is open-ended. Falls back to sentence 0 when nothing matches.
  final sentenceIdx = Iterable<int>.generate(starts.length).firstWhere(
    (i) {
      final upper = i + 1 < starts.length ? starts[i + 1] : 1 << 30;
      return start >= starts[i] && start < upper;
    },
    orElse: () => 0,
  );

  final base = starts[sentenceIdx];
  final relativeStart = (start - base).clamp(0, 1 << 20);
  final relativeEnd = (end - base).clamp(0, 1 << 20);
  state = state.copyWith(
    activeSentenceIndex: sentenceIdx,
    wordStartInSentence: relativeStart,
    wordEndInSentence: relativeEnd,
  );
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {
|
final textToSpeechServiceProvider = Provider<TextToSpeechService>((ref) {
|
||||||
|
|||||||
@@ -31,6 +31,8 @@ class TextToSpeechService {
|
|||||||
VoidCallback? _onPause;
|
VoidCallback? _onPause;
|
||||||
VoidCallback? _onContinue;
|
VoidCallback? _onContinue;
|
||||||
void Function(String message)? _onError;
|
void Function(String message)? _onError;
|
||||||
|
void Function(int sentenceIndex)? _onSentenceIndex;
|
||||||
|
void Function(int start, int end)? _onDeviceWordProgress;
|
||||||
|
|
||||||
bool get isInitialized => _initialized;
|
bool get isInitialized => _initialized;
|
||||||
bool get isAvailable => _available;
|
bool get isAvailable => _available;
|
||||||
@@ -51,6 +53,8 @@ class TextToSpeechService {
|
|||||||
VoidCallback? onPause,
|
VoidCallback? onPause,
|
||||||
VoidCallback? onContinue,
|
VoidCallback? onContinue,
|
||||||
void Function(String message)? onError,
|
void Function(String message)? onError,
|
||||||
|
void Function(int sentenceIndex)? onSentenceIndex,
|
||||||
|
void Function(int start, int end)? onDeviceWordProgress,
|
||||||
}) {
|
}) {
|
||||||
_onStart = onStart;
|
_onStart = onStart;
|
||||||
_onComplete = onComplete;
|
_onComplete = onComplete;
|
||||||
@@ -58,6 +62,8 @@ class TextToSpeechService {
|
|||||||
_onPause = onPause;
|
_onPause = onPause;
|
||||||
_onContinue = onContinue;
|
_onContinue = onContinue;
|
||||||
_onError = onError;
|
_onError = onError;
|
||||||
|
_onSentenceIndex = onSentenceIndex;
|
||||||
|
_onDeviceWordProgress = onDeviceWordProgress;
|
||||||
|
|
||||||
_tts.setStartHandler(_handleStart);
|
_tts.setStartHandler(_handleStart);
|
||||||
_tts.setCompletionHandler(_handleComplete);
|
_tts.setCompletionHandler(_handleComplete);
|
||||||
@@ -65,6 +71,13 @@ class TextToSpeechService {
|
|||||||
_tts.setPauseHandler(_handlePause);
|
_tts.setPauseHandler(_handlePause);
|
||||||
_tts.setContinueHandler(_handleContinue);
|
_tts.setContinueHandler(_handleContinue);
|
||||||
_tts.setErrorHandler(_handleError);
|
_tts.setErrorHandler(_handleError);
|
||||||
|
try {
|
||||||
|
_tts.setProgressHandler((String text, int start, int end, String word) {
|
||||||
|
_onDeviceWordProgress?.call(start, end);
|
||||||
|
});
|
||||||
|
} catch (_) {
|
||||||
|
// Some platforms may not support progress handler
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initialize the native TTS engine lazily
|
/// Initialize the native TTS engine lazily
|
||||||
@@ -151,6 +164,7 @@ class TextToSpeechService {
|
|||||||
if (result is int && result != 1) {
|
if (result is int && result != 1) {
|
||||||
_onError?.call('Text-to-speech engine returned code $result');
|
_onError?.call('Text-to-speech engine returned code $result');
|
||||||
}
|
}
|
||||||
|
_onSentenceIndex?.call(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
Future<void> pause() async {
|
Future<void> pause() async {
|
||||||
@@ -370,6 +384,7 @@ class TextToSpeechService {
|
|||||||
_buffered.add(Uint8List.fromList(firstBytes));
|
_buffered.add(Uint8List.fromList(firstBytes));
|
||||||
_currentIndex = 0;
|
_currentIndex = 0;
|
||||||
await _player.play(BytesSource(_buffered.first));
|
await _player.play(BytesSource(_buffered.first));
|
||||||
|
_onSentenceIndex?.call(0);
|
||||||
|
|
||||||
// Prefetch the rest in background
|
// Prefetch the rest in background
|
||||||
unawaited(
|
unawaited(
|
||||||
@@ -438,6 +453,7 @@ class TextToSpeechService {
|
|||||||
_currentIndex = nextIndex;
|
_currentIndex = nextIndex;
|
||||||
final bytes = _buffered[nextIndex];
|
final bytes = _buffered[nextIndex];
|
||||||
await _player.play(BytesSource(bytes));
|
await _player.play(BytesSource(bytes));
|
||||||
|
_onSentenceIndex?.call(_currentIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<String> _splitForTts(String text) {
|
List<String> _splitForTts(String text) {
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ import 'package:conduit/l10n/app_localizations.dart';
|
|||||||
import 'enhanced_attachment.dart';
|
import 'enhanced_attachment.dart';
|
||||||
import 'package:conduit/shared/widgets/chat_action_button.dart';
|
import 'package:conduit/shared/widgets/chat_action_button.dart';
|
||||||
import '../../../shared/widgets/model_avatar.dart';
|
import '../../../shared/widgets/model_avatar.dart';
|
||||||
|
import '../../../shared/widgets/conduit_components.dart';
|
||||||
import 'package:url_launcher/url_launcher_string.dart';
|
import 'package:url_launcher/url_launcher_string.dart';
|
||||||
import '../providers/chat_providers.dart' show sendMessageWithContainer;
|
import '../providers/chat_providers.dart' show sendMessageWithContainer;
|
||||||
import '../../../core/utils/debug_logger.dart';
|
import '../../../core/utils/debug_logger.dart';
|
||||||
@@ -457,12 +458,72 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (children.isEmpty) return const SizedBox.shrink();
|
if (children.isEmpty) return const SizedBox.shrink();
|
||||||
|
// Append TTS karaoke bar if this is the active message
|
||||||
|
final ttsState = ref.watch(textToSpeechControllerProvider);
|
||||||
|
final isActive =
|
||||||
|
ttsState.activeMessageId == _messageId &&
|
||||||
|
(ttsState.status == TtsPlaybackStatus.speaking ||
|
||||||
|
ttsState.status == TtsPlaybackStatus.paused ||
|
||||||
|
ttsState.status == TtsPlaybackStatus.loading);
|
||||||
|
if (isActive && ttsState.activeSentenceIndex >= 0) {
|
||||||
|
children.add(const SizedBox(height: Spacing.sm));
|
||||||
|
children.add(_buildKaraokeBar(ttsState));
|
||||||
|
}
|
||||||
|
|
||||||
return Column(
|
return Column(
|
||||||
crossAxisAlignment: CrossAxisAlignment.start,
|
crossAxisAlignment: CrossAxisAlignment.start,
|
||||||
children: children,
|
children: children,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the karaoke-style bar showing the sentence currently being spoken.
///
/// Renders the sentence at [TextToSpeechState.activeSentenceIndex] inside a
/// [ConduitCard]. When [ttsState] carries a usable word range, that slice of
/// the sentence is drawn with a highlighted background and heavier weight;
/// otherwise the sentence is shown in the base style only.
///
/// Returns an empty box when the active index does not point at a sentence.
Widget _buildKaraokeBar(TextToSpeechState ttsState) {
  final idx = ttsState.activeSentenceIndex;
  if (idx < 0 || idx >= ttsState.sentences.length) {
    return const SizedBox.shrink();
  }

  final theme = context.conduitTheme;
  final sentence = ttsState.sentences[idx];
  final ws = ttsState.wordStartInSentence;
  final we = ttsState.wordEndInSentence;

  final baseStyle = TextStyle(
    color: theme.textPrimary,
    height: 1.2,
    fontSize: 14,
  );
  // Text colour is inherited from baseStyle; only the emphasis is added.
  final highlightStyle = baseStyle.copyWith(
    backgroundColor: theme.buttonPrimary.withValues(alpha: 0.25),
    fontWeight: FontWeight.w600,
  );

  InlineSpan buildSpans() {
    // No word range, or one that cannot be mapped onto this sentence:
    // show the sentence without a highlight.
    if (ws == null ||
        we == null ||
        ws < 0 ||
        we <= ws ||
        ws >= sentence.length) {
      return TextSpan(text: sentence, style: baseStyle);
    }
    // The reported end may run past the sentence; cap it so substring is safe.
    final safeEnd = we.clamp(0, sentence.length);
    final before = sentence.substring(0, ws);
    final word = sentence.substring(ws, safeEnd);
    final after = sentence.substring(safeEnd);
    return TextSpan(
      children: [
        if (before.isNotEmpty) TextSpan(text: before, style: baseStyle),
        TextSpan(text: word, style: highlightStyle),
        if (after.isNotEmpty) TextSpan(text: after, style: baseStyle),
      ],
    );
  }

  return ConduitCard(
    padding: const EdgeInsets.all(Spacing.sm),
    // Text.rich, unlike a bare RichText, applies the ambient text scaler and
    // DefaultTextStyle, so the bar follows the user's font-size setting.
    child: Text.rich(buildSpans()),
  );
}
|
||||||
|
|
||||||
bool get _shouldShowTypingIndicator =>
|
bool get _shouldShowTypingIndicator =>
|
||||||
widget.isStreaming && _isAssistantResponseEmpty;
|
widget.isStreaming && _isAssistantResponseEmpty;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user