diff --git a/android/app/src/main/AndroidManifest.xml b/android/app/src/main/AndroidManifest.xml
index c3b95a5..38a833e 100644
--- a/android/app/src/main/AndroidManifest.xml
+++ b/android/app/src/main/AndroidManifest.xml
@@ -2,6 +2,7 @@
+
diff --git a/ios/Flutter/AppFrameworkInfo.plist b/ios/Flutter/AppFrameworkInfo.plist
index 1dc6cf7..b2a56aa 100644
--- a/ios/Flutter/AppFrameworkInfo.plist
+++ b/ios/Flutter/AppFrameworkInfo.plist
@@ -21,6 +21,6 @@
CFBundleVersion
1.0
MinimumOSVersion
- 13.0
+ 15.1
diff --git a/ios/Podfile b/ios/Podfile
index e3b3517..24026fa 100644
--- a/ios/Podfile
+++ b/ios/Podfile
@@ -1,5 +1,5 @@
# Uncomment this line to define a global platform for your project
-platform :ios, '13.0'
+platform :ios, '15.1'
# CocoaPods analytics sends network stats synchronously affecting flutter build latency.
ENV['COCOAPODS_DISABLE_STATS'] = 'true'
diff --git a/ios/Podfile.lock b/ios/Podfile.lock
index 42cde87..d56263b 100644
--- a/ios/Podfile.lock
+++ b/ios/Podfile.lock
@@ -49,13 +49,18 @@ PODS:
- Flutter
- image_picker_ios (0.0.1):
- Flutter
- - mic_stream_recorder (0.0.1):
- - Flutter
+ - onnxruntime-c (1.22.0)
+ - onnxruntime-objc (1.22.0):
+ - onnxruntime-objc/Core (= 1.22.0)
+ - onnxruntime-objc/Core (1.22.0):
+ - onnxruntime-c (= 1.22.0)
- package_info_plus (0.4.5):
- Flutter
- path_provider_foundation (0.0.1):
- Flutter
- FlutterMacOS
+ - record_ios (1.1.0):
+ - Flutter
- SDWebImage (5.21.1):
- SDWebImage/Core (= 5.21.1)
- SDWebImage/Core (5.21.1)
@@ -80,6 +85,9 @@ PODS:
- SwiftyGif (5.4.5)
- url_launcher_ios (0.0.1):
- Flutter
+ - vad (0.0.6):
+ - Flutter
+ - onnxruntime-objc (= 1.22.0)
- wakelock_plus (0.0.1):
- Flutter
- webview_flutter_wkwebview (0.0.1):
@@ -96,9 +104,9 @@ DEPENDENCIES:
- flutter_secure_storage (from `.symlinks/plugins/flutter_secure_storage/ios`)
- flutter_tts (from `.symlinks/plugins/flutter_tts/ios`)
- image_picker_ios (from `.symlinks/plugins/image_picker_ios/ios`)
- - mic_stream_recorder (from `.symlinks/plugins/mic_stream_recorder/ios`)
- package_info_plus (from `.symlinks/plugins/package_info_plus/ios`)
- path_provider_foundation (from `.symlinks/plugins/path_provider_foundation/darwin`)
+ - record_ios (from `.symlinks/plugins/record_ios/ios`)
- share_handler_ios (from `.symlinks/plugins/share_handler_ios/ios`)
- share_handler_ios_models (from `.symlinks/plugins/share_handler_ios/ios/Models`)
- share_plus (from `.symlinks/plugins/share_plus/ios`)
@@ -106,6 +114,7 @@ DEPENDENCIES:
- sqflite_darwin (from `.symlinks/plugins/sqflite_darwin/darwin`)
- stts (from `.symlinks/plugins/stts/ios`)
- url_launcher_ios (from `.symlinks/plugins/url_launcher_ios/ios`)
+ - vad (from `.symlinks/plugins/vad/ios`)
- wakelock_plus (from `.symlinks/plugins/wakelock_plus/ios`)
- webview_flutter_wkwebview (from `.symlinks/plugins/webview_flutter_wkwebview/darwin`)
@@ -113,6 +122,8 @@ SPEC REPOS:
trunk:
- DKImagePickerController
- DKPhotoGallery
+ - onnxruntime-c
+ - onnxruntime-objc
- SDWebImage
- SwiftyGif
@@ -135,12 +146,12 @@ EXTERNAL SOURCES:
:path: ".symlinks/plugins/flutter_tts/ios"
image_picker_ios:
:path: ".symlinks/plugins/image_picker_ios/ios"
- mic_stream_recorder:
- :path: ".symlinks/plugins/mic_stream_recorder/ios"
package_info_plus:
:path: ".symlinks/plugins/package_info_plus/ios"
path_provider_foundation:
:path: ".symlinks/plugins/path_provider_foundation/darwin"
+ record_ios:
+ :path: ".symlinks/plugins/record_ios/ios"
share_handler_ios:
:path: ".symlinks/plugins/share_handler_ios/ios"
share_handler_ios_models:
@@ -155,6 +166,8 @@ EXTERNAL SOURCES:
:path: ".symlinks/plugins/stts/ios"
url_launcher_ios:
:path: ".symlinks/plugins/url_launcher_ios/ios"
+ vad:
+ :path: ".symlinks/plugins/vad/ios"
wakelock_plus:
:path: ".symlinks/plugins/wakelock_plus/ios"
webview_flutter_wkwebview:
@@ -172,9 +185,11 @@ SPEC CHECKSUMS:
flutter_secure_storage: 1ed9476fba7e7a782b22888f956cce43e2c62f13
flutter_tts: b88dbc8655d3dc961bc4a796e4e16a4cc1795833
image_picker_ios: 7fe1ff8e34c1790d6fff70a32484959f563a928a
- mic_stream_recorder: 27d2d1225563a3a28bf4019fc5cc198cffd7dad1
+ onnxruntime-c: 7f778680e96145956c0a31945f260321eed2611a
+ onnxruntime-objc: 83d28b87525bd971259a66e153ea32b5d023de19
package_info_plus: af8e2ca6888548050f16fa2f1938db7b5a5df499
path_provider_foundation: 080d55be775b7414fd5a5ef3ac137b97b097e564
+ record_ios: f75fa1d57f840012775c0e93a38a7f3ceea1a374
SDWebImage: f29024626962457f3470184232766516dee8dfea
share_handler_ios: e2244e990f826b2c8eaa291ac3831569438ba0fb
share_handler_ios_models: fc638c9b4330dc7f082586c92aee9dfa0b87b871
@@ -184,9 +199,10 @@ SPEC CHECKSUMS:
stts: 1a48df645bb516e86e4121d5253b582749a1d3a6
SwiftyGif: 706c60cf65fa2bc5ee0313beece843c8eb8194d4
url_launcher_ios: 694010445543906933d732453a59da0a173ae33d
+ vad: 7934867589afe53567f492df66fb1615f2185822
wakelock_plus: e29112ab3ef0b318e58cfa5c32326458be66b556
webview_flutter_wkwebview: 8ebf4fded22593026f7dbff1fbff31ea98573c8d
-PODFILE CHECKSUM: df88575cf61e98a1a3edf2f8c887dad2c18c2079
+PODFILE CHECKSUM: a6ecbec6401c6461e69650e9ef66360aee70610f
COCOAPODS: 1.16.2
diff --git a/ios/Runner.xcodeproj/project.pbxproj b/ios/Runner.xcodeproj/project.pbxproj
index 90360a7..7f9d99d 100644
--- a/ios/Runner.xcodeproj/project.pbxproj
+++ b/ios/Runner.xcodeproj/project.pbxproj
@@ -585,7 +585,7 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
- IPHONEOS_DEPLOYMENT_TARGET = 13.0;
+ IPHONEOS_DEPLOYMENT_TARGET = 15.1;
MTL_ENABLE_DEBUG_INFO = NO;
SDKROOT = iphoneos;
SUPPORTED_PLATFORMS = iphoneos;
@@ -722,7 +722,7 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
- IPHONEOS_DEPLOYMENT_TARGET = 13.0;
+ IPHONEOS_DEPLOYMENT_TARGET = 15.1;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos;
@@ -773,7 +773,7 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
- IPHONEOS_DEPLOYMENT_TARGET = 13.0;
+ IPHONEOS_DEPLOYMENT_TARGET = 15.1;
MTL_ENABLE_DEBUG_INFO = NO;
SDKROOT = iphoneos;
SUPPORTED_PLATFORMS = iphoneos;
@@ -865,7 +865,7 @@
INFOPLIST_FILE = ShareExtension/Info.plist;
INFOPLIST_KEY_CFBundleDisplayName = ShareExtension;
INFOPLIST_KEY_NSHumanReadableCopyright = "";
- IPHONEOS_DEPLOYMENT_TARGET = 14.0;
+ IPHONEOS_DEPLOYMENT_TARGET = 15.1;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
@@ -908,7 +908,7 @@
INFOPLIST_FILE = ShareExtension/Info.plist;
INFOPLIST_KEY_CFBundleDisplayName = ShareExtension;
INFOPLIST_KEY_NSHumanReadableCopyright = "";
- IPHONEOS_DEPLOYMENT_TARGET = 14.0;
+ IPHONEOS_DEPLOYMENT_TARGET = 15.1;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
@@ -948,7 +948,7 @@
INFOPLIST_FILE = ShareExtension/Info.plist;
INFOPLIST_KEY_CFBundleDisplayName = ShareExtension;
INFOPLIST_KEY_NSHumanReadableCopyright = "";
- IPHONEOS_DEPLOYMENT_TARGET = 14.0;
+ IPHONEOS_DEPLOYMENT_TARGET = 15.1;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
diff --git a/lib/features/chat/services/text_to_speech_service.dart b/lib/features/chat/services/text_to_speech_service.dart
index 7c8f4b4..0fe7240 100644
--- a/lib/features/chat/services/text_to_speech_service.dart
+++ b/lib/features/chat/services/text_to_speech_service.dart
@@ -11,6 +11,13 @@ import '../../../core/services/settings_service.dart';
typedef _SpeechChunk = ({Uint8List bytes, String mimeType});
+/// Immutable value object describing one synthesized audio segment.
+///
+/// Pairs the raw audio [bytes] with the [mimeType] they are encoded in.
+class SpeechAudioChunk {
+  /// Encoded audio payload.
+  final Uint8List bytes;
+
+  /// MIME type describing how [bytes] is encoded.
+  final String mimeType;
+
+  const SpeechAudioChunk({required this.bytes, required this.mimeType});
+}
+
/// Lightweight wrapper around FlutterTts to centralize configuration
class TextToSpeechService {
final FlutterTts _tts = FlutterTts();
@@ -45,6 +52,7 @@ class TextToSpeechService {
bool get isAvailable => _available;
bool get deviceEngineAvailable => _deviceEngineAvailable;
bool get serverEngineAvailable => _api != null;
+ bool get prefersServerEngine => _shouldUseServer();
TextToSpeechService({ApiService? api}) : _api = api {
// Wire minimal player events to callbacks
@@ -277,6 +285,29 @@ class TextToSpeechService {
_onSentenceIndex?.call(0);
}
+ /// Synthesizes [text] through the API client and returns the audio.
+ ///
+ /// Runs [initialize] first when the service has not been initialized yet,
+ /// passing the currently stored voice and engine preferences.
+ ///
+ /// Throws [ArgumentError] when [text] is blank and [StateError] when no API
+ /// client is configured.
+ Future synthesizeServerSpeechChunk(String text) async {
+   if (text.trim().isEmpty) {
+     throw ArgumentError('Cannot synthesize empty text');
+   }
+   final api = _api;
+   if (api == null) {
+     throw StateError('Server text-to-speech is unavailable');
+   }
+   if (!_initialized) {
+     await initialize(
+       deviceVoice: _preferredVoice,
+       serverVoice: _serverPreferredVoice,
+       engine: _engine,
+     );
+   }
+   final resolvedVoice = await _resolveServerVoice();
+   final generated = await api.generateSpeech(
+     text: text,
+     voice: resolvedVoice,
+     speed: _speechRate,
+   );
+   return SpeechAudioChunk(
+     bytes: generated.bytes,
+     mimeType: generated.mimeType,
+   );
+ }
+
Future pause() async {
if (!_initialized) return;
try {
@@ -572,6 +603,15 @@ class TextToSpeechService {
}
}
+ /// Best-effort warm-up of the server default voice lookup.
+ ///
+ /// Does nothing when no API client is configured. Any failure of the lookup
+ /// is ignored.
+ Future preloadServerDefaults() async {
+   if (_api != null) {
+     try {
+       await _getServerDefaultVoice();
+     } catch (_) {
+       // Intentionally ignored: preloading is optional.
+     }
+   }
+ }
+
// ===== Server chunked playback =====
Future _startServerChunkedPlayback(String text) async {
diff --git a/lib/features/chat/services/voice_call_service.dart b/lib/features/chat/services/voice_call_service.dart
index 1fb4123..fa32b90 100644
--- a/lib/features/chat/services/voice_call_service.dart
+++ b/lib/features/chat/services/voice_call_service.dart
@@ -1,5 +1,7 @@
import 'dart:async';
+import 'dart:collection';
+import 'package:audioplayers/audioplayers.dart';
import 'package:riverpod_annotation/riverpod_annotation.dart';
import 'package:wakelock_plus/wakelock_plus.dart';
@@ -49,6 +51,18 @@ class VoiceCallService {
final Set _pauseReasons = {};
SocketEventSubscription? _socketSubscription;
Timer? _keepAliveTimer;
+ final ListQueue _speechQueue = ListQueue();
+ int _enqueuedSentenceCount = 0;
+ String? _activeAssistantMessageId;
+ bool _responseCompleted = false;
+ bool _listeningSuspendedForSpeech = false;
+ final Map _serverAudioBuffer = {};
+ final AudioPlayer _serverAudioPlayer = AudioPlayer();
+ int _serverAudioSession = 0;
+ int _pendingServerAudioFetches = 0;
+ bool _serverPipelineActive = false;
+ int _nextServerChunkId = 0;
+ int _nextServerPlaybackId = 0;
final StreamController _stateController =
StreamController.broadcast();
@@ -75,6 +89,12 @@ class VoiceCallService {
// sentence/word callbacks are not required for call UI, but harmless
);
+ _serverAudioPlayer.onPlayerComplete.listen((_) {
+ _handleServerAudioComplete();
+ });
+
+ unawaited(_tts.preloadServerDefaults());
+
// Set up notification action handler
_notificationService.onActionPressed = _handleNotificationAction;
}
@@ -197,6 +217,13 @@ class VoiceCallService {
if (_isDisposed) return;
try {
+ _speechQueue.clear();
+ _enqueuedSentenceCount = 0;
+ _activeAssistantMessageId = null;
+ _responseCompleted = false;
+ _listeningSuspendedForSpeech = false;
+ _resetServerAudio(stopPlayback: true);
+
if (_pauseReasons.isNotEmpty) {
_listeningPaused = true;
if (_state != VoiceCallState.paused) {
@@ -276,6 +303,14 @@ class VoiceCallService {
String _accumulatedResponse = '';
bool _isSpeaking = false;
+ /// Whether speech is currently playing or still waiting to be played.
+ ///
+ /// With the server pipeline active this looks at buffered audio and
+ /// in-flight fetches; otherwise it looks at the local speech queue.
+ bool get _hasPendingSpeech {
+   if (_isSpeaking) {
+     return true;
+   }
+   if (!_serverPipelineActive) {
+     return _speechQueue.isNotEmpty;
+   }
+   return _serverAudioBuffer.isNotEmpty || _pendingServerAudioFetches > 0;
+ }
void _handleSocketEvent(
Map event,
@@ -284,18 +319,32 @@ class VoiceCallService {
if (_isDisposed) return;
final outerData = event['data'];
+ final messageId = event['message_id']?.toString();
if (outerData is Map) {
final eventType = outerData['type']?.toString();
final innerData = outerData['data'];
if (eventType == 'chat:completion' && innerData is Map) {
+ final bool doneFlag = innerData['done'] == true;
+ if (messageId != null && messageId.isNotEmpty) {
+ _handleAssistantMessageStart(messageId);
+ }
+
// Handle full content replacement (used by some models/backends)
if (innerData.containsKey('content')) {
final content = innerData['content']?.toString() ?? '';
if (content.isNotEmpty) {
_accumulatedResponse = content;
_responseController.add(content);
+ _processSpeakableSegments(isFinalChunk: doneFlag);
+ if (doneFlag) {
+ _responseCompleted = true;
+ _maybeResumeListeningAfterSpeech();
+ }
+ } else if (doneFlag) {
+ _responseCompleted = true;
+ _maybeResumeListeningAfterSpeech();
}
}
@@ -313,61 +362,248 @@ class VoiceCallService {
if (deltaContent.isNotEmpty) {
_accumulatedResponse += deltaContent;
_responseController.add(_accumulatedResponse);
+ _processSpeakableSegments(isFinalChunk: false);
}
}
// Check for completion
- if (finishReason == 'stop') {
- if (_accumulatedResponse.isNotEmpty && !_isSpeaking) {
- _speakResponse(_accumulatedResponse);
- _accumulatedResponse = '';
- } else if (_accumulatedResponse.isEmpty) {
- // No response, restart listening unless paused
- if (_pauseReasons.isEmpty) {
- _startListening();
- } else if (_state != VoiceCallState.paused) {
- _updateState(VoiceCallState.paused);
- }
- }
+ if (finishReason == 'stop' || finishReason == 'length') {
+ _responseCompleted = true;
+ _processSpeakableSegments(isFinalChunk: true);
+ _maybeResumeListeningAfterSpeech();
}
}
}
+
+ if (doneFlag && !_responseCompleted) {
+ _responseCompleted = true;
+ _processSpeakableSegments(isFinalChunk: true);
+ _maybeResumeListeningAfterSpeech();
+ }
}
}
}
- Future _speakResponse(String response) async {
- if (_isDisposed || _isSpeaking) return;
+ /// Resets per-response state when events for a new assistant message arrive.
+ ///
+ /// Repeated events for the message that is already active are ignored. For a
+ /// new [messageId] the accumulated text, the speech queue and the server
+ /// audio pipeline are cleared, and any speech still playing is stopped.
+ void _handleAssistantMessageStart(String messageId) {
+   if (_activeAssistantMessageId == messageId) {
+     return;
+   }
+   // Capture this before the reset below: _resetServerAudio(stopPlayback:
+   // true) clears _isSpeaking, which would otherwise hide the fact that the
+   // TTS engine is still talking and must be stopped.
+   final wasSpeaking = _isSpeaking;
+   _activeAssistantMessageId = messageId;
+   _accumulatedResponse = '';
+   _responseController.add('');
+   _speechQueue.clear();
+   _enqueuedSentenceCount = 0;
+   _responseCompleted = false;
+   _resetServerAudio(stopPlayback: true);
+   if (wasSpeaking) {
+     _isSpeaking = false;
+     unawaited(_tts.stop());
+   }
+ }
- try {
- _isSpeaking = true;
+ /// Enqueues the speakable segments of the accumulated response that have
+ /// not been enqueued yet.
+ ///
+ /// The accumulated response is converted with [MarkdownToText.convert] and
+ /// split with `_tts.splitTextForSpeech`. Unless [isFinalChunk] is true, the
+ /// last segment is held back and not enqueued by this call.
+ void _processSpeakableSegments({required bool isFinalChunk}) {
+ if (_isDisposed) return;
+ final cleanText = MarkdownToText.convert(_accumulatedResponse).trim();
+ if (cleanText.isEmpty) {
+ return;
+ }
- // Stop listening before speaking
- await _voiceInput.stopListening();
- await _transcriptSubscription?.cancel();
- await _intensitySubscription?.cancel();
+ final segments = _tts.splitTextForSpeech(cleanText);
+ if (segments.isEmpty) {
+ return;
+ }
- _updateState(VoiceCallState.speaking);
+ // Number of leading segments that may be enqueued on this call.
+ var availableCount = segments.length;
+ if (!isFinalChunk && availableCount > 0) {
+ availableCount -= 1;
+ }
+ if (availableCount < 0) {
+ availableCount = 0;
+ }
- // Convert markdown to clean text for TTS
- final cleanText = MarkdownToText.convert(response);
- if (cleanText.isEmpty) {
- // No speakable content, restart listening
- _isSpeaking = false;
- await _startListening();
- return;
+ // The split can yield fewer segments than were already counted (the
+ // response text may be replaced wholesale); pull the counter back.
+ if (_enqueuedSentenceCount > availableCount) {
+ _enqueuedSentenceCount = availableCount;
+ }
+
+ if (availableCount > _enqueuedSentenceCount) {
+ final newChunks = segments.sublist(
+ _enqueuedSentenceCount,
+ availableCount,
+ );
+ _enqueuedSentenceCount = availableCount;
+ for (final chunk in newChunks) {
+ _enqueueSpeechChunk(chunk);
}
+ }
- await _tts.speak(cleanText);
- // After speaking completes, _handleTtsComplete will restart listening
+ // NOTE(review): with isFinalChunk the block above already raises the
+ // counter to segments.length, so this branch looks unreachable - confirm.
+ if (isFinalChunk && _enqueuedSentenceCount < segments.length) {
+ _enqueuedSentenceCount = segments.length;
+ _enqueueSpeechChunk(segments.last);
+ }
+ }
+
+ /// Hands one speakable [chunk] to the active playback path.
+ ///
+ /// Blank chunks are dropped, as is everything that arrives while muted or
+ /// after disposal. When the TTS service prefers the server engine the chunk
+ /// is given the next chunk id and prefetched; otherwise it is appended to
+ /// the local queue and playback is started if nothing is speaking.
+ void _enqueueSpeechChunk(String chunk) {
+   if (_isDisposed) return;
+   final text = chunk.trim();
+   if (text.isEmpty || _isMuted) {
+     // Muted calls skip playback entirely.
+     return;
+   }
+   if (!_tts.prefersServerEngine) {
+     _speechQueue.add(text);
+     if (!_isSpeaking) {
+       unawaited(_startNextSpeechChunk());
+     }
+     return;
+   }
+   _serverPipelineActive = true;
+   final chunkId = _nextServerChunkId;
+   _nextServerChunkId += 1;
+   _prefetchServerAudio(text, chunkId);
+ }
+
+ /// Speaks the next queued chunk with the TTS service.
+ ///
+ /// Does nothing when disposed, muted, already speaking, or when the queue
+ /// is empty. On failure the state switches to error and listening is
+ /// restarted.
+ Future _startNextSpeechChunk() async {
+   if (_isDisposed) return;
+   if (_speechQueue.isEmpty || _isSpeaking || _isMuted) {
+     return;
+   }
+
+   final next = _speechQueue.removeFirst();
+   // Claim the speaking slot before the first await. Chunks are enqueued
+   // back-to-back in one synchronous loop; if the flag were only set after
+   // the await, every enqueue would start its own utterance and they could
+   // overlap or play out of order.
+   _isSpeaking = true;
+   try {
+     await _prepareForSpeechPlayback();
+     // The flag is cleared by cancel/reset paths; if that happened while
+     // listening was being suspended, this chunk must no longer be spoken.
+     if (_isDisposed || !_isSpeaking) {
+       return;
+     }
+     _updateState(VoiceCallState.speaking);
+     await _tts.speak(next);
} catch (e) {
_isSpeaking = false;
_updateState(VoiceCallState.error);
- // Restart listening even if TTS fails
- await _startListening();
+     unawaited(_startListening());
}
}
+ /// Requests server audio for [chunk] and buffers it under [chunkId].
+ ///
+ /// The fetch is tied to the server audio session that is current when it is
+ /// issued. If the pipeline is reset before the fetch settles, its result or
+ /// error is discarded and it no longer touches the pending-fetch counter,
+ /// which the reset has already zeroed for the new session.
+ void _prefetchServerAudio(String chunk, int chunkId) {
+   if (_isDisposed) {
+     return;
+   }
+   final session = _serverAudioSession;
+   _pendingServerAudioFetches++;
+
+   // Guards against counting the same fetch twice, e.g. when the success
+   // handler throws and the error handler runs afterwards.
+   var settled = false;
+   void settle() {
+     if (settled) return;
+     settled = true;
+     // Only fetches of the current session are reflected in the counter.
+     if (session == _serverAudioSession && _pendingServerAudioFetches > 0) {
+       _pendingServerAudioFetches--;
+     }
+   }
+
+   _tts
+       .synthesizeServerSpeechChunk(chunk)
+       .then((audioChunk) {
+         settle();
+         if (_isDisposed ||
+             !_serverPipelineActive ||
+             session != _serverAudioSession) {
+           return;
+         }
+         _serverAudioBuffer[chunkId] = audioChunk;
+         _maybeStartServerAudio();
+       })
+       .catchError((error, _) {
+         settle();
+         // A failure of a superseded fetch must not tear down the session
+         // that replaced it.
+         if (_isDisposed || session != _serverAudioSession) {
+           return;
+         }
+         _handleTtsError(error.toString());
+       });
+ }
+
+ /// Starts playback of the next server chunk if it is buffered and nothing
+ /// prevents playback.
+ ///
+ /// Chunks are taken strictly in chunk-id order: a buffered chunk with a
+ /// later id stays in the buffer until its predecessors have been played.
+ void _maybeStartServerAudio() {
+   final blocked =
+       _isDisposed || !_serverPipelineActive || _isSpeaking || _isMuted;
+   if (blocked) {
+     return;
+   }
+   final nextChunk = _serverAudioBuffer.remove(_nextServerPlaybackId);
+   if (nextChunk != null) {
+     _nextServerPlaybackId += 1;
+     unawaited(_playServerAudioChunk(nextChunk));
+   }
+ }
+
+ /// Plays one server-synthesized [chunk] on the server audio player.
+ ///
+ /// Listening is suspended first. If the server audio pipeline is reset
+ /// while that is in progress, the chunk is dropped instead of being played.
+ Future _playServerAudioChunk(SpeechAudioChunk chunk) async {
+   final session = _serverAudioSession;
+   // Claim the speaking slot before the first await so that a fetch
+   // completing in the meantime cannot start a second chunk in parallel.
+   _isSpeaking = true;
+   try {
+     await _prepareForSpeechPlayback();
+     if (_isDisposed || session != _serverAudioSession) {
+       // A reset happened during the await; it already cleared the state.
+       return;
+     }
+     _updateState(VoiceCallState.speaking);
+     await _serverAudioPlayer.play(
+       BytesSource(chunk.bytes, mimeType: chunk.mimeType),
+     );
+   } catch (e) {
+     if (session != _serverAudioSession) {
+       // Failure of a superseded chunk; leave the newer session untouched.
+       return;
+     }
+     _isSpeaking = false;
+     _handleTtsError(e.toString());
+   }
+ }
+
+ /// Reacts to the server audio player finishing a chunk.
+ ///
+ /// Continues with the next chunk when it is already buffered; otherwise
+ /// checks whether listening can be resumed.
+ void _handleServerAudioComplete() {
+   if (_isDisposed) return;
+   _isSpeaking = false;
+   final nextIsBuffered =
+       _serverAudioBuffer.containsKey(_nextServerPlaybackId);
+   if (nextIsBuffered) {
+     _maybeStartServerAudio();
+   } else {
+     _maybeResumeListeningAfterSpeech();
+   }
+ }
+
+ /// Drops all buffered and in-flight server audio state.
+ ///
+ /// The session counter is advanced so that fetches issued earlier can tell
+ /// that their result is no longer wanted. With [stopPlayback] the server
+ /// audio player is stopped as well and the speaking flag is cleared.
+ void _resetServerAudio({bool stopPlayback = false}) {
+   _serverAudioSession += 1;
+   _pendingServerAudioFetches = 0;
+   _nextServerChunkId = 0;
+   _nextServerPlaybackId = 0;
+   _serverAudioBuffer.clear();
+   if (stopPlayback) {
+     _isSpeaking = false;
+     unawaited(_serverAudioPlayer.stop());
+   }
+   _serverPipelineActive = false;
+ }
+
+ /// Suspends voice input before speech playback starts.
+ ///
+ /// Stops listening and cancels the transcript and intensity subscriptions.
+ /// Only the first call after the suspension flag was cleared does any work;
+ /// later calls return immediately.
+ Future _prepareForSpeechPlayback() async {
+   if (!_listeningSuspendedForSpeech) {
+     _listeningSuspendedForSpeech = true;
+     await _voiceInput.stopListening();
+
+     await _transcriptSubscription?.cancel();
+     _transcriptSubscription = null;
+
+     await _intensitySubscription?.cancel();
+     _intensitySubscription = null;
+   }
+ }
+
+ /// Resumes listening once the response is complete and all speech is done.
+ ///
+ /// Does nothing while the response is still streaming or while speech is
+ /// playing, queued, buffered or being fetched. If pause reasons are active
+ /// the call moves to the paused state instead of listening again.
+ void _maybeResumeListeningAfterSpeech() {
+   // _hasPendingSpeech already accounts for in-flight server fetches, so no
+   // separate pending-fetch check is needed further down.
+   if (!_responseCompleted || _hasPendingSpeech) {
+     return;
+   }
+
+   if (_pauseReasons.isNotEmpty) {
+     _listeningPaused = true;
+     if (_state != VoiceCallState.paused) {
+       _updateState(VoiceCallState.paused);
+     }
+     return;
+   }
+
+   unawaited(_startListening());
+ }
+
void _handleTtsStart() {
if (_isDisposed) return;
_updateState(VoiceCallState.speaking);
@@ -376,17 +612,19 @@ class VoiceCallService {
+ /// Called when the TTS service reports that an utterance has finished.
+ ///
+ /// Clears the speaking flag, then either starts the next queued chunk or
+ /// checks whether listening can be resumed.
void _handleTtsComplete() {
if (_isDisposed) return;
_isSpeaking = false;
- // After assistant finishes speaking, resume only if not paused
- if (_pauseReasons.isNotEmpty) {
- _listeningPaused = true;
- _updateState(VoiceCallState.paused);
+ // More chunks are waiting: keep speaking before resuming listening.
+ if (_speechQueue.isNotEmpty) {
+ unawaited(_startNextSpeechChunk());
return;
}
- _startListening();
+ _maybeResumeListeningAfterSpeech();
}
void _handleTtsError(String error) {
if (_isDisposed) return;
+ _isSpeaking = false;
+ _speechQueue.clear();
+ _resetServerAudio(stopPlayback: true);
+ _listeningSuspendedForSpeech = false;
_updateState(VoiceCallState.error);
// Try to recover by restarting listening
_startListening();
@@ -405,6 +643,7 @@ class VoiceCallService {
await _voiceInput.stopListening();
await _tts.stop();
+ await _serverAudioPlayer.stop();
await BackgroundStreamingHandler.instance.stopBackgroundExecution(const [
_voiceCallStreamId,
@@ -421,6 +660,13 @@ class VoiceCallService {
_isMuted = false;
_listeningPaused = false;
_pauseReasons.clear();
+ _speechQueue.clear();
+ _enqueuedSentenceCount = 0;
+ _responseCompleted = false;
+ _listeningSuspendedForSpeech = false;
+ _activeAssistantMessageId = null;
+ _isSpeaking = false;
+ _resetServerAudio(stopPlayback: true);
_updateState(VoiceCallState.disconnected);
}
@@ -462,6 +708,11 @@ class VoiceCallService {
Future cancelSpeaking() async {
if (_isDisposed) return;
+ _speechQueue.clear();
+ _enqueuedSentenceCount = 0;
+ _responseCompleted = false;
+ _listeningSuspendedForSpeech = false;
+ _resetServerAudio(stopPlayback: true);
await _tts.stop();
_isSpeaking = false;
_accumulatedResponse = '';
@@ -527,6 +778,11 @@ class VoiceCallService {
_isSpeaking = false;
_accumulatedResponse = '';
}
+ _speechQueue.clear();
+ _enqueuedSentenceCount = 0;
+ _responseCompleted = false;
+ _listeningSuspendedForSpeech = false;
+ _resetServerAudio(stopPlayback: true);
pauseListening(reason: VoiceCallPauseReason.mute);
} else {
resumeListening(reason: VoiceCallPauseReason.mute);
@@ -547,6 +803,7 @@ class VoiceCallService {
_voiceInput.dispose();
await _tts.dispose();
+ await _serverAudioPlayer.dispose();
// Cancel notification
await _notificationService.cancelNotification();
diff --git a/lib/features/chat/services/voice_input_service.dart b/lib/features/chat/services/voice_input_service.dart
index c1990aa..df7e2fe 100644
--- a/lib/features/chat/services/voice_input_service.dart
+++ b/lib/features/chat/services/voice_input_service.dart
@@ -1,13 +1,13 @@
import 'dart:async';
-import 'dart:io' show File, Platform;
+import 'dart:convert';
+import 'dart:io' show Platform;
+import 'dart:typed_data';
import 'package:flutter/widgets.dart';
import 'package:flutter_riverpod/flutter_riverpod.dart';
import 'package:riverpod_annotation/riverpod_annotation.dart';
-import 'package:mic_stream_recorder/mic_stream_recorder.dart';
import 'package:stts/stts.dart';
-import 'package:path/path.dart' as p;
-import 'package:path_provider/path_provider.dart';
+import 'package:vad/vad.dart';
import '../../../core/providers/app_providers.dart';
import '../../../core/services/api_service.dart';
@@ -23,7 +23,10 @@ class LocaleName {
}
class VoiceInputService {
- final MicStreamRecorder _recorder = MicStreamRecorder();
+ static const int _vadSampleRate = 16000;
+ static const int _vadFrameSamples = 1536;
+
+ final VadHandler _vadHandler = VadHandler.create();
final Stt _speech = Stt();
final ApiService? _api;
final Ref? _ref;
@@ -41,17 +44,17 @@ class VoiceInputService {
_intensityController?.stream ?? const Stream.empty();
int _lastIntensity = 0;
Timer? _intensityDecayTimer;
- Timer? _silenceTimer;
- bool _hasDetectedSpeech = false;
- int _amplitudeCallbackCount = 0;
- Timer? _amplitudeFallbackTimer;
+ List? _vadPendingSamples;
Stream get textStream =>
_textStreamController?.stream ?? const Stream.empty();
Timer? _autoStopTimer;
- StreamSubscription? _ampSub;
StreamSubscription? _sttResultSub;
StreamSubscription? _sttStateSub;
+ StreamSubscription>? _vadSpeechEndSub;
+ StreamSubscription<({double isSpeech, double notSpeech, List frame})>?
+ _vadFrameSub;
+ StreamSubscription? _vadErrorSub;
bool get isSupportedPlatform => Platform.isAndroid || Platform.isIOS;
bool get hasServerStt => _api != null;
@@ -60,9 +63,7 @@ class VoiceInputService {
bool get prefersServerOnly => _preference == SttPreference.serverOnly;
bool get prefersDeviceOnly => _preference == SttPreference.deviceOnly;
- VoiceInputService({ApiService? api, Ref? ref})
- : _api = api,
- _ref = ref;
+ VoiceInputService({ApiService? api, Ref? ref}) : _api = api, _ref = ref;
void updatePreference(SttPreference preference) {
_preference = preference;
@@ -327,33 +328,27 @@ class VoiceInputService {
_autoStopTimer?.cancel();
_autoStopTimer = null;
- _silenceTimer?.cancel();
- _silenceTimer = null;
-
- _amplitudeFallbackTimer?.cancel();
- _amplitudeFallbackTimer = null;
-
if (_usingServerStt) {
- await _finalizeServerRecording();
+ await _stopVadRecording();
+ final samples = _vadPendingSamples;
+ _vadPendingSamples = null;
+ if (samples != null && samples.isNotEmpty) {
+ await _processVadSamples(samples);
+ }
} else {
await _stopLocalStt();
+ if (_currentText.isNotEmpty) {
+ _textStreamController?.add(_currentText);
+ }
}
- await _ampSub?.cancel();
- _ampSub = null;
-
_intensityDecayTimer?.cancel();
_intensityDecayTimer = null;
_lastIntensity = 0;
- if (!_usingServerStt && _currentText.isNotEmpty) {
- _textStreamController?.add(_currentText);
- }
-
await _closeControllers();
_usingServerStt = false;
- _hasDetectedSpeech = false;
}
Future _stopLocalStt() async {
@@ -411,82 +406,100 @@ class VoiceInputService {
}
+ /// Starts microphone capture through the VAD handler for server-side
+ /// transcription.
+ ///
+ /// Start-up errors are added to the text stream and then rethrown.
Future _startServerRecording() async {
- final path = await _createRecordingPath();
- _hasDetectedSpeech = false;
+ // Attach the VAD stream listeners before capture starts.
+ await _setupVadStreams();
+ // Silence window from the app settings, falling back to 2000 ms when no
+ // settings are available, expressed in VAD frames.
+ final settings = _ref?.read(appSettingsProvider);
+ final silenceMs = settings?.voiceSilenceDuration ?? 2000;
+ final redemptionFrames = _silenceDurationToFrames(silenceMs);
+ // End padding is a quarter of the redemption window, but at least 1 frame.
+ final endPadFrames = redemptionFrames > 4
+ ? (redemptionFrames / 4).round().clamp(1, redemptionFrames)
+ : 1;
- await _recorder.startRecording(path);
+ try {
+ await _vadHandler.startListening(
+ frameSamples: _vadFrameSamples,
+ redemptionFrames: redemptionFrames,
+ endSpeechPadFrames: endPadFrames,
+ preSpeechPadFrames: 2,
+ minSpeechFrames: 3,
+ submitUserSpeechOnPause: true,
+ recordConfig: const RecordConfig(
+ encoder: AudioEncoder.pcm16bits,
+ sampleRate: _vadSampleRate,
+ numChannels: 1,
+ bitRate: 16,
+ echoCancel: true,
+ autoGain: true,
+ noiseSuppress: true,
+ androidConfig: AndroidRecordConfig(
+ audioSource: AndroidAudioSource.voiceCommunication,
+ audioManagerMode: AudioManagerMode.modeInCommunication,
+ speakerphone: true,
+ manageBluetooth: true,
+ useLegacy: false,
+ ),
+ ),
+ );
+ } catch (error) {
+ // Surface the failure to stream listeners as well as to the caller.
+ _textStreamController?.addError(error);
+ rethrow;
+ }
+ }
- await _ampSub?.cancel();
- _amplitudeFallbackTimer?.cancel();
- _amplitudeCallbackCount = 0;
+ /// (Re)subscribes to the VAD handler's speech-end, frame and error streams.
+ ///
+ /// Any existing subscription is cancelled before a new one is created.
+ Future _setupVadStreams() async {
+ await _vadSpeechEndSub?.cancel();
+ // Speech end: keep the captured samples and stop listening; the samples
+ // are stored in _vadPendingSamples for later processing.
+ _vadSpeechEndSub = _vadHandler.onSpeechEnd.listen((samples) {
+ if (!_isListening || !_usingServerStt) return;
+ if (samples.isEmpty) return;
+ _vadPendingSamples = samples;
+ if (_isListening) {
+ unawaited(_stopListening());
+ }
+ });
- _ampSub = _recorder.amplitudeStream.listen((amplitude) {
- _amplitudeCallbackCount++;
+ await _vadFrameSub?.cancel();
+ // Each processed frame updates the intensity value exposed to listeners.
+ _vadFrameSub = _vadHandler.onFrameProcessed.listen((frameData) {
if (!_isListening) return;
-
- _lastIntensity = _normalizedToIntensity(amplitude);
+ final intensity = _intensityFromVadFrame(frameData.frame);
+ _lastIntensity = intensity;
try {
_intensityController?.add(_lastIntensity);
} catch (_) {}
-
- _handleServerAmplitude(amplitude);
});
- _amplitudeFallbackTimer = Timer(const Duration(seconds: 1), () {
- if (_amplitudeCallbackCount == 0) {
- _silenceTimer = Timer(const Duration(seconds: 15), () {
- if (_isListening && _usingServerStt) {
- unawaited(_stopListening());
- }
- });
+ await _vadErrorSub?.cancel();
+ // VAD errors are forwarded to the text stream and end the session.
+ _vadErrorSub = _vadHandler.onError.listen((message) {
+ _textStreamController?.addError(Exception(message));
+ if (_isListening) {
+ unawaited(_stopListening());
}
});
}
- void _handleServerAmplitude(double amplitude) {
- if (!_usingServerStt || !_isListening) return;
-
- const double speechThreshold = 0.55;
- if (amplitude.isNaN || amplitude.isInfinite) return;
-
- if (amplitude > speechThreshold) {
- _hasDetectedSpeech = true;
- _silenceTimer?.cancel();
- _silenceTimer = null;
- } else if (_hasDetectedSpeech && _silenceTimer == null) {
- final silenceDuration = _ref?.read(appSettingsProvider).voiceSilenceDuration ?? 2000;
- _silenceTimer = Timer(Duration(milliseconds: silenceDuration), () {
- if (_isListening && _usingServerStt) {
- unawaited(_stopListening());
- }
- });
- }
+ /// Stops VAD capture and cancels all VAD stream subscriptions.
+ ///
+ /// Errors thrown while stopping the handler are ignored so that the
+ /// subscriptions are always cancelled afterwards.
+ Future _stopVadRecording() async {
+ try {
+ await _vadHandler.stopListening();
+ } catch (_) {}
+ // Cancel and clear each subscription in turn.
+ await _vadSpeechEndSub?.cancel();
+ _vadSpeechEndSub = null;
+ await _vadFrameSub?.cancel();
+ _vadFrameSub = null;
+ await _vadErrorSub?.cancel();
+ _vadErrorSub = null;
}
- Future _createRecordingPath() async {
- final directory = await getTemporaryDirectory();
- final timestamp = DateTime.now().millisecondsSinceEpoch;
- final fileName = 'conduit_voice_$timestamp.m4a';
- return p.join(directory.path, fileName);
- }
-
- Future _finalizeServerRecording() async {
+ Future _processVadSamples(List samples) async {
final api = _api;
if (api == null) return;
- final path = await _recorder.stopRecording();
- if (path == null || path.isEmpty) return;
-
- final file = File(path);
try {
- if (!await file.exists()) return;
- final bytes = await file.readAsBytes();
- if (bytes.isEmpty) return;
+ final wavBytes = _samplesToWav(samples);
+ final fileName =
+ 'conduit_voice_${DateTime.now().millisecondsSinceEpoch}.wav';
final response = await api.transcribeSpeech(
- audioBytes: bytes,
- fileName: p.basename(path),
- mimeType: 'audio/mp4',
+ audioBytes: wavBytes,
+ fileName: fileName,
+ mimeType: 'audio/wav',
language: _languageForServer(),
);
@@ -499,19 +512,72 @@ class VoiceInputService {
}
} catch (error) {
_textStreamController?.addError(error);
- } finally {
- unawaited(_cleanupRecordingFile(file));
}
}
- Future _cleanupRecordingFile(File file) async {
- try {
- if (await file.exists()) {
- await file.delete();
- }
- } catch (_) {}
+ /// Converts a silence window of [milliseconds] into a number of VAD frames.
+ ///
+ /// The frame length follows from the configured frame size and sample
+ /// rate; the result is limited to the range 4..50.
+ int _silenceDurationToFrames(int milliseconds) {
+   final double msPerFrame = (_vadFrameSamples / _vadSampleRate) * 1000;
+   final int frameCount = (milliseconds / msPerFrame).round();
+   return frameCount.clamp(4, 50);
}
+ /// Maps the peak absolute sample of [frame] to an intensity from 0 to 10.
+ ///
+ /// An empty frame yields 0.
+ int _intensityFromVadFrame(List frame) {
+   double loudest = 0;
+   for (var i = 0; i < frame.length; i++) {
+     final magnitude = frame[i].abs();
+     if (magnitude > loudest) {
+       loudest = magnitude;
+     }
+   }
+   // Scale by 12, then cap at 10.
+   final level = (loudest * 12).round();
+   return level.clamp(0, 10);
+ }
+
+ /// Encodes [samples] as a mono 16-bit PCM WAV file.
+ ///
+ /// Samples are clamped to the range -1.0..1.0 before conversion and a NaN
+ /// sample is written as silence. The sample rate written to the header is
+ /// the VAD sample rate. Returns an empty list when [samples] is empty.
+ Uint8List _samplesToWav(List samples) {
+   if (samples.isEmpty) {
+     return Uint8List(0);
+   }
+
+   const bytesPerSample = 2;
+   const numChannels = 1;
+   final dataLength = samples.length * bytesPerSample;
+
+   // WAV data is little-endian by definition, so write each sample with an
+   // explicit byte order instead of relying on the host's native layout.
+   final pcm = ByteData(dataLength);
+   for (var i = 0; i < samples.length; i++) {
+     final sample = samples[i];
+     // NaN survives clamp() and would make round() throw.
+     final clamped = sample.isNaN ? 0.0 : sample.clamp(-1.0, 1.0);
+     final scaled = (clamped * 32767).round().clamp(-32768, 32767);
+     pcm.setInt16(i * bytesPerSample, scaled, Endian.little);
+   }
+
+   final byteRate = _vadSampleRate * numChannels * bytesPerSample;
+   final blockAlign = numChannels * bytesPerSample;
+
+   final builder = BytesBuilder();
+   // RIFF container header.
+   builder.add(ascii.encode('RIFF'));
+   builder.add(_int32Le(36 + dataLength));
+   builder.add(ascii.encode('WAVE'));
+   // Format chunk: 16 bytes, format tag 1 (PCM).
+   builder.add(ascii.encode('fmt '));
+   builder.add(_int32Le(16));
+   builder.add(_int16Le(1));
+   builder.add(_int16Le(numChannels));
+   builder.add(_int32Le(_vadSampleRate));
+   builder.add(_int32Le(byteRate));
+   builder.add(_int16Le(blockAlign));
+   builder.add(_int16Le(16));
+   // Data chunk.
+   builder.add(ascii.encode('data'));
+   builder.add(_int32Le(dataLength));
+   builder.add(pcm.buffer.asUint8List());
+   return builder.toBytes();
+ }
+
+ /// Returns the low 16 bits of [value] as two bytes, least significant first.
+ List _int16Le(int value) {
+   final low = value & 0xff;
+   final high = (value >> 8) & 0xff;
+   return [low, high];
+ }
+
+ /// Returns the low 32 bits of [value] as four bytes, least significant
+ /// first.
+ List _int32Le(int value) {
+   return [
+     for (var shift = 0; shift < 32; shift += 8) (value >> shift) & 0xff,
+   ];
+ }
+
String? _languageForServer() {
final locale = _selectedLocaleId;
if (locale != null && locale.isNotEmpty) {
@@ -611,11 +677,6 @@ class VoiceInputService {
return null;
}
- int _normalizedToIntensity(double value) {
- if (value.isNaN || value.isInfinite) return 0;
- return (value * 10).round().clamp(0, 10);
- }
-
Future _closeControllers() async {
if (_textStreamController != null) {
try {
@@ -647,7 +708,7 @@ class VoiceInputService {
void dispose() {
stopListening();
- _silenceTimer?.cancel();
+ unawaited(_vadHandler.dispose());
try {
_speech.dispose().catchError((_) {});
} catch (_) {}
diff --git a/pubspec.lock b/pubspec.lock
index b662398..f12e607 100644
--- a/pubspec.lock
+++ b/pubspec.lock
@@ -965,14 +965,6 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.16.0"
- mic_stream_recorder:
- dependency: "direct main"
- description:
- name: mic_stream_recorder
- sha256: "73965991ef5cc93d2b0c1e6d590cbd567a853b9ee7b2d52de43a73f185bb0d9c"
- url: "https://pub.dev"
- source: hosted
- version: "1.1.2"
mime:
dependency: transitive
description:
@@ -1173,6 +1165,70 @@ packages:
url: "https://pub.dev"
source: hosted
version: "1.5.0"
+ record:
+ dependency: transitive
+ description:
+ name: record
+ sha256: "6bad72fb3ea6708d724cf8b6c97c4e236cf9f43a52259b654efeb6fd9b737f1f"
+ url: "https://pub.dev"
+ source: hosted
+ version: "6.1.2"
+ record_android:
+ dependency: transitive
+ description:
+ name: record_android
+ sha256: fb54ee4e28f6829b8c580252a9ef49d9c549cfd263b0660ad7eeac0908658e9f
+ url: "https://pub.dev"
+ source: hosted
+ version: "1.4.4"
+ record_ios:
+ dependency: transitive
+ description:
+ name: record_ios
+ sha256: "765b42ac1be019b1674ddd809b811fc721fe5a93f7bb1da7803f0d16772fd6d7"
+ url: "https://pub.dev"
+ source: hosted
+ version: "1.1.4"
+ record_linux:
+ dependency: transitive
+ description:
+ name: record_linux
+ sha256: "235b1f1fb84e810f8149cc0c2c731d7d697f8d1c333b32cb820c449bf7bb72d8"
+ url: "https://pub.dev"
+ source: hosted
+ version: "1.2.1"
+ record_macos:
+ dependency: transitive
+ description:
+ name: record_macos
+ sha256: "842ea4b7e95f4dd237aacffc686d1b0ff4277e3e5357865f8d28cd28bc18ed95"
+ url: "https://pub.dev"
+ source: hosted
+ version: "1.1.2"
+ record_platform_interface:
+ dependency: transitive
+ description:
+ name: record_platform_interface
+ sha256: b0065fdf1ec28f5a634d676724d388a77e43ce7646fb049949f58c69f3fcb4ed
+ url: "https://pub.dev"
+ source: hosted
+ version: "1.4.0"
+ record_web:
+ dependency: transitive
+ description:
+ name: record_web
+ sha256: "20ac10d56514cb9f8cecc8f3579383084fdfb43b0d04e05a95244d0d76091d90"
+ url: "https://pub.dev"
+ source: hosted
+ version: "1.2.1"
+ record_windows:
+ dependency: transitive
+ description:
+ name: record_windows
+ sha256: "223258060a1d25c62bae18282c16783f28581ec19401d17e56b5205b9f039d78"
+ url: "https://pub.dev"
+ source: hosted
+ version: "1.0.7"
riverpod:
dependency: transitive
description:
@@ -1682,6 +1738,14 @@ packages:
url: "https://pub.dev"
source: hosted
version: "4.5.1"
+ vad:
+ dependency: "direct main"
+ description:
+ name: vad
+ sha256: ef6c8b12c5af7a6a519ff5684f074b8a2ac00c434705f544af379ea77bccd258
+ url: "https://pub.dev"
+ source: hosted
+ version: "0.0.7+1"
vector_graphics:
dependency: transitive
description:
diff --git a/pubspec.yaml b/pubspec.yaml
index f0dfa30..13de85d 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -44,7 +44,7 @@ dependencies:
flutter_animate: ^4.5.0
# Platform Features
- mic_stream_recorder: ^1.1.2
+ vad: ^0.0.7+1
stts: ^1.2.5
flutter_tts: ^4.2.3
audioplayers: ^6.5.1