Merge pull request #187 from cogwheel0/optimize-audio-and-voice-input

optimize-audio-and-voice-input
2025-11-27 19:56:57 +05:30
parent 852a65f870 f9574dfec0
commit 9ea8949e4f
8 changed files with 401 additions and 283 deletions
@@ -122,6 +122,4 @@
            android:name="flutterEmbedding"
            android:value="2" />
    </application>
-    
-    <!-- Queries for speech recognition removed; using server transcription -->
 </manifest>
@@ -5,6 +5,9 @@ PODS:
  - connectivity_plus (0.0.1):
    - Flutter
  - CryptoSwift (1.8.4)
+  - CwlCatchException (2.2.1):
+    - CwlCatchExceptionSupport (~> 2.2.1)
+  - CwlCatchExceptionSupport (2.2.1)
  - DKImagePickerController/Core (4.3.9):
    - DKImagePickerController/ImageDataManager
    - DKImagePickerController/Resource
@@ -85,11 +88,13 @@ PODS:
  - shared_preferences_foundation (0.0.1):
    - Flutter
    - FlutterMacOS
+  - speech_to_text (7.2.0):
+    - CwlCatchException
+    - Flutter
+    - FlutterMacOS
  - sqflite_darwin (0.0.4):
    - Flutter
    - FlutterMacOS
-  - stts (1.0.0):
-    - Flutter
  - SwiftyGif (5.4.5)
  - url_launcher_ios (0.0.1):
    - Flutter
@@ -122,8 +127,8 @@ DEPENDENCIES:
  - share_handler_ios_models (from `.symlinks/plugins/share_handler_ios/ios/Models`)
  - share_plus (from `.symlinks/plugins/share_plus/ios`)
  - shared_preferences_foundation (from `.symlinks/plugins/shared_preferences_foundation/darwin`)
+  - speech_to_text (from `.symlinks/plugins/speech_to_text/darwin`)
  - sqflite_darwin (from `.symlinks/plugins/sqflite_darwin/darwin`)
-  - stts (from `.symlinks/plugins/stts/ios`)
  - url_launcher_ios (from `.symlinks/plugins/url_launcher_ios/ios`)
  - vad (from `.symlinks/plugins/vad/ios`)
  - wakelock_plus (from `.symlinks/plugins/wakelock_plus/ios`)
@@ -132,6 +137,8 @@ DEPENDENCIES:
 SPEC REPOS:
  trunk:
    - CryptoSwift
+    - CwlCatchException
+    - CwlCatchExceptionSupport
    - DKImagePickerController
    - DKPhotoGallery
    - onnxruntime-c
@@ -178,10 +185,10 @@ EXTERNAL SOURCES:
    :path: ".symlinks/plugins/share_plus/ios"
  shared_preferences_foundation:
    :path: ".symlinks/plugins/shared_preferences_foundation/darwin"
+  speech_to_text:
+    :path: ".symlinks/plugins/speech_to_text/darwin"
  sqflite_darwin:
    :path: ".symlinks/plugins/sqflite_darwin/darwin"
-  stts:
-    :path: ".symlinks/plugins/stts/ios"
  url_launcher_ios:
    :path: ".symlinks/plugins/url_launcher_ios/ios"
  vad:
@@ -195,6 +202,8 @@ SPEC CHECKSUMS:
  audioplayers_darwin: 4f9ca89d92d3d21cec7ec580e78ca888e5fb68bd
  connectivity_plus: cb623214f4e1f6ef8fe7403d580fdad517d2f7dd
  CryptoSwift: e64e11850ede528a02a0f3e768cec8e9d92ecb90
+  CwlCatchException: 7acc161b299a6de7f0a46a6ed741eae2c8b4d75a
+  CwlCatchExceptionSupport: 54ccab8d8c78907b57f99717fb19d4cc3bce02dc
  DKImagePickerController: 946cec48c7873164274ecc4624d19e3da4c1ef3c
  DKPhotoGallery: b3834fecb755ee09a593d7c9e389d8b5d6deed60
  file_picker: a0560bc09d61de87f12d246fc47d2119e6ef37be
@@ -217,8 +226,8 @@ SPEC CHECKSUMS:
  share_handler_ios_models: fc638c9b4330dc7f082586c92aee9dfa0b87b871
  share_plus: 50da8cb520a8f0f65671c6c6a99b3617ed10a58a
  shared_preferences_foundation: 7036424c3d8ec98dfe75ff1667cb0cd531ec82bb
+  speech_to_text: 3b313d98516d3d0406cea424782ec25470c59d19
  sqflite_darwin: 20b2a3a3b70e43edae938624ce550a3cbf66a3d0
-  stts: 1a48df645bb516e86e4121d5253b582749a1d3a6
  SwiftyGif: 706c60cf65fa2bc5ee0313beece843c8eb8194d4
  url_launcher_ios: 7a95fa5b60cc718a708b8f2966718e93db0cef1b
  vad: 7934867589afe53567f492df66fb1615f2185822
@@ -130,6 +130,29 @@ class CallKitService {
    return <Map<String, dynamic>>[];
  }

+  /// Ends every currently active CallKit call (used as startup cleanup).
+  Future<void> checkAndCleanActiveCalls() async {
+    if (!_shouldUseCallKit('check active calls')) return;
+
+    try {
+      final calls = await activeCalls();
+      if (calls.isNotEmpty) {
+        developer.log(
+          'Found ${calls.length} active CallKit calls on startup. Cleaning up.',
+          name: 'callkit',
+        );
+        await endAllCalls();
+      }
+    } catch (error, stackTrace) {
+      developer.log(
+        'Failed to clean up active calls: $error',
+        name: 'callkit',
+        error: error,
+        stackTrace: stackTrace,
+      );
+    }
+  }
+
  /// Stream of CallKit events from the native layer.
  Stream<CallEvent> get events {
    if (!_callKitAllowed) {
@@ -182,7 +205,7 @@ class CallKitService {
      ios: const IOSParams(
        handleType: 'generic',
        supportsVideo: false,
-        audioSessionMode: 'default',
+        audioSessionMode: 'voiceChat',
        audioSessionActive: true,
        audioSessionPreferredSampleRate: 44100.0,
        audioSessionPreferredIOBufferDuration: 0.005,
@@ -70,20 +70,12 @@ class TextToSpeechService {
      }
    });

-    if (!kIsWeb && Platform.isIOS) {
-      final context = AudioContext(
-        iOS: AudioContextIOS(
-          category: AVAudioSessionCategory.playAndRecord,
-          options: const {
-            AVAudioSessionOptions.defaultToSpeaker,
-            AVAudioSessionOptions.mixWithOthers,
-            AVAudioSessionOptions.allowBluetooth,
-            AVAudioSessionOptions.allowBluetoothA2DP,
-          },
+    if (!kIsWeb && Platform.isAndroid) {
+      _player.setAudioContext(
+        AudioContext(
+          android: const AudioContextAndroid(),
        ),
-        android: const AudioContextAndroid(),
      );
-      _player.setAudioContext(context);
    }
  }

@@ -103,13 +95,8 @@ class TextToSpeechService {

      if (!kIsWeb && Platform.isIOS) {
        await _tts.setSharedInstance(true);
-        await _tts
-            .setIosAudioCategory(IosTextToSpeechAudioCategory.playAndRecord, [
-              IosTextToSpeechAudioCategoryOptions.mixWithOthers,
-              IosTextToSpeechAudioCategoryOptions.defaultToSpeaker,
-              IosTextToSpeechAudioCategoryOptions.allowBluetooth,
-              IosTextToSpeechAudioCategoryOptions.allowBluetoothA2DP,
-            ]);
+        // Rely on the native VoiceBackgroundAudioManager for iOS
+        // audio session configuration to avoid routing conflicts.
      }

      if (_engine != TtsEngine.server) {
@@ -123,6 +123,11 @@ class VoiceCallService {
    _pauseReasons.clear();
    _listeningPaused = false;

+    // Clean up any zombie calls from previous sessions
+    if (_callKitEnabled) {
+      unawaited(_callKitService.checkAndCleanActiveCalls());
+    }
+
    // Initialize notification service
    await _notificationService.initialize();

@@ -312,9 +317,17 @@ class VoiceCallService {
        throw Exception('Failed to establish socket connection');
      }

+      // Initialize voice input first so we know which STT mode will be used
+      await _voiceInput.initialize();
+
+      // Only activate VoiceBackgroundAudioManager for server STT.
+      // For local STT, speech_to_text handles its own iOS audio session.
+      final useServerMic =
+          (_voiceInput.prefersServerOnly && _voiceInput.hasServerStt) ||
+          (!_voiceInput.hasLocalStt && _voiceInput.hasServerStt);
      await BackgroundStreamingHandler.instance.startBackgroundExecution(const [
        _voiceCallStreamId,
-      ], requiresMicrophone: true);
+      ], requiresMicrophone: useServerMic);

      // Set up periodic keep-alive to refresh wake lock (every 5 minutes)
      _keepAliveTimer?.cancel();
@@ -385,10 +398,11 @@ class VoiceCallService {
        throw Exception('Preferred speech recognition engine is unavailable');
      }

-      _updateState(VoiceCallState.listening);
-
      final stream = await _voiceInput.beginListening();

+      // Only mark as listening after STT has successfully started.
+      _updateState(VoiceCallState.listening);
+
      _transcriptSubscription = stream.listen(
        (text) {
          if (_isDisposed) return;
@@ -401,13 +415,27 @@ class VoiceCallService {
        },
        onDone: () async {
          if (_isDisposed) return;
+
+          final trimmed = _accumulatedTranscript.trim();
          // User stopped speaking, send message to assistant
-          if (_accumulatedTranscript.trim().isNotEmpty) {
-            await _sendMessageToAssistant(_accumulatedTranscript);
-          } else {
-            // No input, restart listening
-            await _startListening();
+          if (trimmed.isNotEmpty) {
+            await _sendMessageToAssistant(trimmed);
+            return;
          }
+
+          // No input – avoid a tight restart loop and only restart
+          // while the call is still active and not paused.
+          await Future.delayed(const Duration(milliseconds: 250));
+          if (_isDisposed) return;
+          if (_state == VoiceCallState.disconnected ||
+              _state == VoiceCallState.error) {
+            return;
+          }
+          if (_pauseReasons.isNotEmpty) {
+            // Respect paused state; resumeListening() will restart if needed.
+            return;
+          }
+          await _startListening();
        },
      );

@@ -3,12 +3,13 @@ import 'dart:convert';
 import 'dart:io' show Platform;
 import 'dart:typed_data';

+import 'package:flutter/services.dart';
 import 'package:flutter/widgets.dart';
 import 'package:flutter_riverpod/flutter_riverpod.dart';
-import 'package:record/record.dart'
-    hide IosAudioCategory, IosAudioCategoryOptions;
+import 'package:record/record.dart';
 import 'package:riverpod_annotation/riverpod_annotation.dart';
-import 'package:stts/stts.dart';
+import 'package:speech_to_text/speech_recognition_result.dart';
+import 'package:speech_to_text/speech_to_text.dart';
 import 'package:vad/vad.dart';

 import '../../../core/providers/app_providers.dart';
@@ -18,7 +19,7 @@ import '../../../core/services/settings_service.dart';

 part 'voice_input_service.g.dart';

-// Lightweight replacement for previous stt.LocaleName used across the UI
+/// Lightweight locale representation used across the UI.
 class LocaleName {
  final String localeId;
  final String name;
@@ -36,8 +37,8 @@ class VoiceInputService {
  static const Duration _localeFetchTimeout = Duration(seconds: 2);
  static const String _backgroundSttStreamId = 'voice-input-stt';

-  final VadHandler _vadHandler = VadHandler.create();
-  final Stt _speech = Stt();
+  VadHandler? _vadHandler;
+  final SpeechToText _speech = SpeechToText();
  final AudioRecorder _microphonePermissionProbe = AudioRecorder();
  final ApiService? _api;
  final Ref? _ref;
@@ -53,6 +54,7 @@ class VoiceInputService {
  Future<void>? _startingLocalStt;
  StreamController<String>? _textStreamController;
  String _currentText = '';
+  bool _receivedFinalResult = false;
  StreamController<int>? _intensityController;
  Stream<int> get intensityStream =>
      _intensityController?.stream ?? const Stream<int>.empty();
@@ -64,8 +66,6 @@ class VoiceInputService {
  Stream<String> get textStream =>
      _textStreamController?.stream ?? const Stream<String>.empty();
  Timer? _autoStopTimer;
-  StreamSubscription<SttRecognition>? _sttResultSub;
-  StreamSubscription<SttState>? _sttStateSub;
  StreamSubscription<List<double>>? _vadSpeechEndSub;
  StreamSubscription<({double isSpeech, double notSpeech, List<double> frame})>?
  _vadFrameSub;
@@ -100,8 +100,11 @@ class VoiceInputService {
    }
    // Prepare local speech recognizer
    try {
-      // Check permission and supported status
-      _localSttAvailable = await _speech.isSupported();
+      // Initialize speech_to_text and check availability
+      _localSttAvailable = await _speech.initialize(
+        onStatus: _handleSttStatus,
+        onError: _handleSttError,
+      );
      if (_localSttAvailable) {
        await _loadLocales(deviceTag);
      }
@@ -112,21 +115,77 @@ class VoiceInputService {
    return true;
  }

+  void _handleSttStatus(String status) {
+    debugPrint('Local STT Status: $status');
+    if (status == 'listening') {
+      _localSttActive = true;
+    } else if (status == 'notListening' || status == 'done') {
+      final wasActive = _localSttActive;
+      _localSttActive = false;
+      // If we were actively listening and the platform stopped us,
+      // properly close the stream so voice call service can restart
+      if (wasActive && _isListening && !_usingServerStt) {
+        debugPrint('Platform stopped listening, closing stream');
+        // On Android, the 'done' status often fires BEFORE the final result
+        // callback arrives. Wait for the final result to avoid cutting off
+        // the last word.
+        if (Platform.isAndroid && !_receivedFinalResult) {
+          _waitForFinalResultThenStop();
+        } else {
+          unawaited(_stopListening());
+        }
+      }
+    }
+  }
+
+  /// Waits briefly for Android to deliver the final STT result before stopping.
+  void _waitForFinalResultThenStop() {
+    Future(() async {
+      // Wait up to 300ms for the final result to arrive
+      for (var i = 0; i < 6; i++) {
+        await Future.delayed(const Duration(milliseconds: 50));
+        if (_receivedFinalResult || !_isListening) break;
+      }
+      if (_isListening) {
+        await _stopListening();
+      }
+    });
+  }
+
+  void _handleSttError(dynamic error) {
+    debugPrint('Local STT Error: $error');
+    final errorStr = error.toString().toLowerCase();
+
+    // These errors are non-fatal - they just mean no speech was detected
+    // or the session timed out. The status handler will close the stream
+    // and voice call service will restart listening.
+    final nonFatalErrors = [
+      'error_no_match',
+      'error_speech_timeout',
+      'error_busy', // Temporary, can retry
+    ];
+
+    final isNonFatal = nonFatalErrors.any((e) => errorStr.contains(e));
+    if (isNonFatal) {
+      debugPrint('Non-fatal STT error, allowing normal stream close');
+      // Let the status handler / auto-stop timer close the stream.
+      // We do not treat this as a fatal failure for the current session.
+      return;
+    }
+
+    // Any other error is treated as fatal for the current session.
+    _handleLocalRecognizerError(error);
+  }
+
  Future<bool> checkPermissions() async {
    final micGranted = await _ensureMicrophonePermission();
    if (!micGranted) {
      return false;
    }
-    if (_localSttAvailable && _preference != SttPreference.serverOnly) {
-      try {
-        final sttGranted = await _speech.hasPermission();
-        if (!sttGranted) {
-          _localSttAvailable = false;
-        }
-      } catch (_) {
-        _localSttAvailable = false;
-      }
-    }
+    // Note: don't disable _localSttAvailable based on a hasPermission check.
+    // On iOS the permission may only be granted lazily when listen() is
+    // called, and the check can be unreliable. Let speech_to_text handle
+    // permissions during the actual listen() call.
    return true;
  }

@@ -136,23 +195,21 @@ class VoiceInputService {
  bool get hasLocalStt => _localSttAvailable;
  bool get localeMetadataIncomplete => _usingFallbackLocales;

-  // Add a method to check if on-device STT is properly supported
+  /// Checks if on-device STT is properly supported.
  Future<bool> checkOnDeviceSupport() async {
    if (!isSupportedPlatform || !_isInitialized) return false;
    try {
-      final supported = await _speech.isSupported();
-      return supported;
+      // speech_to_text isAvailable is set after initialize()
+      return _speech.isAvailable;
    } catch (e) {
      // ignore errors checking on-device support
      return false;
    }
  }

-  // Test method to verify on-device STT functionality
+  /// Test method to verify on-device STT functionality.
  Future<String> testOnDeviceStt() async {
    try {
-      // starting on-device STT test
-
      // First ensure we're initialized
      await initialize();

@@ -167,24 +224,19 @@ class VoiceInputService {
      }

      // Test if speech recognition is available
-      final supported = await _speech.isSupported();
-      if (!supported) {
+      if (!_speech.isAvailable) {
        return 'Speech recognition service is not available on this device';
      }

-      // Set language if available, then start and stop quickly
-      if (_selectedLocaleId != null) {
-        try {
-          await _speech.setLanguage(_selectedLocaleId!);
-        } catch (_) {}
-      }
-      await _speech.start(SttRecognitionOptions(punctuation: true));
+      // Start and stop quickly to test
+      await _speech.listen(onResult: (_) {}, localeId: _selectedLocaleId);
      await Future.delayed(const Duration(milliseconds: 100));
      await _speech.stop();

-      return 'On-device STT test completed successfully. Local STT available: $_localSttAvailable, Selected locale: $_selectedLocaleId';
+      return 'On-device STT test completed successfully. '
+          'Local STT available: $_localSttAvailable, '
+          'Selected locale: $_selectedLocaleId';
    } catch (e) {
-      // on-device STT test failed
      return 'On-device STT test failed: $e';
    }
  }
@@ -198,23 +250,39 @@ class VoiceInputService {

  Future<void> _loadLocales(String deviceTag) async {
    _ensureFallbackLocale(deviceTag);
-    List<String> langs = const [];
    try {
-      langs = await _speech.getLanguages().timeout(
-        _localeFetchTimeout,
-        onTimeout: () => const [],
+      final sttLocales = await Future.value(
+        _speech.locales(),
+      ).timeout(_localeFetchTimeout, onTimeout: () => const []);
+      if (sttLocales.isEmpty) {
+        return;
+      }
+
+      // Map speech_to_text LocaleName to our own LocaleName class
+      _locales = sttLocales
+          .map((loc) => LocaleName(loc.localeId, loc.name))
+          .toList();
+      _usingFallbackLocales = false;
+
+      // Prefer the STT engine's own system locale when available, since
+      // it may differ from Flutter's UI locale on some Android devices.
+      final systemLocale = await _speech.systemLocale();
+      final systemTag = systemLocale?.localeId;
+      final tagForMatch = (systemTag != null && systemTag.isNotEmpty)
+          ? systemTag
+          : deviceTag;
+
+      final match = _matchLocale(tagForMatch);
+      _selectedLocaleId = match.localeId;
+
+      debugPrint(
+        'VoiceInputService: deviceTag=$deviceTag, '
+        'systemLocale=$systemTag, '
+        'selectedLocaleId=$_selectedLocaleId',
      );
    } catch (_) {
-      // Engines such as Whisper Voice may not support this call.
-      langs = const [];
+      // Some engines may not support locale listing
    }
-    if (langs.isEmpty) {
-      return;
-    }
-    _locales = langs.map((locale) => LocaleName(locale, locale)).toList();
-    _usingFallbackLocales = false;
-    final match = _matchLocale(deviceTag);
-    _selectedLocaleId = match.localeId;
  }

  void _ensureFallbackLocale(String deviceTag) {
@@ -255,7 +323,8 @@ class VoiceInputService {
    if (!_isListening) {
      return;
    }
-    _localSttAvailable = false;
+    // Don't permanently disable _localSttAvailable on transient errors;
+    // the next session should still try local STT.
    final message = error?.toString().trim();
    final exception = Exception(
      (message == null || message.isEmpty)
@@ -284,37 +353,39 @@ class VoiceInputService {
    _startingLocalStt = completer.future;
    _localSttActive = false;

-    await _ensureLocalSttReset();
-    await _configureIosAudioSession();
-
-    if (_selectedLocaleId != null) {
-      await _speech.setLanguage(_selectedLocaleId!);
+    // Only reset if there's an active session to avoid startup delay
+    if (_speech.isListening) {
+      await _ensureLocalSttReset();
+      // Give the platform a moment to fully release the audio session
+      await Future.delayed(const Duration(milliseconds: 100));
    }

-    Future<void> attempt(bool offline) async {
-      await _speech.start(
-        SttRecognitionOptions(punctuation: true, offline: offline),
-      );
-      _localSttActive = true;
-    }
+    // Use user's configured silence duration for pause detection
+    final settings = _ref?.read(appSettingsProvider);
+    final pauseDuration = Duration(
+      milliseconds: settings?.voiceSilenceDuration ?? 2000,
+    );

    try {
-      await attempt(true);
+      await _speech.listen(
+        onResult: _handleSttResult,
+        localeId: _selectedLocaleId,
+        // Extended duration for voice calls - listen up to 60 seconds
+        listenFor: const Duration(seconds: 60),
+        // Use user's silence duration setting for pause detection
+        pauseFor: pauseDuration,
+        listenOptions: SpeechListenOptions(
+          listenMode: ListenMode.dictation,
+          cancelOnError: false,
+          partialResults: true,
+          autoPunctuation: true,
+          enableHapticFeedback: false,
+        ),
+      );
+      _localSttActive = true;
    } catch (error) {
      _localSttActive = false;
      await _ensureLocalSttReset();
-      if (Platform.isIOS && allowOnlineFallback) {
-        try {
-          await attempt(false);
-          return;
-        } catch (secondary) {
-          await _ensureLocalSttReset();
-          throw Exception(
-            'On-device speech failed ($error); '
-            'online fallback failed ($secondary).',
-          );
-        }
-      }
      rethrow;
    } finally {
      completer.complete();
@@ -322,6 +393,22 @@ class VoiceInputService {
    }
  }

+  void _handleSttResult(SpeechRecognitionResult result) {
+    if (!_isListening) return;
+    final prevLen = _currentText.length;
+    _currentText = result.recognizedWords;
+    _textStreamController?.add(_currentText);
+    if (result.finalResult) {
+      _receivedFinalResult = true;
+    }
+    final delta = (_currentText.length - prevLen).clamp(0, 50);
+    final mapped = (delta / 5.0).ceil();
+    _lastIntensity = mapped.clamp(0, 10);
+    try {
+      _intensityController?.add(_lastIntensity);
+    } catch (_) {}
+  }
+
  Future<Stream<String>> startListening() async {
    if (!_isInitialized) {
      throw Exception('Voice input not initialized');
@@ -340,10 +427,19 @@ class VoiceInputService {
    _textStreamController = StreamController<String>.broadcast();
    _currentText = '';
    _isListening = true;
+    _receivedFinalResult = false;
    _intensityController = StreamController<int>.broadcast();
    _lastIntensity = 0;
    _usingServerStt = false;

+    // Optional haptic feedback when listening starts
+    final hapticsEnabled = _ref?.read(hapticEnabledProvider) ?? false;
+    if (hapticsEnabled) {
+      try {
+        HapticFeedback.heavyImpact();
+      } catch (_) {}
+    }
+
    _startIntensityDecayTimer();

    final bool canUseLocal = _localSttAvailable;
@@ -356,7 +452,6 @@ class VoiceInputService {
            (!shouldUseLocal && _preference != SttPreference.deviceOnly));

    if (shouldUseLocal) {
-      await _pinBackgroundMicrophone();
      _autoStopTimer?.cancel();
      _autoStopTimer = Timer(const Duration(seconds: 60), () {
        if (_isListening) {
@@ -364,9 +459,7 @@ class VoiceInputService {
        }
      });
      try {
-        final isStillAvailable = await _speech.isSupported();
-        if (!isStillAvailable && _isListening) {
-          _localSttAvailable = false;
+        if (!_speech.isAvailable && _isListening) {
          _textStreamController?.addError(
            Exception('On-device speech recognition unavailable'),
          );
@@ -377,50 +470,12 @@ class VoiceInputService {
        // ignore availability check errors
      }

-      _sttResultSub = _speech.onResultChanged.listen(
-        (SttRecognition result) {
-          if (!_isListening) return;
-          final prevLen = _currentText.length;
-          _currentText = result.text;
-          _textStreamController?.add(_currentText);
-          final delta = (_currentText.length - prevLen).clamp(0, 50);
-          final mapped = (delta / 5.0).ceil();
-          _lastIntensity = mapped.clamp(0, 10);
-          try {
-            _intensityController?.add(_lastIntensity);
-          } catch (_) {}
-          if (result.isFinal) {
-            unawaited(_stopListening());
-          }
-        },
-        onError: (error) {
-          debugPrint('Local STT Error: $error');
-          _handleLocalRecognizerError(error);
-        },
-      );
-
-      _sttStateSub = _speech.onStateChanged.listen(
-        (state) {
-          debugPrint('Local STT State: $state');
-          if (state == SttState.start) {
-            _localSttActive = true;
-          } else if (state == SttState.stop) {
-            _localSttActive = false;
-          }
-        },
-        onError: (error) {
-          debugPrint('Local STT State Error: $error');
-          _handleLocalRecognizerError(error);
-        },
-      );
-
      try {
        debugPrint('Starting local recognition...');
        await _startLocalRecognition(allowOnlineFallback: !prefersDeviceOnly);
        debugPrint('Local recognition started');
      } catch (error) {
        debugPrint('Failed to start local recognition: $error');
-        _localSttAvailable = false;
        if (!_isListening) {
          return _textStreamController!.stream;
        }
@@ -482,12 +537,11 @@ class VoiceInputService {
  Future<void> _stopListening() async {
    if (!_isListening) return;

-    _isListening = false;
-
    _autoStopTimer?.cancel();
    _autoStopTimer = null;

    if (_usingServerStt) {
+      _isListening = false;
      await _stopVadRecording();
      final samples = _vadPendingSamples;
      _vadPendingSamples = null;
@@ -495,7 +549,17 @@ class VoiceInputService {
        await _processVadSamples(samples);
      }
    } else {
+      // On Android, stop() triggers a final result with any buffered words.
+      // Keep _isListening true until after stop() so _handleSttResult accepts it.
      await _stopLocalStt();
+      // Wait for Android's STT engine to deliver the final result callback
+      if (Platform.isAndroid && !_receivedFinalResult) {
+        for (var i = 0; i < 6; i++) {
+          await Future.delayed(const Duration(milliseconds: 50));
+          if (_receivedFinalResult) break;
+        }
+      }
+      _isListening = false;
      if (_currentText.isNotEmpty) {
        _textStreamController?.add(_currentText);
      }
@@ -518,18 +582,6 @@ class VoiceInputService {
        await pendingStart;
      } catch (_) {}
    }
-    if (_sttResultSub != null) {
-      try {
-        await _sttResultSub?.cancel();
-      } catch (_) {}
-      _sttResultSub = null;
-    }
-    if (_sttStateSub != null) {
-      try {
-        await _sttStateSub?.cancel();
-      } catch (_) {}
-      _sttStateSub = null;
-    }

    final shouldStopStt = _localSttActive && _localSttAvailable;
    _localSttActive = false;
@@ -538,21 +590,6 @@ class VoiceInputService {
        await _speech.stop();
      } catch (_) {}
    }
-    if (Platform.isIOS) {
-      try {
-        await _speech.ios?.setAudioSessionActive(false);
-      } catch (_) {}
-    }
-  }
-
-  Future<void> _pinBackgroundMicrophone() async {
-    if (!Platform.isIOS || _backgroundMicPinned) return;
-    try {
-      await BackgroundStreamingHandler.instance.startBackgroundExecution(const [
-        _backgroundSttStreamId,
-      ], requiresMicrophone: true);
-      _backgroundMicPinned = true;
-    } catch (_) {}
  }

  Future<void> _releaseBackgroundMicrophone() async {
@@ -567,34 +604,16 @@ class VoiceInputService {

  Future<void> _ensureLocalSttReset() async {
    try {
-      await _speech.stop();
-    } catch (_) {}
-    if (Platform.isIOS) {
-      try {
-        await _speech.ios?.setAudioSessionActive(false);
-      } catch (_) {}
-    }
-  }
-
-  Future<void> _configureIosAudioSession() async {
-    if (!Platform.isIOS) return;
-    final ios = _speech.ios;
-    if (ios == null) return;
-    try {
-      await ios.setAudioSessionCategory(
-        category: IosAudioCategory.playAndRecord,
-        options: [
-          IosAudioCategoryOptions.allowBluetooth,
-          IosAudioCategoryOptions.defaultToSpeaker,
-          IosAudioCategoryOptions.duckOthers,
-        ],
-      );
-      await ios.setAudioSessionActive(true);
+      await _speech.cancel();
    } catch (_) {}
  }

  Future<void> _startServerRecording() async {
-    await _setupVadStreams();
+    // Create a fresh VadHandler for this session to avoid reusing any
+    // internal AudioRecorder that may be in a bad state after errors.
+    final vad = VadHandler.create();
+    _vadHandler = vad;
+    await _setupVadStreams(vad);
    final settings = _ref?.read(appSettingsProvider);
    final silenceMs = settings?.voiceSilenceDuration ?? 2000;
    final redemptionFrames = _silenceDurationToFrames(
@@ -603,7 +622,7 @@ class VoiceInputService {
    );

    try {
-      await _vadHandler.startListening(
+      await vad.startListening(
        frameSamples: _vadFrameSamples,
        model: 'v5',
        minSpeechFrames: _vadMinSpeechFrames,
@@ -623,29 +642,59 @@ class VoiceInputService {
          noiseSuppress: true,
          androidConfig: AndroidRecordConfig(
            audioSource: AndroidAudioSource.voiceRecognition,
-            audioManagerMode: AudioManagerMode.modeInCommunication,
-            speakerphone: true,
+            // Use normal mode instead of modeInCommunication to avoid
+            // audio routing conflicts with TTS playback after recording stops.
+            audioManagerMode: AudioManagerMode.modeNormal,
+            speakerphone: false,
            manageBluetooth: true,
            useLegacy: false,
          ),
-          iosConfig: IosRecordConfig(
-            categoryOptions: [
-              IosAudioCategoryOption.allowBluetooth,
-              IosAudioCategoryOption.defaultToSpeaker,
-              IosAudioCategoryOption.duckOthers,
-            ],
-          ),
        ),
      );
    } catch (error) {
+      // If starting the audio stream fails (e.g. recorder disposed),
+      // drop this handler so the next session gets a clean instance.
+      if (identical(_vadHandler, vad)) {
+        _vadHandler = null;
+      }
+
+      // Known Android issue: the underlying AudioRecorder can be in a bad
+      // state after audio focus changes triggered by TTS playback. When
+      // this happens and local STT is available, transparently fall back
+      // to on-device STT instead of failing the entire voice turn.
+      final canFallbackToLocal = _localSttAvailable && !prefersServerOnly;
+      if (error is PlatformException &&
+          error.code == 'record' &&
+          (error.message ?? '').contains(
+            'Recorder has not yet been created or has already been disposed.',
+          ) &&
+          canFallbackToLocal &&
+          _isListening) {
+        debugPrint(
+          'VadHandler.startListening failed due to recorder error – '
+          'falling back to local STT.',
+        );
+        _usingServerStt = false;
+        try {
+          await _stopVadRecording();
+        } catch (_) {}
+        try {
+          await _startLocalRecognition(allowOnlineFallback: !prefersDeviceOnly);
+          return;
+        } catch (fallbackError) {
+          _textStreamController?.addError(fallbackError);
+          rethrow;
+        }
+      }
+
      _textStreamController?.addError(error);
      rethrow;
    }
  }

-  Future<void> _setupVadStreams() async {
+  Future<void> _setupVadStreams(VadHandler vad) async {
    await _vadSpeechEndSub?.cancel();
-    _vadSpeechEndSub = _vadHandler.onSpeechEnd.listen((samples) {
+    _vadSpeechEndSub = vad.onSpeechEnd.listen((samples) {
      if (!_isListening || !_usingServerStt) return;
      if (samples.isEmpty) return;
      _vadPendingSamples = samples;
@@ -655,7 +704,7 @@ class VoiceInputService {
    });

    await _vadFrameSub?.cancel();
-    _vadFrameSub = _vadHandler.onFrameProcessed.listen((frameData) {
+    _vadFrameSub = vad.onFrameProcessed.listen((frameData) {
      if (!_isListening) return;
      final intensity = _intensityFromVadFrame(frameData.frame);
      _lastIntensity = intensity;
@@ -665,7 +714,7 @@ class VoiceInputService {
    });

    await _vadErrorSub?.cancel();
-    _vadErrorSub = _vadHandler.onError.listen((message) {
+    _vadErrorSub = vad.onError.listen((message) {
      _textStreamController?.addError(Exception(message));
      if (_isListening) {
        unawaited(_stopListening());
@@ -674,9 +723,12 @@ class VoiceInputService {
  }

  Future<void> _stopVadRecording() async {
-    try {
-      await _vadHandler.stopListening();
-    } catch (_) {}
+    final vad = _vadHandler;
+    if (vad != null) {
+      try {
+        await vad.stopListening();
+      } catch (_) {}
+    }
    await _vadSpeechEndSub?.cancel();
    _vadSpeechEndSub = null;
    await _vadFrameSub?.cancel();
@@ -685,6 +737,16 @@ class VoiceInputService {
    _vadErrorSub = null;
  }

+  /// Detaches the current VAD handler from this service and disposes it.
+  ///
+  /// The `_vadHandler` field is cleared before disposal is awaited, and any
+  /// error thrown by the handler's `dispose()` is ignored. Does nothing when
+  /// no handler is set.
+  Future<void> _disposeVadHandler() async {
+    final handler = _vadHandler;
+    _vadHandler = null;
+    if (handler == null) return;
+    try {
+      await handler.dispose();
+    } catch (_) {
+      // Best-effort teardown: a failing dispose is deliberately ignored.
+    }
+  }
+
  Future<void> _processVadSamples(List<double> samples) async {
    final api = _api;
    if (api == null) return;
@@ -737,46 +799,49 @@ class VoiceInputService {
    if (samples.isEmpty) {
      return Uint8List(0);
    }
-    final Int16List pcm = Int16List(samples.length);
-    for (var i = 0; i < samples.length; i++) {
-      final clamped = samples[i].clamp(-1.0, 1.0);
-      final scaled = (clamped * 32767).round().clamp(-32768, 32767);
-      pcm[i] = scaled;
-    }
-
-    final dataLength = pcm.lengthInBytes;
+    final dataLength = samples.length * 2; // 2 bytes per sample (16-bit)
    final bytesPerSample = 2;
    final numChannels = 1;
    final byteRate = _vadSampleRate * numChannels * bytesPerSample;
    final blockAlign = numChannels * bytesPerSample;
+    const headerSize = 44;

-    final builder = BytesBuilder();
-    builder.add(ascii.encode('RIFF'));
-    builder.add(_int32Le(36 + dataLength));
-    builder.add(ascii.encode('WAVE'));
-    builder.add(ascii.encode('fmt '));
-    builder.add(_int32Le(16));
-    builder.add(_int16Le(1));
-    builder.add(_int16Le(numChannels));
-    builder.add(_int32Le(_vadSampleRate));
-    builder.add(_int32Le(byteRate));
-    builder.add(_int16Le(blockAlign));
-    builder.add(_int16Le(16));
-    builder.add(ascii.encode('data'));
-    builder.add(_int32Le(dataLength));
-    builder.add(Uint8List.view(pcm.buffer));
-    return builder.toBytes();
+    final totalSize = headerSize + dataLength;
+    final buffer = Uint8List(totalSize);
+    final view = ByteData.view(buffer.buffer);
+
+    // RIFF chunk
+    buffer.setRange(0, 4, ascii.encode('RIFF'));
+    view.setUint32(4, 36 + dataLength, Endian.little);
+    buffer.setRange(8, 12, ascii.encode('WAVE'));
+
+    // fmt chunk
+    buffer.setRange(12, 16, ascii.encode('fmt '));
+    view.setUint32(16, 16, Endian.little); // PCM chunk size
+    view.setUint16(20, 1, Endian.little); // AudioFormat (1 = PCM)
+    view.setUint16(22, numChannels, Endian.little);
+    view.setUint32(24, _vadSampleRate, Endian.little);
+    view.setUint32(28, byteRate, Endian.little);
+    view.setUint16(32, blockAlign, Endian.little);
+    view.setUint16(34, 16, Endian.little); // BitsPerSample
+
+    // data chunk
+    buffer.setRange(36, 40, ascii.encode('data'));
+    view.setUint32(40, dataLength, Endian.little);
+
+    // Write samples
+    var offset = 44;
+    for (var i = 0; i < samples.length; i++) {
+      final clamped = samples[i].clamp(-1.0, 1.0);
+      // Convert float to 16-bit PCM
+      final pcm = (clamped * 32767).round().clamp(-32768, 32767);
+      view.setInt16(offset, pcm, Endian.little);
+      offset += 2;
+    }
+
+    return buffer;
  }

-  List<int> _int16Le(int value) => [value & 0xff, (value >> 8) & 0xff];
-
-  List<int> _int32Le(int value) => [
-    value & 0xff,
-    (value >> 8) & 0xff,
-    (value >> 16) & 0xff,
-    (value >> 24) & 0xff,
-  ];
-
  String? _languageForServer() {
    final locale = _selectedLocaleId;
    if (locale != null && locale.isNotEmpty) {
@@ -907,10 +972,10 @@ class VoiceInputService {

  void dispose() {
    stopListening();
-    unawaited(_vadHandler.dispose());
+    unawaited(_disposeVadHandler());
    unawaited(_microphonePermissionProbe.dispose());
    try {
-      _speech.dispose().catchError((_) {});
+      // stop() completes asynchronously, so the surrounding try/catch only
+      // covers synchronous throws. Swallow async failures on the future
+      // itself so teardown stays best-effort, as the previous
+      // `.catchError` call did.
+      unawaited(_speech.stop().catchError((_) {}));
    } catch (_) {}
  }
 }
@@ -1109,6 +1109,14 @@ packages:
      url: "https://pub.dev"
    source: hosted
    version: "2.3.0"
+  pedantic:
+    dependency: transitive
+    description:
+      name: pedantic
+      sha256: "67fc27ed9639506c856c840ccce7594d0bdcd91bc8d53d6e52359449a1d50602"
+      url: "https://pub.dev"
+    source: hosted
+    version: "1.11.1"
  petitparser:
    dependency: transitive
    description:
@@ -1514,6 +1522,30 @@ packages:
      url: "https://pub.dev"
    source: hosted
    version: "1.10.1"
+  speech_to_text:
+    dependency: "direct main"
+    description:
+      name: speech_to_text
+      sha256: c07557664974afa061f221d0d4186935bea4220728ea9446702825e8b988db04
+      url: "https://pub.dev"
+    source: hosted
+    version: "7.3.0"
+  speech_to_text_platform_interface:
+    dependency: transitive
+    description:
+      name: speech_to_text_platform_interface
+      sha256: a1935847704e41ee468aad83181ddd2423d0833abe55d769c59afca07adb5114
+      url: "https://pub.dev"
+    source: hosted
+    version: "2.3.0"
+  speech_to_text_windows:
+    dependency: transitive
+    description:
+      name: speech_to_text_windows
+      sha256: "2c9846d18253c7bbe059a276297ef9f27e8a2745dead32192525beb208195072"
+      url: "https://pub.dev"
+    source: hosted
+    version: "1.0.0+beta.8"
  sqflite:
    dependency: transitive
    description:
@@ -1594,30 +1626,6 @@ packages:
      url: "https://pub.dev"
    source: hosted
    version: "1.4.1"
-  stts:
-    dependency: "direct main"
-    description:
-      name: stts
-      sha256: "166ca37d241652cdefb9c31e18a7be93ff4eb847382ae32c5563e07bf44bf1d8"
-      url: "https://pub.dev"
-    source: hosted
-    version: "1.2.6"
-  stts_platform_interface:
-    dependency: transitive
-    description:
-      name: stts_platform_interface
-      sha256: "6b82268d59d608e9b5accdadf0e7ccaea7928e8fce68ca393111fa7193d1bf10"
-      url: "https://pub.dev"
-    source: hosted
-    version: "1.2.0"
-  stts_web:
-    dependency: transitive
-    description:
-      name: stts_web
-      sha256: "62625c3b4d86076820d687dc468845a0f54c7dd4ead155b58f1e5864488c7f1c"
-      url: "https://pub.dev"
-    source: hosted
-    version: "1.1.0"
  synchronized:
    dependency: transitive
    description:
@@ -45,7 +45,7 @@ dependencies:
  
  # Platform Features
  vad: ^0.0.7+1
-  stts: ^1.2.5
+  speech_to_text: ^7.3.0
  record: ^6.1.2
  flutter_tts: ^4.2.3
  audioplayers: ^6.5.1