feat(voice-input): improve STT locale selection and Android handling

2025-11-27 19:48:25 +05:30
parent d092bb2e44
commit f9574dfec0
2 changed files with 131 additions and 22 deletions
@@ -3,6 +3,7 @@ import 'dart:convert';
 import 'dart:io' show Platform;
 import 'dart:typed_data';

+import 'package:flutter/services.dart';
 import 'package:flutter/widgets.dart';
 import 'package:flutter_riverpod/flutter_riverpod.dart';
 import 'package:record/record.dart';
@@ -36,7 +37,7 @@ class VoiceInputService {
  static const Duration _localeFetchTimeout = Duration(seconds: 2);
  static const String _backgroundSttStreamId = 'voice-input-stt';

-  final VadHandler _vadHandler = VadHandler.create();
+  VadHandler? _vadHandler;
  final SpeechToText _speech = SpeechToText();
  final AudioRecorder _microphonePermissionProbe = AudioRecorder();
  final ApiService? _api;
@@ -53,6 +54,7 @@ class VoiceInputService {
  Future<void>? _startingLocalStt;
  StreamController<String>? _textStreamController;
  String _currentText = '';
+  bool _receivedFinalResult = false;
  StreamController<int>? _intensityController;
  Stream<int> get intensityStream =>
      _intensityController?.stream ?? const Stream<int>.empty();
@@ -124,11 +126,32 @@ class VoiceInputService {
      // properly close the stream so voice call service can restart
      if (wasActive && _isListening && !_usingServerStt) {
        debugPrint('Platform stopped listening, closing stream');
-        unawaited(_stopListening());
+        // On Android, the 'done' status often fires BEFORE the final result
+        // callback arrives. Wait for the final result to avoid cutting off
+        // the last word.
+        if (Platform.isAndroid && !_receivedFinalResult) {
+          _waitForFinalResultThenStop();
+        } else {
+          unawaited(_stopListening());
+        }
      }
    }
  }

+  /// Waits briefly for Android to deliver the final STT result before stopping.
+  void _waitForFinalResultThenStop() {
+    Future(() async {
+      // Wait up to 300ms for the final result to arrive
+      for (var i = 0; i < 6; i++) {
+        await Future.delayed(const Duration(milliseconds: 50));
+        if (_receivedFinalResult || !_isListening) break;
+      }
+      if (_isListening) {
+        await _stopListening();
+      }
+    });
+  }
+
  void _handleSttError(dynamic error) {
    debugPrint('Local STT Error: $error');
    final errorStr = error.toString().toLowerCase();
@@ -234,13 +257,29 @@ class VoiceInputService {
      if (sttLocales.isEmpty) {
        return;
      }
+
      // Map speech_to_text LocaleName to our own LocaleName class
      _locales = sttLocales
          .map((loc) => LocaleName(loc.localeId, loc.name))
          .toList();
      _usingFallbackLocales = false;
-      final match = _matchLocale(deviceTag);
+
+      // Prefer the STT engine's own system locale when available, since
+      // it may differ from Flutter's UI locale on some Android devices.
+      final systemLocale = await _speech.systemLocale();
+      final systemTag = systemLocale?.localeId;
+      final tagForMatch = (systemTag != null && systemTag.isNotEmpty)
+          ? systemTag
+          : deviceTag;
+
+      final match = _matchLocale(tagForMatch);
      _selectedLocaleId = match.localeId;
+
+      debugPrint(
+        'VoiceInputService: deviceTag=$deviceTag, '
+        'systemLocale=$systemTag, '
+        'selectedLocaleId=$_selectedLocaleId',
+      );
    } catch (_) {
      // Some engines may not support locale listing
    }
@@ -359,15 +398,15 @@ class VoiceInputService {
    final prevLen = _currentText.length;
    _currentText = result.recognizedWords;
    _textStreamController?.add(_currentText);
+    if (result.finalResult) {
+      _receivedFinalResult = true;
+    }
    final delta = (_currentText.length - prevLen).clamp(0, 50);
    final mapped = (delta / 5.0).ceil();
    _lastIntensity = mapped.clamp(0, 10);
    try {
      _intensityController?.add(_lastIntensity);
    } catch (_) {}
-    if (result.finalResult) {
-      unawaited(_stopListening());
-    }
  }

  Future<Stream<String>> startListening() async {
@@ -388,10 +427,19 @@ class VoiceInputService {
    _textStreamController = StreamController<String>.broadcast();
    _currentText = '';
    _isListening = true;
+    _receivedFinalResult = false;
    _intensityController = StreamController<int>.broadcast();
    _lastIntensity = 0;
    _usingServerStt = false;

+    // Optional haptic feedback when listening starts
+    final hapticsEnabled = _ref?.read(hapticEnabledProvider) ?? false;
+    if (hapticsEnabled) {
+      try {
+        HapticFeedback.heavyImpact();
+      } catch (_) {}
+    }
+
    _startIntensityDecayTimer();

    final bool canUseLocal = _localSttAvailable;
@@ -489,12 +537,11 @@ class VoiceInputService {
  Future<void> _stopListening() async {
    if (!_isListening) return;

-    _isListening = false;
-
    _autoStopTimer?.cancel();
    _autoStopTimer = null;

    if (_usingServerStt) {
+      _isListening = false;
      await _stopVadRecording();
      final samples = _vadPendingSamples;
      _vadPendingSamples = null;
@@ -502,7 +549,17 @@ class VoiceInputService {
        await _processVadSamples(samples);
      }
    } else {
+      // On Android, stop() triggers a final result with any buffered words.
+      // Keep _isListening true until after stop() so _handleSttResult accepts it.
      await _stopLocalStt();
+      // Wait for Android's STT engine to deliver the final result callback
+      if (Platform.isAndroid && !_receivedFinalResult) {
+        for (var i = 0; i < 6; i++) {
+          await Future.delayed(const Duration(milliseconds: 50));
+          if (_receivedFinalResult) break;
+        }
+      }
+      _isListening = false;
      if (_currentText.isNotEmpty) {
        _textStreamController?.add(_currentText);
      }
@@ -552,7 +609,11 @@ class VoiceInputService {
  }

  Future<void> _startServerRecording() async {
-    await _setupVadStreams();
+    // Create a fresh VadHandler for this session to avoid reusing any
+    // internal AudioRecorder that may be in a bad state after errors.
+    final vad = VadHandler.create();
+    _vadHandler = vad;
+    await _setupVadStreams(vad);
    final settings = _ref?.read(appSettingsProvider);
    final silenceMs = settings?.voiceSilenceDuration ?? 2000;
    final redemptionFrames = _silenceDurationToFrames(
@@ -561,7 +622,7 @@ class VoiceInputService {
    );

    try {
-      await _vadHandler.startListening(
+      await vad.startListening(
        frameSamples: _vadFrameSamples,
        model: 'v5',
        minSpeechFrames: _vadMinSpeechFrames,
@@ -581,22 +642,59 @@ class VoiceInputService {
          noiseSuppress: true,
          androidConfig: AndroidRecordConfig(
            audioSource: AndroidAudioSource.voiceRecognition,
-            audioManagerMode: AudioManagerMode.modeInCommunication,
-            speakerphone: true,
+            // Use normal mode instead of modeInCommunication to avoid
+            // audio routing conflicts with TTS playback after recording stops.
+            audioManagerMode: AudioManagerMode.modeNormal,
+            speakerphone: false,
            manageBluetooth: true,
            useLegacy: false,
          ),
        ),
      );
    } catch (error) {
+      // If starting the audio stream fails (e.g. recorder disposed),
+      // drop this handler so the next session gets a clean instance.
+      if (identical(_vadHandler, vad)) {
+        _vadHandler = null;
+      }
+
+      // Known Android issue: the underlying AudioRecorder can be in a bad
+      // state after audio focus changes triggered by TTS playback. When
+      // this happens and local STT is available, transparently fall back
+      // to on-device STT instead of failing the entire voice turn.
+      final canFallbackToLocal = _localSttAvailable && !prefersServerOnly;
+      if (error is PlatformException &&
+          error.code == 'record' &&
+          (error.message ?? '').contains(
+            'Recorder has not yet been created or has already been disposed.',
+          ) &&
+          canFallbackToLocal &&
+          _isListening) {
+        debugPrint(
+          'VadHandler.startListening failed due to recorder error – '
+          'falling back to local STT.',
+        );
+        _usingServerStt = false;
+        try {
+          await _stopVadRecording();
+        } catch (_) {}
+        try {
+          await _startLocalRecognition(allowOnlineFallback: !prefersDeviceOnly);
+          return;
+        } catch (fallbackError) {
+          _textStreamController?.addError(fallbackError);
+          rethrow;
+        }
+      }
+
      _textStreamController?.addError(error);
      rethrow;
    }
  }

-  Future<void> _setupVadStreams() async {
+  Future<void> _setupVadStreams(VadHandler vad) async {
    await _vadSpeechEndSub?.cancel();
-    _vadSpeechEndSub = _vadHandler.onSpeechEnd.listen((samples) {
+    _vadSpeechEndSub = vad.onSpeechEnd.listen((samples) {
      if (!_isListening || !_usingServerStt) return;
      if (samples.isEmpty) return;
      _vadPendingSamples = samples;
@@ -606,7 +704,7 @@ class VoiceInputService {
    });

    await _vadFrameSub?.cancel();
-    _vadFrameSub = _vadHandler.onFrameProcessed.listen((frameData) {
+    _vadFrameSub = vad.onFrameProcessed.listen((frameData) {
      if (!_isListening) return;
      final intensity = _intensityFromVadFrame(frameData.frame);
      _lastIntensity = intensity;
@@ -616,7 +714,7 @@ class VoiceInputService {
    });

    await _vadErrorSub?.cancel();
-    _vadErrorSub = _vadHandler.onError.listen((message) {
+    _vadErrorSub = vad.onError.listen((message) {
      _textStreamController?.addError(Exception(message));
      if (_isListening) {
        unawaited(_stopListening());
@@ -625,9 +723,12 @@ class VoiceInputService {
  }

  Future<void> _stopVadRecording() async {
-    try {
-      await _vadHandler.stopListening();
-    } catch (_) {}
+    final vad = _vadHandler;
+    if (vad != null) {
+      try {
+        await vad.stopListening();
+      } catch (_) {}
+    }
    await _vadSpeechEndSub?.cancel();
    _vadSpeechEndSub = null;
    await _vadFrameSub?.cancel();
@@ -636,6 +737,16 @@ class VoiceInputService {
    _vadErrorSub = null;
  }

+  Future<void> _disposeVadHandler() async {
+    final vad = _vadHandler;
+    _vadHandler = null;
+    if (vad != null) {
+      try {
+        await vad.dispose();
+      } catch (_) {}
+    }
+  }
+
  Future<void> _processVadSamples(List<double> samples) async {
    final api = _api;
    if (api == null) return;
@@ -861,7 +972,7 @@ class VoiceInputService {

  void dispose() {
    stopListening();
-    unawaited(_vadHandler.dispose());
+    unawaited(_disposeVadHandler());
    unawaited(_microphonePermissionProbe.dispose());
    try {
      _speech.stop();