feat(voice-input): improve STT locale selection and Android handling
This commit is contained in:
@@ -122,6 +122,4 @@
|
|||||||
android:name="flutterEmbedding"
|
android:name="flutterEmbedding"
|
||||||
android:value="2" />
|
android:value="2" />
|
||||||
</application>
|
</application>
|
||||||
|
|
||||||
<!-- Queries for speech recognition removed; using server transcription -->
|
|
||||||
</manifest>
|
</manifest>
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import 'dart:convert';
|
|||||||
import 'dart:io' show Platform;
|
import 'dart:io' show Platform;
|
||||||
import 'dart:typed_data';
|
import 'dart:typed_data';
|
||||||
|
|
||||||
|
import 'package:flutter/services.dart';
|
||||||
import 'package:flutter/widgets.dart';
|
import 'package:flutter/widgets.dart';
|
||||||
import 'package:flutter_riverpod/flutter_riverpod.dart';
|
import 'package:flutter_riverpod/flutter_riverpod.dart';
|
||||||
import 'package:record/record.dart';
|
import 'package:record/record.dart';
|
||||||
@@ -36,7 +37,7 @@ class VoiceInputService {
|
|||||||
static const Duration _localeFetchTimeout = Duration(seconds: 2);
|
static const Duration _localeFetchTimeout = Duration(seconds: 2);
|
||||||
static const String _backgroundSttStreamId = 'voice-input-stt';
|
static const String _backgroundSttStreamId = 'voice-input-stt';
|
||||||
|
|
||||||
final VadHandler _vadHandler = VadHandler.create();
|
VadHandler? _vadHandler;
|
||||||
final SpeechToText _speech = SpeechToText();
|
final SpeechToText _speech = SpeechToText();
|
||||||
final AudioRecorder _microphonePermissionProbe = AudioRecorder();
|
final AudioRecorder _microphonePermissionProbe = AudioRecorder();
|
||||||
final ApiService? _api;
|
final ApiService? _api;
|
||||||
@@ -53,6 +54,7 @@ class VoiceInputService {
|
|||||||
Future<void>? _startingLocalStt;
|
Future<void>? _startingLocalStt;
|
||||||
StreamController<String>? _textStreamController;
|
StreamController<String>? _textStreamController;
|
||||||
String _currentText = '';
|
String _currentText = '';
|
||||||
|
bool _receivedFinalResult = false;
|
||||||
StreamController<int>? _intensityController;
|
StreamController<int>? _intensityController;
|
||||||
Stream<int> get intensityStream =>
|
Stream<int> get intensityStream =>
|
||||||
_intensityController?.stream ?? const Stream<int>.empty();
|
_intensityController?.stream ?? const Stream<int>.empty();
|
||||||
@@ -124,10 +126,31 @@ class VoiceInputService {
|
|||||||
// properly close the stream so voice call service can restart
|
// properly close the stream so voice call service can restart
|
||||||
if (wasActive && _isListening && !_usingServerStt) {
|
if (wasActive && _isListening && !_usingServerStt) {
|
||||||
debugPrint('Platform stopped listening, closing stream');
|
debugPrint('Platform stopped listening, closing stream');
|
||||||
|
// On Android, the 'done' status often fires BEFORE the final result
|
||||||
|
// callback arrives. Wait for the final result to avoid cutting off
|
||||||
|
// the last word.
|
||||||
|
if (Platform.isAndroid && !_receivedFinalResult) {
|
||||||
|
_waitForFinalResultThenStop();
|
||||||
|
} else {
|
||||||
unawaited(_stopListening());
|
unawaited(_stopListening());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Waits briefly for Android to deliver the final STT result before stopping.
|
||||||
|
void _waitForFinalResultThenStop() {
|
||||||
|
Future(() async {
|
||||||
|
// Wait up to 300ms for the final result to arrive
|
||||||
|
for (var i = 0; i < 6; i++) {
|
||||||
|
await Future.delayed(const Duration(milliseconds: 50));
|
||||||
|
if (_receivedFinalResult || !_isListening) break;
|
||||||
|
}
|
||||||
|
if (_isListening) {
|
||||||
|
await _stopListening();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
void _handleSttError(dynamic error) {
|
void _handleSttError(dynamic error) {
|
||||||
debugPrint('Local STT Error: $error');
|
debugPrint('Local STT Error: $error');
|
||||||
@@ -234,13 +257,29 @@ class VoiceInputService {
|
|||||||
if (sttLocales.isEmpty) {
|
if (sttLocales.isEmpty) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Map speech_to_text LocaleName to our own LocaleName class
|
// Map speech_to_text LocaleName to our own LocaleName class
|
||||||
_locales = sttLocales
|
_locales = sttLocales
|
||||||
.map((loc) => LocaleName(loc.localeId, loc.name))
|
.map((loc) => LocaleName(loc.localeId, loc.name))
|
||||||
.toList();
|
.toList();
|
||||||
_usingFallbackLocales = false;
|
_usingFallbackLocales = false;
|
||||||
final match = _matchLocale(deviceTag);
|
|
||||||
|
// Prefer the STT engine's own system locale when available, since
|
||||||
|
// it may differ from Flutter's UI locale on some Android devices.
|
||||||
|
final systemLocale = await _speech.systemLocale();
|
||||||
|
final systemTag = systemLocale?.localeId;
|
||||||
|
final tagForMatch = (systemTag != null && systemTag.isNotEmpty)
|
||||||
|
? systemTag
|
||||||
|
: deviceTag;
|
||||||
|
|
||||||
|
final match = _matchLocale(tagForMatch);
|
||||||
_selectedLocaleId = match.localeId;
|
_selectedLocaleId = match.localeId;
|
||||||
|
|
||||||
|
debugPrint(
|
||||||
|
'VoiceInputService: deviceTag=$deviceTag, '
|
||||||
|
'systemLocale=$systemTag, '
|
||||||
|
'selectedLocaleId=$_selectedLocaleId',
|
||||||
|
);
|
||||||
} catch (_) {
|
} catch (_) {
|
||||||
// Some engines may not support locale listing
|
// Some engines may not support locale listing
|
||||||
}
|
}
|
||||||
@@ -359,15 +398,15 @@ class VoiceInputService {
|
|||||||
final prevLen = _currentText.length;
|
final prevLen = _currentText.length;
|
||||||
_currentText = result.recognizedWords;
|
_currentText = result.recognizedWords;
|
||||||
_textStreamController?.add(_currentText);
|
_textStreamController?.add(_currentText);
|
||||||
|
if (result.finalResult) {
|
||||||
|
_receivedFinalResult = true;
|
||||||
|
}
|
||||||
final delta = (_currentText.length - prevLen).clamp(0, 50);
|
final delta = (_currentText.length - prevLen).clamp(0, 50);
|
||||||
final mapped = (delta / 5.0).ceil();
|
final mapped = (delta / 5.0).ceil();
|
||||||
_lastIntensity = mapped.clamp(0, 10);
|
_lastIntensity = mapped.clamp(0, 10);
|
||||||
try {
|
try {
|
||||||
_intensityController?.add(_lastIntensity);
|
_intensityController?.add(_lastIntensity);
|
||||||
} catch (_) {}
|
} catch (_) {}
|
||||||
if (result.finalResult) {
|
|
||||||
unawaited(_stopListening());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Future<Stream<String>> startListening() async {
|
Future<Stream<String>> startListening() async {
|
||||||
@@ -388,10 +427,19 @@ class VoiceInputService {
|
|||||||
_textStreamController = StreamController<String>.broadcast();
|
_textStreamController = StreamController<String>.broadcast();
|
||||||
_currentText = '';
|
_currentText = '';
|
||||||
_isListening = true;
|
_isListening = true;
|
||||||
|
_receivedFinalResult = false;
|
||||||
_intensityController = StreamController<int>.broadcast();
|
_intensityController = StreamController<int>.broadcast();
|
||||||
_lastIntensity = 0;
|
_lastIntensity = 0;
|
||||||
_usingServerStt = false;
|
_usingServerStt = false;
|
||||||
|
|
||||||
|
// Optional haptic feedback when listening starts
|
||||||
|
final hapticsEnabled = _ref?.read(hapticEnabledProvider) ?? false;
|
||||||
|
if (hapticsEnabled) {
|
||||||
|
try {
|
||||||
|
HapticFeedback.heavyImpact();
|
||||||
|
} catch (_) {}
|
||||||
|
}
|
||||||
|
|
||||||
_startIntensityDecayTimer();
|
_startIntensityDecayTimer();
|
||||||
|
|
||||||
final bool canUseLocal = _localSttAvailable;
|
final bool canUseLocal = _localSttAvailable;
|
||||||
@@ -489,12 +537,11 @@ class VoiceInputService {
|
|||||||
Future<void> _stopListening() async {
|
Future<void> _stopListening() async {
|
||||||
if (!_isListening) return;
|
if (!_isListening) return;
|
||||||
|
|
||||||
_isListening = false;
|
|
||||||
|
|
||||||
_autoStopTimer?.cancel();
|
_autoStopTimer?.cancel();
|
||||||
_autoStopTimer = null;
|
_autoStopTimer = null;
|
||||||
|
|
||||||
if (_usingServerStt) {
|
if (_usingServerStt) {
|
||||||
|
_isListening = false;
|
||||||
await _stopVadRecording();
|
await _stopVadRecording();
|
||||||
final samples = _vadPendingSamples;
|
final samples = _vadPendingSamples;
|
||||||
_vadPendingSamples = null;
|
_vadPendingSamples = null;
|
||||||
@@ -502,7 +549,17 @@ class VoiceInputService {
|
|||||||
await _processVadSamples(samples);
|
await _processVadSamples(samples);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// On Android, stop() triggers a final result with any buffered words.
|
||||||
|
// Keep _isListening true until after stop() so _handleSttResult accepts it.
|
||||||
await _stopLocalStt();
|
await _stopLocalStt();
|
||||||
|
// Wait for Android's STT engine to deliver the final result callback
|
||||||
|
if (Platform.isAndroid && !_receivedFinalResult) {
|
||||||
|
for (var i = 0; i < 6; i++) {
|
||||||
|
await Future.delayed(const Duration(milliseconds: 50));
|
||||||
|
if (_receivedFinalResult) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_isListening = false;
|
||||||
if (_currentText.isNotEmpty) {
|
if (_currentText.isNotEmpty) {
|
||||||
_textStreamController?.add(_currentText);
|
_textStreamController?.add(_currentText);
|
||||||
}
|
}
|
||||||
@@ -552,7 +609,11 @@ class VoiceInputService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Future<void> _startServerRecording() async {
|
Future<void> _startServerRecording() async {
|
||||||
await _setupVadStreams();
|
// Create a fresh VadHandler for this session to avoid reusing any
|
||||||
|
// internal AudioRecorder that may be in a bad state after errors.
|
||||||
|
final vad = VadHandler.create();
|
||||||
|
_vadHandler = vad;
|
||||||
|
await _setupVadStreams(vad);
|
||||||
final settings = _ref?.read(appSettingsProvider);
|
final settings = _ref?.read(appSettingsProvider);
|
||||||
final silenceMs = settings?.voiceSilenceDuration ?? 2000;
|
final silenceMs = settings?.voiceSilenceDuration ?? 2000;
|
||||||
final redemptionFrames = _silenceDurationToFrames(
|
final redemptionFrames = _silenceDurationToFrames(
|
||||||
@@ -561,7 +622,7 @@ class VoiceInputService {
|
|||||||
);
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await _vadHandler.startListening(
|
await vad.startListening(
|
||||||
frameSamples: _vadFrameSamples,
|
frameSamples: _vadFrameSamples,
|
||||||
model: 'v5',
|
model: 'v5',
|
||||||
minSpeechFrames: _vadMinSpeechFrames,
|
minSpeechFrames: _vadMinSpeechFrames,
|
||||||
@@ -581,22 +642,59 @@ class VoiceInputService {
|
|||||||
noiseSuppress: true,
|
noiseSuppress: true,
|
||||||
androidConfig: AndroidRecordConfig(
|
androidConfig: AndroidRecordConfig(
|
||||||
audioSource: AndroidAudioSource.voiceRecognition,
|
audioSource: AndroidAudioSource.voiceRecognition,
|
||||||
audioManagerMode: AudioManagerMode.modeInCommunication,
|
// Use normal mode instead of modeInCommunication to avoid
|
||||||
speakerphone: true,
|
// audio routing conflicts with TTS playback after recording stops.
|
||||||
|
audioManagerMode: AudioManagerMode.modeNormal,
|
||||||
|
speakerphone: false,
|
||||||
manageBluetooth: true,
|
manageBluetooth: true,
|
||||||
useLegacy: false,
|
useLegacy: false,
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
// If starting the audio stream fails (e.g. recorder disposed),
|
||||||
|
// drop this handler so the next session gets a clean instance.
|
||||||
|
if (identical(_vadHandler, vad)) {
|
||||||
|
_vadHandler = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Known Android issue: the underlying AudioRecorder can be in a bad
|
||||||
|
// state after audio focus changes triggered by TTS playback. When
|
||||||
|
// this happens and local STT is available, transparently fall back
|
||||||
|
// to on-device STT instead of failing the entire voice turn.
|
||||||
|
final canFallbackToLocal = _localSttAvailable && !prefersServerOnly;
|
||||||
|
if (error is PlatformException &&
|
||||||
|
error.code == 'record' &&
|
||||||
|
(error.message ?? '').contains(
|
||||||
|
'Recorder has not yet been created or has already been disposed.',
|
||||||
|
) &&
|
||||||
|
canFallbackToLocal &&
|
||||||
|
_isListening) {
|
||||||
|
debugPrint(
|
||||||
|
'VadHandler.startListening failed due to recorder error – '
|
||||||
|
'falling back to local STT.',
|
||||||
|
);
|
||||||
|
_usingServerStt = false;
|
||||||
|
try {
|
||||||
|
await _stopVadRecording();
|
||||||
|
} catch (_) {}
|
||||||
|
try {
|
||||||
|
await _startLocalRecognition(allowOnlineFallback: !prefersDeviceOnly);
|
||||||
|
return;
|
||||||
|
} catch (fallbackError) {
|
||||||
|
_textStreamController?.addError(fallbackError);
|
||||||
|
rethrow;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
_textStreamController?.addError(error);
|
_textStreamController?.addError(error);
|
||||||
rethrow;
|
rethrow;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Future<void> _setupVadStreams() async {
|
Future<void> _setupVadStreams(VadHandler vad) async {
|
||||||
await _vadSpeechEndSub?.cancel();
|
await _vadSpeechEndSub?.cancel();
|
||||||
_vadSpeechEndSub = _vadHandler.onSpeechEnd.listen((samples) {
|
_vadSpeechEndSub = vad.onSpeechEnd.listen((samples) {
|
||||||
if (!_isListening || !_usingServerStt) return;
|
if (!_isListening || !_usingServerStt) return;
|
||||||
if (samples.isEmpty) return;
|
if (samples.isEmpty) return;
|
||||||
_vadPendingSamples = samples;
|
_vadPendingSamples = samples;
|
||||||
@@ -606,7 +704,7 @@ class VoiceInputService {
|
|||||||
});
|
});
|
||||||
|
|
||||||
await _vadFrameSub?.cancel();
|
await _vadFrameSub?.cancel();
|
||||||
_vadFrameSub = _vadHandler.onFrameProcessed.listen((frameData) {
|
_vadFrameSub = vad.onFrameProcessed.listen((frameData) {
|
||||||
if (!_isListening) return;
|
if (!_isListening) return;
|
||||||
final intensity = _intensityFromVadFrame(frameData.frame);
|
final intensity = _intensityFromVadFrame(frameData.frame);
|
||||||
_lastIntensity = intensity;
|
_lastIntensity = intensity;
|
||||||
@@ -616,7 +714,7 @@ class VoiceInputService {
|
|||||||
});
|
});
|
||||||
|
|
||||||
await _vadErrorSub?.cancel();
|
await _vadErrorSub?.cancel();
|
||||||
_vadErrorSub = _vadHandler.onError.listen((message) {
|
_vadErrorSub = vad.onError.listen((message) {
|
||||||
_textStreamController?.addError(Exception(message));
|
_textStreamController?.addError(Exception(message));
|
||||||
if (_isListening) {
|
if (_isListening) {
|
||||||
unawaited(_stopListening());
|
unawaited(_stopListening());
|
||||||
@@ -625,9 +723,12 @@ class VoiceInputService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Future<void> _stopVadRecording() async {
|
Future<void> _stopVadRecording() async {
|
||||||
|
final vad = _vadHandler;
|
||||||
|
if (vad != null) {
|
||||||
try {
|
try {
|
||||||
await _vadHandler.stopListening();
|
await vad.stopListening();
|
||||||
} catch (_) {}
|
} catch (_) {}
|
||||||
|
}
|
||||||
await _vadSpeechEndSub?.cancel();
|
await _vadSpeechEndSub?.cancel();
|
||||||
_vadSpeechEndSub = null;
|
_vadSpeechEndSub = null;
|
||||||
await _vadFrameSub?.cancel();
|
await _vadFrameSub?.cancel();
|
||||||
@@ -636,6 +737,16 @@ class VoiceInputService {
|
|||||||
_vadErrorSub = null;
|
_vadErrorSub = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Future<void> _disposeVadHandler() async {
|
||||||
|
final vad = _vadHandler;
|
||||||
|
_vadHandler = null;
|
||||||
|
if (vad != null) {
|
||||||
|
try {
|
||||||
|
await vad.dispose();
|
||||||
|
} catch (_) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Future<void> _processVadSamples(List<double> samples) async {
|
Future<void> _processVadSamples(List<double> samples) async {
|
||||||
final api = _api;
|
final api = _api;
|
||||||
if (api == null) return;
|
if (api == null) return;
|
||||||
@@ -861,7 +972,7 @@ class VoiceInputService {
|
|||||||
|
|
||||||
void dispose() {
|
void dispose() {
|
||||||
stopListening();
|
stopListening();
|
||||||
unawaited(_vadHandler.dispose());
|
unawaited(_disposeVadHandler());
|
||||||
unawaited(_microphonePermissionProbe.dispose());
|
unawaited(_microphonePermissionProbe.dispose());
|
||||||
try {
|
try {
|
||||||
_speech.stop();
|
_speech.stop();
|
||||||
|
|||||||
Reference in New Issue
Block a user