feat(voice-input): Optimize VAD parameters for improved speech detection
This commit is contained in:
@@ -27,7 +27,12 @@ class LocaleName {
|
|||||||
|
|
||||||
class VoiceInputService {
|
class VoiceInputService {
|
||||||
static const int _vadSampleRate = 16000;
|
static const int _vadSampleRate = 16000;
|
||||||
static const int _vadFrameSamples = 1536;
|
static const int _vadFrameSamples = 512;
|
||||||
|
static const int _vadPreSpeechPadFrames = 16;
|
||||||
|
static const int _vadMinSpeechFrames = 8;
|
||||||
|
static const int _vadEndSpeechPadFrames = 6;
|
||||||
|
static const double _vadPositiveSpeechThreshold = 0.6;
|
||||||
|
static const double _vadNegativeSpeechThreshold = 0.35;
|
||||||
static const Duration _localeFetchTimeout = Duration(seconds: 2);
|
static const Duration _localeFetchTimeout = Duration(seconds: 2);
|
||||||
static const String _backgroundSttStreamId = 'voice-input-stt';
|
static const String _backgroundSttStreamId = 'voice-input-stt';
|
||||||
|
|
||||||
@@ -580,7 +585,6 @@ class VoiceInputService {
|
|||||||
category: IosAudioCategory.playAndRecord,
|
category: IosAudioCategory.playAndRecord,
|
||||||
options: [
|
options: [
|
||||||
IosAudioCategoryOptions.allowBluetooth,
|
IosAudioCategoryOptions.allowBluetooth,
|
||||||
IosAudioCategoryOptions.allowBluetoothA2DP,
|
|
||||||
IosAudioCategoryOptions.defaultToSpeaker,
|
IosAudioCategoryOptions.defaultToSpeaker,
|
||||||
IosAudioCategoryOptions.duckOthers,
|
IosAudioCategoryOptions.duckOthers,
|
||||||
],
|
],
|
||||||
@@ -593,18 +597,21 @@ class VoiceInputService {
|
|||||||
await _setupVadStreams();
|
await _setupVadStreams();
|
||||||
final settings = _ref?.read(appSettingsProvider);
|
final settings = _ref?.read(appSettingsProvider);
|
||||||
final silenceMs = settings?.voiceSilenceDuration ?? 2000;
|
final silenceMs = settings?.voiceSilenceDuration ?? 2000;
|
||||||
final redemptionFrames = _silenceDurationToFrames(silenceMs);
|
final redemptionFrames = _silenceDurationToFrames(
|
||||||
final endPadFrames = redemptionFrames > 4
|
silenceMs,
|
||||||
? (redemptionFrames / 4).round().clamp(1, redemptionFrames)
|
frameSamples: _vadFrameSamples,
|
||||||
: 1;
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await _vadHandler.startListening(
|
await _vadHandler.startListening(
|
||||||
frameSamples: _vadFrameSamples,
|
frameSamples: _vadFrameSamples,
|
||||||
|
model: 'v5',
|
||||||
|
minSpeechFrames: _vadMinSpeechFrames,
|
||||||
|
preSpeechPadFrames: _vadPreSpeechPadFrames,
|
||||||
redemptionFrames: redemptionFrames,
|
redemptionFrames: redemptionFrames,
|
||||||
endSpeechPadFrames: endPadFrames,
|
endSpeechPadFrames: _vadEndSpeechPadFrames,
|
||||||
preSpeechPadFrames: 2,
|
positiveSpeechThreshold: _vadPositiveSpeechThreshold,
|
||||||
minSpeechFrames: 3,
|
negativeSpeechThreshold: _vadNegativeSpeechThreshold,
|
||||||
submitUserSpeechOnPause: true,
|
submitUserSpeechOnPause: true,
|
||||||
recordConfig: const RecordConfig(
|
recordConfig: const RecordConfig(
|
||||||
encoder: AudioEncoder.pcm16bits,
|
encoder: AudioEncoder.pcm16bits,
|
||||||
@@ -612,15 +619,22 @@ class VoiceInputService {
|
|||||||
numChannels: 1,
|
numChannels: 1,
|
||||||
bitRate: 16,
|
bitRate: 16,
|
||||||
echoCancel: true,
|
echoCancel: true,
|
||||||
autoGain: true,
|
autoGain: false,
|
||||||
noiseSuppress: true,
|
noiseSuppress: true,
|
||||||
androidConfig: AndroidRecordConfig(
|
androidConfig: AndroidRecordConfig(
|
||||||
audioSource: AndroidAudioSource.voiceCommunication,
|
audioSource: AndroidAudioSource.voiceRecognition,
|
||||||
audioManagerMode: AudioManagerMode.modeInCommunication,
|
audioManagerMode: AudioManagerMode.modeInCommunication,
|
||||||
speakerphone: true,
|
speakerphone: true,
|
||||||
manageBluetooth: true,
|
manageBluetooth: true,
|
||||||
useLegacy: false,
|
useLegacy: false,
|
||||||
),
|
),
|
||||||
|
iosConfig: IosRecordConfig(
|
||||||
|
categoryOptions: [
|
||||||
|
IosAudioCategoryOption.allowBluetooth,
|
||||||
|
IosAudioCategoryOption.defaultToSpeaker,
|
||||||
|
IosAudioCategoryOption.duckOthers,
|
||||||
|
],
|
||||||
|
),
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -699,8 +713,9 @@ class VoiceInputService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int _silenceDurationToFrames(int milliseconds) {
|
int _silenceDurationToFrames(int milliseconds, {int? frameSamples}) {
|
||||||
final frameDurationMs = (_vadFrameSamples / _vadSampleRate) * 1000;
|
final samples = frameSamples ?? _vadFrameSamples;
|
||||||
|
final frameDurationMs = (samples / _vadSampleRate) * 1000;
|
||||||
final frames = (milliseconds / frameDurationMs).round();
|
final frames = (milliseconds / frameDurationMs).round();
|
||||||
return frames.clamp(4, 50);
|
return frames.clamp(4, 50);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user