feat(stt): add server-side speech-to-text
This commit is contained in:
@@ -11,6 +11,7 @@ final class PreferenceKeys {
|
||||
static const String voiceLocaleId = 'voice_locale_id';
|
||||
static const String voiceHoldToTalk = 'voice_hold_to_talk';
|
||||
static const String voiceAutoSendFinal = 'voice_auto_send_final';
|
||||
static const String voiceSttPreference = 'voice_stt_preference';
|
||||
static const String socketTransportMode = 'socket_transport_mode';
|
||||
static const String quickPills = 'quick_pills';
|
||||
static const String sendOnEnterKey = 'send_on_enter';
|
||||
|
||||
@@ -90,6 +90,7 @@ class PersistenceMigrator {
|
||||
copyString(PreferenceKeys.voiceLocaleId);
|
||||
copyBool(PreferenceKeys.voiceHoldToTalk);
|
||||
copyBool(PreferenceKeys.voiceAutoSendFinal);
|
||||
copyString(PreferenceKeys.voiceSttPreference);
|
||||
copyString(PreferenceKeys.socketTransportMode);
|
||||
copyStringList(PreferenceKeys.quickPills);
|
||||
copyBool(PreferenceKeys.sendOnEnterKey);
|
||||
@@ -194,6 +195,7 @@ class PersistenceMigrator {
|
||||
PreferenceKeys.voiceLocaleId,
|
||||
PreferenceKeys.voiceHoldToTalk,
|
||||
PreferenceKeys.voiceAutoSendFinal,
|
||||
PreferenceKeys.voiceSttPreference,
|
||||
PreferenceKeys.socketTransportMode,
|
||||
PreferenceKeys.quickPills,
|
||||
PreferenceKeys.sendOnEnterKey,
|
||||
|
||||
@@ -4,7 +4,7 @@ import 'dart:io';
|
||||
import 'package:dio/dio.dart';
|
||||
import 'package:dio/io.dart';
|
||||
import 'package:flutter/foundation.dart';
|
||||
// import 'package:http_parser/http_parser.dart';
|
||||
import 'package:http_parser/http_parser.dart';
|
||||
// Removed legacy websocket/socket.io imports
|
||||
import 'package:uuid/uuid.dart';
|
||||
import '../models/backend_config.dart';
|
||||
@@ -1607,6 +1607,55 @@ class ApiService {
|
||||
return [];
|
||||
}
|
||||
|
||||
/// Uploads [audioBytes] as multipart form data to
/// `/api/v1/audio/transcriptions` and returns the decoded response.
///
/// [fileName] falls back to `audio.m4a` when it is null or blank.
/// [mimeType] falls back to a type inferred from the resolved file name.
/// [language] is only included in the form when it is non-blank.
///
/// Returns the response body as-is when it is a `Map<String, dynamic>`, or
/// wraps a plain string body as `{'text': body}`.
///
/// Throws [ArgumentError] when [audioBytes] is empty and [StateError] when
/// the response body is neither a map nor a string.
Future<Map<String, dynamic>> transcribeSpeech({
  required Uint8List audioBytes,
  String? fileName,
  String? mimeType,
  String? language,
}) async {
  if (audioBytes.isEmpty) {
    throw ArgumentError('audioBytes cannot be empty for transcription');
  }

  // Normalize the optional inputs once so each is trimmed a single time.
  final trimmedName = fileName?.trim() ?? '';
  final sanitizedFileName = trimmedName.isEmpty ? 'audio.m4a' : trimmedName;

  final trimmedMime = mimeType?.trim() ?? '';
  final resolvedMimeType = trimmedMime.isEmpty
      ? _inferMimeTypeFromName(sanitizedFileName)
      : trimmedMime;

  final trimmedLanguage = language?.trim() ?? '';

  _traceApi(
    'Uploading $sanitizedFileName (${audioBytes.length} bytes) for transcription',
  );

  final upload = MultipartFile.fromBytes(
    audioBytes,
    filename: sanitizedFileName,
    contentType: _parseMediaType(resolvedMimeType),
  );
  final formData = FormData.fromMap({
    'file': upload,
    if (trimmedLanguage.isNotEmpty) 'language': trimmedLanguage,
  });

  final response = await _dio.post(
    '/api/v1/audio/transcriptions',
    data: formData,
    options: Options(headers: const {'accept': 'application/json'}),
  );

  // Accept either a JSON object or a bare string body; anything else is
  // reported with its runtime type.
  final data = response.data;
  return switch (data) {
    final Map<String, dynamic> json => json,
    final String text => {'text': text},
    _ => throw StateError(
        'Unexpected transcription response type: ${data.runtimeType}',
      ),
  };
}
|
||||
|
||||
Future<({Uint8List bytes, String mimeType})> generateSpeech({
|
||||
required String text,
|
||||
String? voice,
|
||||
@@ -1690,7 +1739,43 @@ class ApiService {
|
||||
return bytes.length >= 2 && bytes[0] == 0xFF && (bytes[1] & 0xE0) == 0xE0;
|
||||
}
|
||||
|
||||
// Server audio transcription removed; rely on on-device STT in UI layer
|
||||
/// Infers an audio MIME type from the file extension of [name].
///
/// The extension is everything after the last `.` and is matched
/// case-insensitively. Names without a dot, names ending in a dot, and
/// unrecognized extensions (including `mp3`) resolve to `audio/mpeg`.
String _inferMimeTypeFromName(String name) {
  const fallback = 'audio/mpeg';

  final dotIndex = name.lastIndexOf('.');
  final hasExtension = dotIndex != -1 && dotIndex != name.length - 1;
  if (!hasExtension) {
    return fallback;
  }

  final ext = name.substring(dotIndex + 1).toLowerCase();
  return switch (ext) {
    'wav' || 'wave' => 'audio/wav',
    // `.oga` and `.opus` files are Ogg containers, so they share audio/ogg
    // instead of falling through to the MPEG default.
    'ogg' || 'oga' || 'opus' => 'audio/ogg',
    'm4a' || 'mp4' => 'audio/mp4',
    'aac' => 'audio/aac',
    'webm' || 'weba' => 'audio/webm',
    'flac' => 'audio/flac',
    // `mp3` and anything unknown use the MPEG fallback.
    _ => fallback,
  };
}
|
||||
|
||||
/// Parses [value] into a [MediaType].
///
/// Returns null when [value] is null or empty, or when [MediaType.parse]
/// throws for it.
MediaType? _parseMediaType(String? value) {
  final raw = value ?? '';
  if (raw.isNotEmpty) {
    try {
      return MediaType.parse(raw);
    } catch (_) {
      // Best effort: an unparsable value is treated as "no media type".
    }
  }
  return null;
}
|
||||
|
||||
// Image Generation
|
||||
Future<List<Map<String, dynamic>>> getImageModels() async {
|
||||
|
||||
@@ -8,6 +8,9 @@ import 'animation_service.dart';
|
||||
|
||||
part 'settings_service.g.dart';
|
||||
|
||||
/// Speech-to-text preference selection.
|
||||
enum SttPreference {
  // NOTE(review): presumably lets the app choose between device and server
  // recognition; the selection logic is not visible here, so confirm.
  auto,

  // On-device recognition only (per the name; confirm against callers).
  deviceOnly,

  // Server transcription only (per the name; confirm against callers).
  serverOnly,
}
|
||||
|
||||
/// TTS engine selection
|
||||
enum TtsEngine { device, server }
|
||||
|
||||
@@ -151,6 +154,9 @@ class SettingsService {
|
||||
ttsServerVoiceId: box.get(PreferenceKeys.ttsServerVoiceId) as String?,
|
||||
ttsServerVoiceName:
|
||||
box.get(PreferenceKeys.ttsServerVoiceName) as String?,
|
||||
sttPreference: _parseSttPreference(
|
||||
box.get(PreferenceKeys.voiceSttPreference) as String?,
|
||||
),
|
||||
),
|
||||
);
|
||||
}
|
||||
@@ -174,6 +180,7 @@ class SettingsService {
|
||||
PreferenceKeys.ttsPitch: settings.ttsPitch,
|
||||
PreferenceKeys.ttsVolume: settings.ttsVolume,
|
||||
PreferenceKeys.ttsEngine: settings.ttsEngine.name,
|
||||
PreferenceKeys.voiceSttPreference: settings.sttPreference.name,
|
||||
};
|
||||
|
||||
await box.putAll(updates);
|
||||
@@ -224,6 +231,22 @@ class SettingsService {
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps a stored preference string to an [SttPreference].
///
/// Matching is case-insensitive. `deviceonly`, `device_only` and `device`
/// map to [SttPreference.deviceOnly]; `serveronly`, `server_only` and
/// `server` map to [SttPreference.serverOnly]. Every other value, including
/// null and `auto`, maps to [SttPreference.auto].
static SttPreference _parseSttPreference(String? raw) {
  final normalized = raw?.toLowerCase() ?? '';
  return switch (normalized) {
    'deviceonly' || 'device_only' || 'device' => SttPreference.deviceOnly,
    'serveronly' || 'server_only' || 'server' => SttPreference.serverOnly,
    _ => SttPreference.auto,
  };
}
|
||||
|
||||
// Voice input specific settings
|
||||
static Future<String?> getVoiceLocaleId() {
|
||||
final value = _preferencesBox().get(_voiceLocaleKey) as String?;
|
||||
@@ -359,6 +382,7 @@ class AppSettings {
|
||||
final String socketTransportMode; // 'polling' or 'ws'
|
||||
final List<String> quickPills; // e.g., ['web','image']
|
||||
final bool sendOnEnter;
|
||||
final SttPreference sttPreference;
|
||||
final String? ttsVoice;
|
||||
final double ttsSpeechRate;
|
||||
final double ttsPitch;
|
||||
@@ -380,6 +404,7 @@ class AppSettings {
|
||||
this.socketTransportMode = 'ws',
|
||||
this.quickPills = const [],
|
||||
this.sendOnEnter = false,
|
||||
this.sttPreference = SttPreference.auto,
|
||||
this.ttsVoice,
|
||||
this.ttsSpeechRate = 0.5,
|
||||
this.ttsPitch = 1.0,
|
||||
@@ -403,6 +428,7 @@ class AppSettings {
|
||||
String? socketTransportMode,
|
||||
List<String>? quickPills,
|
||||
bool? sendOnEnter,
|
||||
SttPreference? sttPreference,
|
||||
Object? ttsVoice = const _DefaultValue(),
|
||||
double? ttsSpeechRate,
|
||||
double? ttsPitch,
|
||||
@@ -429,6 +455,7 @@ class AppSettings {
|
||||
socketTransportMode: socketTransportMode ?? this.socketTransportMode,
|
||||
quickPills: quickPills ?? this.quickPills,
|
||||
sendOnEnter: sendOnEnter ?? this.sendOnEnter,
|
||||
sttPreference: sttPreference ?? this.sttPreference,
|
||||
ttsVoice: ttsVoice is _DefaultValue ? this.ttsVoice : ttsVoice as String?,
|
||||
ttsSpeechRate: ttsSpeechRate ?? this.ttsSpeechRate,
|
||||
ttsPitch: ttsPitch ?? this.ttsPitch,
|
||||
@@ -457,6 +484,7 @@ class AppSettings {
|
||||
other.voiceLocaleId == voiceLocaleId &&
|
||||
other.voiceHoldToTalk == voiceHoldToTalk &&
|
||||
other.voiceAutoSendFinal == voiceAutoSendFinal &&
|
||||
other.sttPreference == sttPreference &&
|
||||
other.sendOnEnter == sendOnEnter &&
|
||||
other.ttsVoice == ttsVoice &&
|
||||
other.ttsSpeechRate == ttsSpeechRate &&
|
||||
@@ -471,7 +499,7 @@ class AppSettings {
|
||||
|
||||
@override
|
||||
int get hashCode {
|
||||
return Object.hash(
|
||||
return Object.hashAll([
|
||||
reduceMotion,
|
||||
animationSpeed,
|
||||
hapticFeedback,
|
||||
@@ -482,6 +510,7 @@ class AppSettings {
|
||||
voiceLocaleId,
|
||||
voiceHoldToTalk,
|
||||
voiceAutoSendFinal,
|
||||
sttPreference,
|
||||
socketTransportMode,
|
||||
sendOnEnter,
|
||||
ttsVoice,
|
||||
@@ -492,7 +521,7 @@ class AppSettings {
|
||||
ttsServerVoiceId,
|
||||
ttsServerVoiceName,
|
||||
Object.hashAllUnordered(quickPills),
|
||||
);
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -603,6 +632,14 @@ class AppSettingsNotifier extends _$AppSettingsNotifier {
|
||||
await SettingsService.setSendOnEnter(value);
|
||||
}
|
||||
|
||||
/// Updates the speech-to-text preference in [state] and saves the settings.
///
/// Returns without touching [state] or storage when [preference] already
/// equals the current value.
Future<void> setSttPreference(SttPreference preference) async {
  final unchanged = state.sttPreference == preference;
  if (unchanged) return;

  final updated = state.copyWith(sttPreference: preference);
  state = updated;
  await SettingsService.saveSettings(state);
}
|
||||
|
||||
Future<void> setTtsVoice(String? voice) async {
|
||||
state = state.copyWith(ttsVoice: voice);
|
||||
await SettingsService.saveSettings(state);
|
||||
|
||||
Reference in New Issue
Block a user