feat(stt): add server-side speech-to-text

This commit is contained in:
cogwheel0
2025-11-02 19:02:37 +05:30
parent 99f867bf54
commit 86339715b1
16 changed files with 916 additions and 92 deletions

View File

@@ -4,7 +4,7 @@ import 'dart:io';
import 'package:dio/dio.dart';
import 'package:dio/io.dart';
import 'package:flutter/foundation.dart';
// import 'package:http_parser/http_parser.dart';
import 'package:http_parser/http_parser.dart';
// Removed legacy websocket/socket.io imports
import 'package:uuid/uuid.dart';
import '../models/backend_config.dart';
@@ -1607,6 +1607,55 @@ class ApiService {
return [];
}
/// Uploads [audioBytes] to the server transcription endpoint and returns the
/// decoded response.
///
/// The audio is sent as the `file` part of a multipart form posted to
/// `/api/v1/audio/transcriptions`.
///
/// * [fileName] is trimmed; when null or blank, `audio.m4a` is used.
/// * [mimeType] is trimmed; when null or blank, it is inferred from the
///   resolved file name via [_inferMimeTypeFromName].
/// * [language] is trimmed and only sent when it is non-blank.
///
/// Returns the response body when it is a `Map<String, dynamic>`. A plain
/// string body is wrapped as `{'text': body}`.
///
/// Throws an [ArgumentError] when [audioBytes] is empty and a [StateError]
/// when the response body is neither a map nor a string. Errors raised by the
/// request itself are not caught here.
Future<Map<String, dynamic>> transcribeSpeech({
  required Uint8List audioBytes,
  String? fileName,
  String? mimeType,
  String? language,
}) async {
  if (audioBytes.isEmpty) {
    throw ArgumentError('audioBytes cannot be empty for transcription');
  }

  // Normalise the optional inputs once: null and whitespace-only values are
  // both treated as "not provided".
  final trimmedName = fileName?.trim() ?? '';
  final trimmedMime = mimeType?.trim() ?? '';
  final trimmedLanguage = language?.trim() ?? '';

  final sanitizedFileName = trimmedName.isEmpty ? 'audio.m4a' : trimmedName;
  final contentType = trimmedMime.isEmpty
      ? _inferMimeTypeFromName(sanitizedFileName)
      : trimmedMime;

  _traceApi(
    'Uploading $sanitizedFileName (${audioBytes.length} bytes) for transcription',
  );

  final audioPart = MultipartFile.fromBytes(
    audioBytes,
    filename: sanitizedFileName,
    contentType: _parseMediaType(contentType),
  );
  final payload = FormData.fromMap({
    'file': audioPart,
    if (trimmedLanguage.isNotEmpty) 'language': trimmedLanguage,
  });

  final response = await _dio.post(
    '/api/v1/audio/transcriptions',
    data: payload,
    options: Options(headers: const {'accept': 'application/json'}),
  );

  final data = response.data;
  return switch (data) {
    final Map<String, dynamic> json => json,
    // A bare string body is exposed under the `text` key.
    final String text => <String, dynamic>{'text': text},
    _ => throw StateError(
        'Unexpected transcription response type: ${data.runtimeType}',
      ),
  };
}
Future<({Uint8List bytes, String mimeType})> generateSpeech({
required String text,
String? voice,
@@ -1690,7 +1739,43 @@ class ApiService {
return bytes.length >= 2 && bytes[0] == 0xFF && (bytes[1] & 0xE0) == 0xE0;
}
// Helpers for server audio transcription uploads (used by transcribeSpeech).
/// Returns an audio MIME type guessed from the extension of [name].
///
/// The extension is the text after the last `.` in [name], compared
/// case-insensitively. Names without a dot, names ending in a dot, and
/// unrecognised extensions all yield `audio/mpeg`.
String _inferMimeTypeFromName(String name) {
  const fallback = 'audio/mpeg';

  final lastDot = name.lastIndexOf('.');
  final hasExtension = lastDot >= 0 && lastDot < name.length - 1;
  if (!hasExtension) {
    return fallback;
  }

  return switch (name.substring(lastDot + 1).toLowerCase()) {
    'wav' => 'audio/wav',
    'ogg' => 'audio/ogg',
    'm4a' || 'mp4' => 'audio/mp4',
    'aac' => 'audio/aac',
    'webm' => 'audio/webm',
    'flac' => 'audio/flac',
    'mp3' => fallback,
    _ => fallback,
  };
}
/// Parses [value] into a [MediaType], or returns `null` when that is not
/// possible.
///
/// `null` is returned for a null or empty [value], and also when
/// [MediaType.parse] throws, so callers can treat the content type as
/// optional.
MediaType? _parseMediaType(String? value) {
  if (value case final raw? when raw.isNotEmpty) {
    try {
      return MediaType.parse(raw);
    } catch (_) {
      // Best effort: an unparsable value is treated the same as a missing one.
      return null;
    }
  }
  return null;
}
// Image Generation
Future<List<Map<String, dynamic>>> getImageModels() async {

View File

@@ -8,6 +8,9 @@ import 'animation_service.dart';
part 'settings_service.g.dart';
/// Speech-to-text preference selection.
///
/// Stored in preferences by value name and restored by
/// `SettingsService._parseSttPreference`.
enum SttPreference {
  /// Default preference; also used when the stored value is missing or
  /// unrecognised.
  ///
  /// NOTE(review): presumably lets the app choose between device and server
  /// recognition on its own — confirm against the voice input flow.
  auto,

  /// NOTE(review): by its name, limits recognition to the on-device engine —
  /// confirm against the voice input flow.
  deviceOnly,

  /// NOTE(review): by its name, limits recognition to server transcription —
  /// confirm against the voice input flow.
  serverOnly,
}
/// Text-to-speech engine selection.
///
/// Stored in preferences by value name.
enum TtsEngine {
  /// NOTE(review): by its name, speech is synthesised on the device — confirm
  /// against the TTS service.
  device,

  /// NOTE(review): by its name, speech is synthesised by the server — confirm
  /// against the TTS service.
  server,
}
@@ -151,6 +154,9 @@ class SettingsService {
ttsServerVoiceId: box.get(PreferenceKeys.ttsServerVoiceId) as String?,
ttsServerVoiceName:
box.get(PreferenceKeys.ttsServerVoiceName) as String?,
sttPreference: _parseSttPreference(
box.get(PreferenceKeys.voiceSttPreference) as String?,
),
),
);
}
@@ -174,6 +180,7 @@ class SettingsService {
PreferenceKeys.ttsPitch: settings.ttsPitch,
PreferenceKeys.ttsVolume: settings.ttsVolume,
PreferenceKeys.ttsEngine: settings.ttsEngine.name,
PreferenceKeys.voiceSttPreference: settings.sttPreference.name,
};
await box.putAll(updates);
@@ -224,6 +231,22 @@ class SettingsService {
}
}
/// Converts a stored preference string into an [SttPreference].
///
/// Matching is case-insensitive. `deviceonly`, `device_only` and `device`
/// map to [SttPreference.deviceOnly]; `serveronly`, `server_only` and
/// `server` map to [SttPreference.serverOnly]. Everything else, including
/// `auto`, `null` and unknown values, maps to [SttPreference.auto].
static SttPreference _parseSttPreference(String? raw) {
  final normalized = raw?.toLowerCase() ?? '';
  return switch (normalized) {
    'deviceonly' || 'device_only' || 'device' => SttPreference.deviceOnly,
    'serveronly' || 'server_only' || 'server' => SttPreference.serverOnly,
    // 'auto' and any unrecognised value fall back to the default.
    _ => SttPreference.auto,
  };
}
// Voice input specific settings
static Future<String?> getVoiceLocaleId() {
final value = _preferencesBox().get(_voiceLocaleKey) as String?;
@@ -359,6 +382,7 @@ class AppSettings {
final String socketTransportMode; // 'polling' or 'ws'
final List<String> quickPills; // e.g., ['web','image']
final bool sendOnEnter;
final SttPreference sttPreference;
final String? ttsVoice;
final double ttsSpeechRate;
final double ttsPitch;
@@ -380,6 +404,7 @@ class AppSettings {
this.socketTransportMode = 'ws',
this.quickPills = const [],
this.sendOnEnter = false,
this.sttPreference = SttPreference.auto,
this.ttsVoice,
this.ttsSpeechRate = 0.5,
this.ttsPitch = 1.0,
@@ -403,6 +428,7 @@ class AppSettings {
String? socketTransportMode,
List<String>? quickPills,
bool? sendOnEnter,
SttPreference? sttPreference,
Object? ttsVoice = const _DefaultValue(),
double? ttsSpeechRate,
double? ttsPitch,
@@ -429,6 +455,7 @@ class AppSettings {
socketTransportMode: socketTransportMode ?? this.socketTransportMode,
quickPills: quickPills ?? this.quickPills,
sendOnEnter: sendOnEnter ?? this.sendOnEnter,
sttPreference: sttPreference ?? this.sttPreference,
ttsVoice: ttsVoice is _DefaultValue ? this.ttsVoice : ttsVoice as String?,
ttsSpeechRate: ttsSpeechRate ?? this.ttsSpeechRate,
ttsPitch: ttsPitch ?? this.ttsPitch,
@@ -457,6 +484,7 @@ class AppSettings {
other.voiceLocaleId == voiceLocaleId &&
other.voiceHoldToTalk == voiceHoldToTalk &&
other.voiceAutoSendFinal == voiceAutoSendFinal &&
other.sttPreference == sttPreference &&
other.sendOnEnter == sendOnEnter &&
other.ttsVoice == ttsVoice &&
other.ttsSpeechRate == ttsSpeechRate &&
@@ -471,7 +499,7 @@ class AppSettings {
@override
int get hashCode {
return Object.hash(
return Object.hashAll([
reduceMotion,
animationSpeed,
hapticFeedback,
@@ -482,6 +510,7 @@ class AppSettings {
voiceLocaleId,
voiceHoldToTalk,
voiceAutoSendFinal,
sttPreference,
socketTransportMode,
sendOnEnter,
ttsVoice,
@@ -492,7 +521,7 @@ class AppSettings {
ttsServerVoiceId,
ttsServerVoiceName,
Object.hashAllUnordered(quickPills),
);
]);
}
}
@@ -603,6 +632,14 @@ class AppSettingsNotifier extends _$AppSettingsNotifier {
await SettingsService.setSendOnEnter(value);
}
/// Switches the speech-to-text preference to [preference] and persists the
/// updated settings.
///
/// Does nothing when [preference] already matches the current state, so no
/// state update or save happens for a no-op change.
Future<void> setSttPreference(SttPreference preference) async {
  final hasChanged = state.sttPreference != preference;
  if (hasChanged) {
    state = state.copyWith(sttPreference: preference);
    await SettingsService.saveSettings(state);
  }
}
Future<void> setTtsVoice(String? voice) async {
state = state.copyWith(ttsVoice: voice);
await SettingsService.saveSettings(state);