feat(sts): add server side speech-to-text

This commit is contained in:
cogwheel0
2025-11-02 19:02:37 +05:30
parent 99f867bf54
commit 86339715b1
16 changed files with 916 additions and 92 deletions

View File

@@ -4,7 +4,7 @@ import 'dart:io';
import 'package:dio/dio.dart';
import 'package:dio/io.dart';
import 'package:flutter/foundation.dart';
// import 'package:http_parser/http_parser.dart';
import 'package:http_parser/http_parser.dart';
// Removed legacy websocket/socket.io imports
import 'package:uuid/uuid.dart';
import '../models/backend_config.dart';
@@ -1607,6 +1607,55 @@ class ApiService {
return [];
}
Future<Map<String, dynamic>> transcribeSpeech({
required Uint8List audioBytes,
String? fileName,
String? mimeType,
String? language,
}) async {
if (audioBytes.isEmpty) {
throw ArgumentError('audioBytes cannot be empty for transcription');
}
final sanitizedFileName = (fileName != null && fileName.trim().isNotEmpty
? fileName.trim()
: 'audio.m4a');
final resolvedMimeType = (mimeType != null && mimeType.trim().isNotEmpty)
? mimeType.trim()
: _inferMimeTypeFromName(sanitizedFileName);
_traceApi(
'Uploading $sanitizedFileName (${audioBytes.length} bytes) for transcription',
);
final formData = FormData.fromMap({
'file': MultipartFile.fromBytes(
audioBytes,
filename: sanitizedFileName,
contentType: _parseMediaType(resolvedMimeType),
),
if (language != null && language.trim().isNotEmpty)
'language': language.trim(),
});
final response = await _dio.post(
'/api/v1/audio/transcriptions',
data: formData,
options: Options(headers: const {'accept': 'application/json'}),
);
final data = response.data;
if (data is Map<String, dynamic>) {
return data;
}
if (data is String) {
return {'text': data};
}
throw StateError(
'Unexpected transcription response type: ${data.runtimeType}',
);
}
Future<({Uint8List bytes, String mimeType})> generateSpeech({
required String text,
String? voice,
@@ -1690,7 +1739,43 @@ class ApiService {
return bytes.length >= 2 && bytes[0] == 0xFF && (bytes[1] & 0xE0) == 0xE0;
}
// Server audio transcription removed; rely on on-device STT in UI layer
String _inferMimeTypeFromName(String name) {
final dotIndex = name.lastIndexOf('.');
if (dotIndex == -1 || dotIndex == name.length - 1) {
return 'audio/mpeg';
}
final ext = name.substring(dotIndex + 1).toLowerCase();
switch (ext) {
case 'wav':
return 'audio/wav';
case 'ogg':
return 'audio/ogg';
case 'm4a':
case 'mp4':
return 'audio/mp4';
case 'aac':
return 'audio/aac';
case 'webm':
return 'audio/webm';
case 'flac':
return 'audio/flac';
case 'mp3':
return 'audio/mpeg';
default:
return 'audio/mpeg';
}
}
MediaType? _parseMediaType(String? value) {
if (value == null || value.isEmpty) {
return null;
}
try {
return MediaType.parse(value);
} catch (_) {
return null;
}
}
// Image Generation
Future<List<Map<String, dynamic>>> getImageModels() async {