feat(chat): add llama.cpp server token speed parsing support

2025-12-16 15:44:44 +05:30
parent 67bcd242bc
commit 5e078c05f7
1 changed file with 32 additions and 5 deletions
@@ -1532,10 +1532,26 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
        ? _parseNum(completionDetails['reasoning_tokens'])
        : null;

+    // llama.cpp server format: pre-calculated tokens/second values
+    final predictedPerSecond = _parseNum(usage['predicted_per_second']);
+    final promptPerSecond = _parseNum(usage['prompt_per_second']);
+    final predictedN = _parseNum(usage['predicted_n']);
+    final promptN = _parseNum(usage['prompt_n']);
+
    // --- Token Generation Speed ---
-    // Priority: Ollama format > Groq/OpenAI extended format > token count only
-    if (evalCount != null && evalDuration != null && evalDuration > 0) {
-      // Ollama/llama.cpp: duration in nanoseconds
+    // Priority: llama.cpp direct > Ollama calculated > Groq/OpenAI > count only
+    if (predictedPerSecond != null && predictedPerSecond > 0) {
+      // llama.cpp server: pre-calculated tokens/second
+      stats.add(
+        _UsageStatRow(
+          label: l10n.usageTokenGeneration,
+          value: l10n.usageTokensPerSecond(predictedPerSecond.toStringAsFixed(1)),
+          detail: predictedN != null ? l10n.usageTokenCount(predictedN.toInt()) : null,
+          theme: theme,
+        ),
+      );
+    } else if (evalCount != null && evalDuration != null && evalDuration > 0) {
+      // Ollama: duration in nanoseconds
      final tgSpeed = evalCount / (evalDuration / 1e9);
      stats.add(
        _UsageStatRow(
@@ -1570,10 +1586,21 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
    }

    // --- Prompt Processing Speed ---
-    if (promptEvalCount != null &&
+    // Priority: llama.cpp direct > Ollama calculated > Groq/OpenAI > count only
+    if (promptPerSecond != null && promptPerSecond > 0) {
+      // llama.cpp server: pre-calculated tokens/second
+      stats.add(
+        _UsageStatRow(
+          label: l10n.usagePromptEval,
+          value: l10n.usageTokensPerSecond(promptPerSecond.toStringAsFixed(1)),
+          detail: promptN != null ? l10n.usageTokenCount(promptN.toInt()) : null,
+          theme: theme,
+        ),
+      );
+    } else if (promptEvalCount != null &&
        promptEvalDuration != null &&
        promptEvalDuration > 0) {
-      // Ollama/llama.cpp: duration in nanoseconds
+      // Ollama: duration in nanoseconds
      final ppSpeed = promptEvalCount / (promptEvalDuration / 1e9);
      stats.add(
        _UsageStatRow(