refactor: optimize regex handling in markdown and tool calls parsing

- Pre-compiled the regex patterns in ConduitMarkdownPreprocessor as static fields so they are no longer rebuilt on every normalize() call during streaming.
- Updated ToolCallsParser to run its cleanup regexes only when a quick substring check finds `<details` in the text (and, for the tool_calls cleanup, also `tool_calls`).
- Updated AssistantMessageWidget to check for `<think>` or `<reasoning>` before running the raw-tag cleanup regexes, skipping that work in the common case where neither tag is present.
2025-10-05 23:51:48 +05:30
parent 9dd27bb4e5
commit 3af46b379b
3 changed files with 81 additions and 60 deletions
@@ -180,17 +180,22 @@ class ToolCallsParser {
      if (seg.isToolCall && seg.entry != null) {
        calls.add(seg.entry!);
      } else if (seg.text != null && seg.text!.isNotEmpty) {
-        // Remove any embedded tool_calls blocks that may have slipped into text
-        final cleaned = seg.text!
-            .replaceAll(
-              RegExp(
-                r'<details\s+type=\"tool_calls\"[^>]*>[\s\S]*?<\/details>',
-                multiLine: true,
-                dotAll: true,
-              ),
-              '',
-            )
-            .trim();
+        final text = seg.text!;
+        // Quick check: only run cleanup regex if tool_calls details might exist
+        // (they should already be parsed as segments, but this is a safety net)
+        String cleaned = text;
+        if (text.contains('<details') && text.contains('tool_calls')) {
+          // Remove any embedded tool_calls blocks that may have slipped into text
+          cleaned = text.replaceAll(
+            RegExp(
+              r'<details\s+type=\"tool_calls\"[^>]*>[\s\S]*?<\/details>',
+              multiLine: true,
+              dotAll: true,
+            ),
+            '',
+          );
+        }
+        cleaned = cleaned.trim();
        if (cleaned.isNotEmpty) buf.write(cleaned);
      }
    }
@@ -290,16 +295,21 @@ class ToolCallsParser {
        }
        buf.write(out);
      } else {
-        // Keep the raw text, but also remove any stray non-tool_calls details blocks
-        final t = (seg.text ?? '').replaceAll(
-          RegExp(
-            r'<details(?!\s+type=\"tool_calls\")[^>]*>[\s\S]*?<\/details>',
-            multiLine: true,
-            dotAll: true,
-          ),
-          '',
-        );
-        if (t.isNotEmpty) buf.write(t);
+        final text = seg.text ?? '';
+        // Quick check: only run cleanup regex if details tags exist
+        String cleaned = text;
+        if (text.contains('<details')) {
+          // Keep the raw text, but also remove any stray non-tool_calls details blocks
+          cleaned = text.replaceAll(
+            RegExp(
+              r'<details(?!\s+type=\"tool_calls\")[^>]*>[\s\S]*?<\/details>',
+              multiLine: true,
+              dotAll: true,
+            ),
+            '',
+          );
+        }
+        if (cleaned.isNotEmpty) buf.write(cleaned);
      }
    }

@@ -724,22 +724,25 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
    // and type="tool_calls") via a custom block syntax, so they won't be rendered as
    // plain text during streaming. This prevents character flashing.

-    // We still clean raw reasoning tags (<think>, <reasoning>) as a fallback.
-    // The server normally converts these to <details> format, but raw mode or
-    // direct API responses might still use them.
-    String cleaned = content
-        .replaceAll(
-          RegExp(r'<think>[\s\S]*?<\/think>', multiLine: true, dotAll: true),
-          '',
-        )
-        .replaceAll(
-          RegExp(
-            r'<reasoning>[\s\S]*?<\/reasoning>',
-            multiLine: true,
-            dotAll: true,
-          ),
-          '',
-        );
+    // Quick check: only run cleanup if raw tags might exist (rare case)
+    String cleaned = content;
+    if (content.contains('<think>') || content.contains('<reasoning>')) {
+      // Clean raw reasoning tags as a fallback for raw mode or direct API responses.
+      // The server normally converts these to <details> format.
+      cleaned = content
+          .replaceAll(
+            RegExp(r'<think>[\s\S]*?<\/think>', multiLine: true, dotAll: true),
+            '',
+          )
+          .replaceAll(
+            RegExp(
+              r'<reasoning>[\s\S]*?<\/reasoning>',
+              multiLine: true,
+              dotAll: true,
+            ),
+            '',
+          );
+    }

    // Process images in the remaining text
    final processedContent = _processContentForImages(cleaned);
@@ -4,6 +4,30 @@
 class ConduitMarkdownPreprocessor {
  const ConduitMarkdownPreprocessor._();

+  // Pre-compile regex patterns for better performance during streaming
+  static final _bulletFenceRegex = RegExp(
+    r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$',
+    multiLine: true,
+  );
+  static final _dedentOpenRegex = RegExp(
+    r'^[ \t]+```([^\n`]*)\s*$',
+    multiLine: true,
+  );
+  static final _dedentCloseRegex = RegExp(
+    r'^[ \t]+```\s*$',
+    multiLine: true,
+  );
+  static final _inlineClosingRegex = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
+  static final _labelThenDashRegex = RegExp(
+    r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)',
+    multiLine: true,
+  );
+  static final _atxEnumRegex = RegExp(
+    r'^(\s{0,3}#{1,6}\s+\d+)\.(\s*)(\S)',
+    multiLine: true,
+  );
+  static final _fenceAtBolRegex = RegExp(r'^\s*```', multiLine: true);
+
  /// Normalises common fence and hard-break issues produced by LLMs.
  static String normalize(String input) {
    if (input.isEmpty) {
@@ -14,58 +38,42 @@ class ConduitMarkdownPreprocessor {

    // Move fenced code blocks that start on the same line as a list item onto
    // their own line so the parser does not treat them as list text.
-    final bulletFence = RegExp(
-      r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$',
-      multiLine: true,
-    );
    output = output.replaceAllMapped(
-      bulletFence,
+      _bulletFenceRegex,
      (match) => '${match[1]}\n```${match[2]}',
    );

    // Dedent opening fences to avoid partial code-block detection when the
    // model indents fences by accident.
-    final dedentOpen = RegExp(r'^[ \t]+```([^\n`]*)\s*$', multiLine: true);
-    output = output.replaceAllMapped(dedentOpen, (match) => '```${match[1]}');
+    output = output.replaceAllMapped(_dedentOpenRegex, (match) => '```${match[1]}');

    // Dedent closing fences for the same reason as the opening fences.
-    final dedentClose = RegExp(r'^[ \t]+```\s*$', multiLine: true);
-    output = output.replaceAllMapped(dedentClose, (_) => '```');
+    output = output.replaceAllMapped(_dedentCloseRegex, (_) => '```');

    // Ensure closing fences stand alone. Prevents situations like `}\n```foo`
    // from keeping trailing braces inside the code block.
-    final inlineClosing = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
    output = output.replaceAllMapped(
-      inlineClosing,
+      _inlineClosingRegex,
      (match) => '${match[1]}\n```',
    );

    // Insert a blank line when a "label: value" line is followed by a
    // horizontal rule so it is not treated as a Setext heading underline.
-    final labelThenDash = RegExp(
-      r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)',
-      multiLine: true,
-    );
    output = output.replaceAllMapped(
-      labelThenDash,
+      _labelThenDashRegex,
      (match) => '${match[1]}\n\n${match[2]}',
    );

    // Allow headings like "## 1. Summary" without triggering ordered-list
    // parsing by inserting a zero-width joiner after the numeric marker.
-    final atxEnum = RegExp(
-      r'^(\s{0,3}#{1,6}\s+\d+)\.(\s*)(\S)',
-      multiLine: true,
-    );
    output = output.replaceAllMapped(
-      atxEnum,
+      _atxEnumRegex,
      (match) => '${match[1]}.\u200C${match[2]}${match[3]}',
    );

    // Auto-close an unmatched opening fence at EOF to avoid the entire tail
    // of the message rendering as code.
-    final fenceAtBol = RegExp(r'^\s*```', multiLine: true);
-    final fenceCount = fenceAtBol.allMatches(output).length;
+    final fenceCount = _fenceAtBolRegex.allMatches(output).length;
    if (fenceCount.isOdd) {
      if (!output.endsWith('\n')) {
        output += '\n';