refactor: optimize regex handling in markdown and tool calls parsing

- Pre-compiled the regex patterns in ConduitMarkdownPreprocessor as static fields so they are no longer rebuilt on every normalize() call during streaming.
- Changed the ToolCallsParser to run its cleanup regexes only when a cheap substring check finds `<details` (and, for the tool_calls cleanup, also `tool_calls`) in the segment text; otherwise the regex pass is skipped.
- Updated the AssistantMessageWidget to strip raw reasoning tags only when the content contains `<think>` or `<reasoning>`, avoiding two regex passes over content that has neither.
This commit is contained in:
cogwheel0
2025-10-05 23:51:48 +05:30
parent 9dd27bb4e5
commit 3af46b379b
3 changed files with 81 additions and 60 deletions

View File

@@ -180,17 +180,22 @@ class ToolCallsParser {
if (seg.isToolCall && seg.entry != null) { if (seg.isToolCall && seg.entry != null) {
calls.add(seg.entry!); calls.add(seg.entry!);
} else if (seg.text != null && seg.text!.isNotEmpty) { } else if (seg.text != null && seg.text!.isNotEmpty) {
// Remove any embedded tool_calls blocks that may have slipped into text final text = seg.text!;
final cleaned = seg.text! // Quick check: only run cleanup regex if tool_calls details might exist
.replaceAll( // (they should already be parsed as segments, but this is a safety net)
RegExp( String cleaned = text;
r'<details\s+type=\"tool_calls\"[^>]*>[\s\S]*?<\/details>', if (text.contains('<details') && text.contains('tool_calls')) {
multiLine: true, // Remove any embedded tool_calls blocks that may have slipped into text
dotAll: true, cleaned = text.replaceAll(
), RegExp(
'', r'<details\s+type=\"tool_calls\"[^>]*>[\s\S]*?<\/details>',
) multiLine: true,
.trim(); dotAll: true,
),
'',
);
}
cleaned = cleaned.trim();
if (cleaned.isNotEmpty) buf.write(cleaned); if (cleaned.isNotEmpty) buf.write(cleaned);
} }
} }
@@ -290,16 +295,21 @@ class ToolCallsParser {
} }
buf.write(out); buf.write(out);
} else { } else {
// Keep the raw text, but also remove any stray non-tool_calls details blocks final text = seg.text ?? '';
final t = (seg.text ?? '').replaceAll( // Quick check: only run cleanup regex if details tags exist
RegExp( String cleaned = text;
r'<details(?!\s+type=\"tool_calls\")[^>]*>[\s\S]*?<\/details>', if (text.contains('<details')) {
multiLine: true, // Keep the raw text, but also remove any stray non-tool_calls details blocks
dotAll: true, cleaned = text.replaceAll(
), RegExp(
'', r'<details(?!\s+type=\"tool_calls\")[^>]*>[\s\S]*?<\/details>',
); multiLine: true,
if (t.isNotEmpty) buf.write(t); dotAll: true,
),
'',
);
}
if (cleaned.isNotEmpty) buf.write(cleaned);
} }
} }

View File

@@ -724,22 +724,25 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
// and type="tool_calls") via a custom block syntax, so they won't be rendered as // and type="tool_calls") via a custom block syntax, so they won't be rendered as
// plain text during streaming. This prevents character flashing. // plain text during streaming. This prevents character flashing.
// We still clean raw reasoning tags (<think>, <reasoning>) as a fallback. // Quick check: only run cleanup if raw tags might exist (rare case)
// The server normally converts these to <details> format, but raw mode or String cleaned = content;
// direct API responses might still use them. if (content.contains('<think>') || content.contains('<reasoning>')) {
String cleaned = content // Clean raw reasoning tags as a fallback for raw mode or direct API responses.
.replaceAll( // The server normally converts these to <details> format.
RegExp(r'<think>[\s\S]*?<\/think>', multiLine: true, dotAll: true), cleaned = content
'', .replaceAll(
) RegExp(r'<think>[\s\S]*?<\/think>', multiLine: true, dotAll: true),
.replaceAll( '',
RegExp( )
r'<reasoning>[\s\S]*?<\/reasoning>', .replaceAll(
multiLine: true, RegExp(
dotAll: true, r'<reasoning>[\s\S]*?<\/reasoning>',
), multiLine: true,
'', dotAll: true,
); ),
'',
);
}
// Process images in the remaining text // Process images in the remaining text
final processedContent = _processContentForImages(cleaned); final processedContent = _processContentForImages(cleaned);

View File

@@ -4,6 +4,30 @@
class ConduitMarkdownPreprocessor { class ConduitMarkdownPreprocessor {
const ConduitMarkdownPreprocessor._(); const ConduitMarkdownPreprocessor._();
// Pre-compile regex patterns for better performance during streaming.
// Each pattern is held in a static final field and reused by normalize()
// instead of being constructed again on every call.

// Matches a list-item line (`*`, `+`, `-` or `1.`-style marker) that opens a
// code fence on the same line. Group 1 is the list marker with its
// surrounding whitespace; group 2 is the optional fence info string.
static final _bulletFenceRegex = RegExp(
r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$',
multiLine: true,
);
// Matches a fence line indented by spaces/tabs. Group 1 is whatever follows
// the backticks on that line (no backticks or newlines), so the fence can be
// re-emitted at column 0 with the same trailing text.
static final _dedentOpenRegex = RegExp(
r'^[ \t]+```([^\n`]*)\s*$',
multiLine: true,
);
// Matches a bare fence line (nothing but whitespace after the backticks)
// that is indented by spaces/tabs.
static final _dedentCloseRegex = RegExp(
r'^[ \t]+```\s*$',
multiLine: true,
);
// Matches a fence glued to the end of a line of text: group 1 is the single
// non-newline, non-backtick character right before the backticks, and the
// lookahead requires only whitespace up to the next line break or the end of
// input. multiLine is not set, so `$` here means end of input only.
static final _inlineClosingRegex = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
// Matches a line that starts with a `**bold**` span and is immediately
// followed by a line of three or more dashes. Group 1 is the label line and
// group 2 is the dash line; normalize() puts a blank line between them.
static final _labelThenDashRegex = RegExp(
r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)',
multiLine: true,
);
// Matches an ATX heading (`#` to `######`, up to three leading whitespace
// characters) whose text begins with digits followed by a dot, e.g.
// "## 1. Summary". Group 1 is everything up to the digits, group 2 the
// whitespace after the dot, group 3 the first non-whitespace character after
// it. normalize() re-emits the match with U+200C inserted after the dot.
static final _atxEnumRegex = RegExp(
r'^(\s{0,3}#{1,6}\s+\d+)\.(\s*)(\S)',
multiLine: true,
);
// Matches a fence at the beginning of a line, optionally preceded by
// whitespace. normalize() counts the matches and treats an odd count as an
// unmatched opening fence.
static final _fenceAtBolRegex = RegExp(r'^\s*```', multiLine: true);
/// Normalises common fence and hard-break issues produced by LLMs. /// Normalises common fence and hard-break issues produced by LLMs.
static String normalize(String input) { static String normalize(String input) {
if (input.isEmpty) { if (input.isEmpty) {
@@ -14,58 +38,42 @@ class ConduitMarkdownPreprocessor {
// Move fenced code blocks that start on the same line as a list item onto // Move fenced code blocks that start on the same line as a list item onto
// their own line so the parser does not treat them as list text. // their own line so the parser does not treat them as list text.
final bulletFence = RegExp(
r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$',
multiLine: true,
);
output = output.replaceAllMapped( output = output.replaceAllMapped(
bulletFence, _bulletFenceRegex,
(match) => '${match[1]}\n```${match[2]}', (match) => '${match[1]}\n```${match[2]}',
); );
// Dedent opening fences to avoid partial code-block detection when the // Dedent opening fences to avoid partial code-block detection when the
// model indents fences by accident. // model indents fences by accident.
final dedentOpen = RegExp(r'^[ \t]+```([^\n`]*)\s*$', multiLine: true); output = output.replaceAllMapped(_dedentOpenRegex, (match) => '```${match[1]}');
output = output.replaceAllMapped(dedentOpen, (match) => '```${match[1]}');
// Dedent closing fences for the same reason as the opening fences. // Dedent closing fences for the same reason as the opening fences.
final dedentClose = RegExp(r'^[ \t]+```\s*$', multiLine: true); output = output.replaceAllMapped(_dedentCloseRegex, (_) => '```');
output = output.replaceAllMapped(dedentClose, (_) => '```');
// Ensure closing fences stand alone. Prevents situations like `}\n```foo` // Ensure closing fences stand alone. Prevents situations like `}\n```foo`
// from keeping trailing braces inside the code block. // from keeping trailing braces inside the code block.
final inlineClosing = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
output = output.replaceAllMapped( output = output.replaceAllMapped(
inlineClosing, _inlineClosingRegex,
(match) => '${match[1]}\n```', (match) => '${match[1]}\n```',
); );
// Insert a blank line when a "label: value" line is followed by a // Insert a blank line when a "label: value" line is followed by a
// horizontal rule so it is not treated as a Setext heading underline. // horizontal rule so it is not treated as a Setext heading underline.
final labelThenDash = RegExp(
r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)',
multiLine: true,
);
output = output.replaceAllMapped( output = output.replaceAllMapped(
labelThenDash, _labelThenDashRegex,
(match) => '${match[1]}\n\n${match[2]}', (match) => '${match[1]}\n\n${match[2]}',
); );
// Allow headings like "## 1. Summary" without triggering ordered-list // Allow headings like "## 1. Summary" without triggering ordered-list
// parsing by inserting a zero-width non-joiner (U+200C) after the numeric marker. // parsing by inserting a zero-width non-joiner (U+200C) after the numeric marker.
final atxEnum = RegExp(
r'^(\s{0,3}#{1,6}\s+\d+)\.(\s*)(\S)',
multiLine: true,
);
output = output.replaceAllMapped( output = output.replaceAllMapped(
atxEnum, _atxEnumRegex,
(match) => '${match[1]}.\u200C${match[2]}${match[3]}', (match) => '${match[1]}.\u200C${match[2]}${match[3]}',
); );
// Auto-close an unmatched opening fence at EOF to avoid the entire tail // Auto-close an unmatched opening fence at EOF to avoid the entire tail
// of the message rendering as code. // of the message rendering as code.
final fenceAtBol = RegExp(r'^\s*```', multiLine: true); final fenceCount = _fenceAtBolRegex.allMatches(output).length;
final fenceCount = fenceAtBol.allMatches(output).length;
if (fenceCount.isOdd) { if (fenceCount.isOdd) {
if (!output.endsWith('\n')) { if (!output.endsWith('\n')) {
output += '\n'; output += '\n';