refactor: optimize regex handling in markdown and tool calls parsing
- Improved performance by pre-compiling regex patterns in the ConduitMarkdownPreprocessor for better efficiency during streaming.
- Enhanced the ToolCallsParser to conditionally run the cleanup regex only when necessary, ensuring cleaner and more efficient text processing.
- Updated the AssistantMessageWidget to perform quick checks before cleaning raw tags, reducing unnecessary operations and improving overall performance.
This commit is contained in:
@@ -180,17 +180,22 @@ class ToolCallsParser {
|
|||||||
if (seg.isToolCall && seg.entry != null) {
|
if (seg.isToolCall && seg.entry != null) {
|
||||||
calls.add(seg.entry!);
|
calls.add(seg.entry!);
|
||||||
} else if (seg.text != null && seg.text!.isNotEmpty) {
|
} else if (seg.text != null && seg.text!.isNotEmpty) {
|
||||||
// Remove any embedded tool_calls blocks that may have slipped into text
|
final text = seg.text!;
|
||||||
final cleaned = seg.text!
|
// Quick check: only run cleanup regex if tool_calls details might exist
|
||||||
.replaceAll(
|
// (they should already be parsed as segments, but this is a safety net)
|
||||||
RegExp(
|
String cleaned = text;
|
||||||
r'<details\s+type=\"tool_calls\"[^>]*>[\s\S]*?<\/details>',
|
if (text.contains('<details') && text.contains('tool_calls')) {
|
||||||
multiLine: true,
|
// Remove any embedded tool_calls blocks that may have slipped into text
|
||||||
dotAll: true,
|
cleaned = text.replaceAll(
|
||||||
),
|
RegExp(
|
||||||
'',
|
r'<details\s+type=\"tool_calls\"[^>]*>[\s\S]*?<\/details>',
|
||||||
)
|
multiLine: true,
|
||||||
.trim();
|
dotAll: true,
|
||||||
|
),
|
||||||
|
'',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
cleaned = cleaned.trim();
|
||||||
if (cleaned.isNotEmpty) buf.write(cleaned);
|
if (cleaned.isNotEmpty) buf.write(cleaned);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -290,16 +295,21 @@ class ToolCallsParser {
|
|||||||
}
|
}
|
||||||
buf.write(out);
|
buf.write(out);
|
||||||
} else {
|
} else {
|
||||||
// Keep the raw text, but also remove any stray non-tool_calls details blocks
|
final text = seg.text ?? '';
|
||||||
final t = (seg.text ?? '').replaceAll(
|
// Quick check: only run cleanup regex if details tags exist
|
||||||
RegExp(
|
String cleaned = text;
|
||||||
r'<details(?!\s+type=\"tool_calls\")[^>]*>[\s\S]*?<\/details>',
|
if (text.contains('<details')) {
|
||||||
multiLine: true,
|
// Keep the raw text, but also remove any stray non-tool_calls details blocks
|
||||||
dotAll: true,
|
cleaned = text.replaceAll(
|
||||||
),
|
RegExp(
|
||||||
'',
|
r'<details(?!\s+type=\"tool_calls\")[^>]*>[\s\S]*?<\/details>',
|
||||||
);
|
multiLine: true,
|
||||||
if (t.isNotEmpty) buf.write(t);
|
dotAll: true,
|
||||||
|
),
|
||||||
|
'',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (cleaned.isNotEmpty) buf.write(cleaned);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -724,22 +724,25 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
|
|||||||
// and type="tool_calls") via a custom block syntax, so they won't be rendered as
|
// and type="tool_calls") via a custom block syntax, so they won't be rendered as
|
||||||
// plain text during streaming. This prevents character flashing.
|
// plain text during streaming. This prevents character flashing.
|
||||||
|
|
||||||
// We still clean raw reasoning tags (<think>, <reasoning>) as a fallback.
|
// Quick check: only run cleanup if raw tags might exist (rare case)
|
||||||
// The server normally converts these to <details> format, but raw mode or
|
String cleaned = content;
|
||||||
// direct API responses might still use them.
|
if (content.contains('<think>') || content.contains('<reasoning>')) {
|
||||||
String cleaned = content
|
// Clean raw reasoning tags as a fallback for raw mode or direct API responses.
|
||||||
.replaceAll(
|
// The server normally converts these to <details> format.
|
||||||
RegExp(r'<think>[\s\S]*?<\/think>', multiLine: true, dotAll: true),
|
cleaned = content
|
||||||
'',
|
.replaceAll(
|
||||||
)
|
RegExp(r'<think>[\s\S]*?<\/think>', multiLine: true, dotAll: true),
|
||||||
.replaceAll(
|
'',
|
||||||
RegExp(
|
)
|
||||||
r'<reasoning>[\s\S]*?<\/reasoning>',
|
.replaceAll(
|
||||||
multiLine: true,
|
RegExp(
|
||||||
dotAll: true,
|
r'<reasoning>[\s\S]*?<\/reasoning>',
|
||||||
),
|
multiLine: true,
|
||||||
'',
|
dotAll: true,
|
||||||
);
|
),
|
||||||
|
'',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Process images in the remaining text
|
// Process images in the remaining text
|
||||||
final processedContent = _processContentForImages(cleaned);
|
final processedContent = _processContentForImages(cleaned);
|
||||||
|
|||||||
@@ -4,6 +4,30 @@
|
|||||||
class ConduitMarkdownPreprocessor {
|
class ConduitMarkdownPreprocessor {
|
||||||
const ConduitMarkdownPreprocessor._();
|
const ConduitMarkdownPreprocessor._();
|
||||||
|
|
||||||
|
// Pre-compile regex patterns for better performance during streaming
|
||||||
|
static final _bulletFenceRegex = RegExp(
|
||||||
|
r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$',
|
||||||
|
multiLine: true,
|
||||||
|
);
|
||||||
|
static final _dedentOpenRegex = RegExp(
|
||||||
|
r'^[ \t]+```([^\n`]*)\s*$',
|
||||||
|
multiLine: true,
|
||||||
|
);
|
||||||
|
static final _dedentCloseRegex = RegExp(
|
||||||
|
r'^[ \t]+```\s*$',
|
||||||
|
multiLine: true,
|
||||||
|
);
|
||||||
|
static final _inlineClosingRegex = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
|
||||||
|
static final _labelThenDashRegex = RegExp(
|
||||||
|
r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)',
|
||||||
|
multiLine: true,
|
||||||
|
);
|
||||||
|
static final _atxEnumRegex = RegExp(
|
||||||
|
r'^(\s{0,3}#{1,6}\s+\d+)\.(\s*)(\S)',
|
||||||
|
multiLine: true,
|
||||||
|
);
|
||||||
|
static final _fenceAtBolRegex = RegExp(r'^\s*```', multiLine: true);
|
||||||
|
|
||||||
/// Normalises common fence and hard-break issues produced by LLMs.
|
/// Normalises common fence and hard-break issues produced by LLMs.
|
||||||
static String normalize(String input) {
|
static String normalize(String input) {
|
||||||
if (input.isEmpty) {
|
if (input.isEmpty) {
|
||||||
@@ -14,58 +38,42 @@ class ConduitMarkdownPreprocessor {
|
|||||||
|
|
||||||
// Move fenced code blocks that start on the same line as a list item onto
|
// Move fenced code blocks that start on the same line as a list item onto
|
||||||
// their own line so the parser does not treat them as list text.
|
// their own line so the parser does not treat them as list text.
|
||||||
final bulletFence = RegExp(
|
|
||||||
r'^(\s*(?:[*+-]|\d+\.)\s+)```([^\s`]*)\s*$',
|
|
||||||
multiLine: true,
|
|
||||||
);
|
|
||||||
output = output.replaceAllMapped(
|
output = output.replaceAllMapped(
|
||||||
bulletFence,
|
_bulletFenceRegex,
|
||||||
(match) => '${match[1]}\n```${match[2]}',
|
(match) => '${match[1]}\n```${match[2]}',
|
||||||
);
|
);
|
||||||
|
|
||||||
// Dedent opening fences to avoid partial code-block detection when the
|
// Dedent opening fences to avoid partial code-block detection when the
|
||||||
// model indents fences by accident.
|
// model indents fences by accident.
|
||||||
final dedentOpen = RegExp(r'^[ \t]+```([^\n`]*)\s*$', multiLine: true);
|
output = output.replaceAllMapped(_dedentOpenRegex, (match) => '```${match[1]}');
|
||||||
output = output.replaceAllMapped(dedentOpen, (match) => '```${match[1]}');
|
|
||||||
|
|
||||||
// Dedent closing fences for the same reason as the opening fences.
|
// Dedent closing fences for the same reason as the opening fences.
|
||||||
final dedentClose = RegExp(r'^[ \t]+```\s*$', multiLine: true);
|
output = output.replaceAllMapped(_dedentCloseRegex, (_) => '```');
|
||||||
output = output.replaceAllMapped(dedentClose, (_) => '```');
|
|
||||||
|
|
||||||
// Ensure closing fences stand alone. Prevents situations like `}\n```foo`
|
// Ensure closing fences stand alone. Prevents situations like `}\n```foo`
|
||||||
// from keeping trailing braces inside the code block.
|
// from keeping trailing braces inside the code block.
|
||||||
final inlineClosing = RegExp(r'([^\r\n`])```(?=\s*(?:\r?\n|$))');
|
|
||||||
output = output.replaceAllMapped(
|
output = output.replaceAllMapped(
|
||||||
inlineClosing,
|
_inlineClosingRegex,
|
||||||
(match) => '${match[1]}\n```',
|
(match) => '${match[1]}\n```',
|
||||||
);
|
);
|
||||||
|
|
||||||
// Insert a blank line when a "label: value" line is followed by a
|
// Insert a blank line when a "label: value" line is followed by a
|
||||||
// horizontal rule so it is not treated as a Setext heading underline.
|
// horizontal rule so it is not treated as a Setext heading underline.
|
||||||
final labelThenDash = RegExp(
|
|
||||||
r'^(\*\*[^\n*]+\*\*.*)\n(\s*-{3,}\s*$)',
|
|
||||||
multiLine: true,
|
|
||||||
);
|
|
||||||
output = output.replaceAllMapped(
|
output = output.replaceAllMapped(
|
||||||
labelThenDash,
|
_labelThenDashRegex,
|
||||||
(match) => '${match[1]}\n\n${match[2]}',
|
(match) => '${match[1]}\n\n${match[2]}',
|
||||||
);
|
);
|
||||||
|
|
||||||
// Allow headings like "## 1. Summary" without triggering ordered-list
|
// Allow headings like "## 1. Summary" without triggering ordered-list
|
||||||
// parsing by inserting a zero-width joiner after the numeric marker.
|
// parsing by inserting a zero-width joiner after the numeric marker.
|
||||||
final atxEnum = RegExp(
|
|
||||||
r'^(\s{0,3}#{1,6}\s+\d+)\.(\s*)(\S)',
|
|
||||||
multiLine: true,
|
|
||||||
);
|
|
||||||
output = output.replaceAllMapped(
|
output = output.replaceAllMapped(
|
||||||
atxEnum,
|
_atxEnumRegex,
|
||||||
(match) => '${match[1]}.\u200C${match[2]}${match[3]}',
|
(match) => '${match[1]}.\u200C${match[2]}${match[3]}',
|
||||||
);
|
);
|
||||||
|
|
||||||
// Auto-close an unmatched opening fence at EOF to avoid the entire tail
|
// Auto-close an unmatched opening fence at EOF to avoid the entire tail
|
||||||
// of the message rendering as code.
|
// of the message rendering as code.
|
||||||
final fenceAtBol = RegExp(r'^\s*```', multiLine: true);
|
final fenceCount = _fenceAtBolRegex.allMatches(output).length;
|
||||||
final fenceCount = fenceAtBol.allMatches(output).length;
|
|
||||||
if (fenceCount.isOdd) {
|
if (fenceCount.isOdd) {
|
||||||
if (!output.endsWith('\n')) {
|
if (!output.endsWith('\n')) {
|
||||||
output += '\n';
|
output += '\n';
|
||||||
|
|||||||
Reference in New Issue
Block a user