refactor: optimize regex patterns for image and markdown processing

- Introduced pre-compiled regex patterns across various components, including streaming_helper, markdown_stream_formatter, and assistant_message_widget, to enhance performance during image extraction and markdown formatting.
- Updated the AssistantMessageWidget to utilize these optimized patterns for TTS sanitization and image processing, reducing unnecessary regex evaluations.
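- Improved efficiency in handling markdown content by reusing pre-compiled patterns for bold, italic, and mermaid-block detection, and by replacing several per-call regexes (the tool-call `name="…"` lookup and the "already wrapped in markdown" image check) with plain substring checks; tool calls are now parsed only when a closing `</details>` tag is present.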
This commit is contained in:
cogwheel0
2025-10-06 00:09:52 +05:30
parent 3af46b379b
commit a2e5f46d62
4 changed files with 94 additions and 80 deletions

View File

@@ -20,6 +20,28 @@ import 'streaming_response_controller.dart';
// Keep local verbosity toggle for socket logs // Keep local verbosity toggle for socket logs
const bool kSocketVerboseLogging = false; const bool kSocketVerboseLogging = false;
// Pre-compiled regex patterns for image extraction (performance optimization).
// They are top-level finals, so each RegExp is constructed once and reused
// instead of being rebuilt every time streamed content is scanned.

// Matches an inline base64 image data URI of the form
// `data:image/<subtype>;base64,<payload>` with optional trailing `=` padding.
// The subtype may not contain `;` or whitespace.
final _base64ImagePattern = RegExp(
r'data:image/[^;\s]+;base64,[A-Za-z0-9+/]+=*',
);
// Matches an http(s) URL (no whitespace, `<`, `>` or `"`) up to and including
// a jpg/jpeg/png/gif/webp extension, case-insensitively. The pattern is not
// anchored, so anything after the extension (e.g. a query string) is not part
// of the match.
final _urlImagePattern = RegExp(
r'https?://[^\s<>\"]+\.(jpg|jpeg|png|gif|webp)',
caseSensitive: false,
);
// Matches a single `{...}` object (no nested `}`) that has a `"url"` key whose
// quoted value is either a `data:image/...` URI or an http(s) URL ending in an
// image extension. Group 1 is that value.
final _jsonImagePattern = RegExp(
r'\{[^}]*"url"[^}]*:[^}]*"(data:image/[^"]+|https?://[^"]+\.(jpg|jpeg|png|gif|webp))"[^}]*\}',
caseSensitive: false,
);
// Extracts the quoted value that follows a `"url"` key; group 1 is the value.
// Applied to the text already matched by `_jsonImagePattern`.
final _jsonUrlExtractPattern = RegExp(r'"url"[^:]*:[^"]*"([^"]+)"');
// Matches a `result="..."` or `files="..."` attribute whose value contains a
// `data:image/` URI or an http(s) image URL. Group 1 is the attribute name and
// group 2 is the complete attribute value.
final _partialResultsPattern = RegExp(
r'(result|files)="([^"]*(?:data:image/[^"]*|https?://[^"]*\.(jpg|jpeg|png|gif|webp))[^"]*)"',
caseSensitive: false,
);
// Matches when the input ends (`$`) with an http(s) URL that has an image
// extension. Only the end is anchored, so leading text is permitted.
final _imageFilePattern = RegExp(
r'https?://[^\s]+\.(jpg|jpeg|png|gif|webp)$',
caseSensitive: false,
);
class ActiveSocketStream { class ActiveSocketStream {
ActiveSocketStream({ ActiveSocketStream({
required this.controller, required this.controller,
@@ -194,7 +216,8 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
final collected = <Map<String, dynamic>>[]; final collected = <Map<String, dynamic>>[];
if (content.contains('<details')) { // Quick check: only parse tool calls if complete details blocks exist
if (content.contains('<details') && content.contains('</details>')) {
final parsed = ToolCallsParser.parse(content); final parsed = ToolCallsParser.parse(content);
if (parsed != null) { if (parsed != null) {
for (final entry in parsed.toolCalls) { for (final entry in parsed.toolCalls) {
@@ -209,10 +232,8 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
} }
if (collected.isEmpty) { if (collected.isEmpty) {
final base64Pattern = RegExp( // Use pre-compiled patterns for better performance
r'data:image/[^;\s]+;base64,[A-Za-z0-9+/]+=*', final base64Matches = _base64ImagePattern.allMatches(content);
);
final base64Matches = base64Pattern.allMatches(content);
for (final match in base64Matches) { for (final match in base64Matches) {
final url = match.group(0); final url = match.group(0);
if (url != null && url.isNotEmpty) { if (url != null && url.isNotEmpty) {
@@ -220,11 +241,7 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
} }
} }
final urlPattern = RegExp( final urlMatches = _urlImagePattern.allMatches(content);
r'https?://[^\s<>\"]+\.(jpg|jpeg|png|gif|webp)',
caseSensitive: false,
);
final urlMatches = urlPattern.allMatches(content);
for (final match in urlMatches) { for (final match in urlMatches) {
final url = match.group(0); final url = match.group(0);
if (url != null && url.isNotEmpty) { if (url != null && url.isNotEmpty) {
@@ -232,25 +249,17 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
} }
} }
final jsonPattern = RegExp( final jsonMatches = _jsonImagePattern.allMatches(content);
r'\{[^}]*"url"[^}]*:[^}]*"(data:image/[^"]+|https?://[^"]+\.(jpg|jpeg|png|gif|webp))"[^}]*\}',
caseSensitive: false,
);
final jsonMatches = jsonPattern.allMatches(content);
for (final match in jsonMatches) { for (final match in jsonMatches) {
final url = RegExp( final url = _jsonUrlExtractPattern
r'"url"[^:]*:[^"]*"([^"]+)"', .firstMatch(match.group(0) ?? '')
).firstMatch(match.group(0) ?? '')?.group(1); ?.group(1);
if (url != null && url.isNotEmpty) { if (url != null && url.isNotEmpty) {
collected.add({'type': 'image', 'url': url}); collected.add({'type': 'image', 'url': url});
} }
} }
final partialResultsPattern = RegExp( final partialMatches = _partialResultsPattern.allMatches(content);
r'(result|files)="([^"]*(?:data:image/[^"]*|https?://[^"]*\.(jpg|jpeg|png|gif|webp))[^"]*)"',
caseSensitive: false,
);
final partialMatches = partialResultsPattern.allMatches(content);
for (final match in partialMatches) { for (final match in partialMatches) {
final attrValue = match.group(2); final attrValue = match.group(2);
if (attrValue != null) { if (attrValue != null) {
@@ -259,10 +268,7 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
collected.addAll(_extractFilesFromResult(decoded)); collected.addAll(_extractFilesFromResult(decoded));
} catch (_) { } catch (_) {
if (attrValue.startsWith('data:image/') || if (attrValue.startsWith('data:image/') ||
RegExp( _imageFilePattern.hasMatch(attrValue)) {
r'https?://[^\s]+\.(jpg|jpeg|png|gif|webp)$',
caseSensitive: false,
).hasMatch(attrValue)) {
collected.add({'type': 'image', 'url': attrValue}); collected.add({'type': 'image', 'url': attrValue});
} }
} }
@@ -410,14 +416,9 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
: null; : null;
if (name is String && name.isNotEmpty) { if (name is String && name.isNotEmpty) {
final msgs = getMessages(); final msgs = getMessages();
final exists = // Quick string check before expensive regex
(msgs.isNotEmpty) && final exists = (msgs.isNotEmpty) &&
RegExp( msgs.last.content.contains('name="$name"');
r'<details\s+type=\"tool_calls\"[^>]*\bname=\"' +
RegExp.escape(name) +
r'\"',
multiLine: true,
).hasMatch(msgs.last.content);
if (!exists) { if (!exists) {
final status = final status =
'\n<details type="tool_calls" done="false" name="$name"><summary>Executing...</summary>\n</details>\n'; '\n<details type="tool_calls" done="false" name="$name"><summary>Executing...</summary>\n</details>\n';
@@ -517,14 +518,9 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
: null; : null;
if (name is String && name.isNotEmpty) { if (name is String && name.isNotEmpty) {
final msgs = getMessages(); final msgs = getMessages();
final exists = // Quick string check before expensive regex
(msgs.isNotEmpty) && final exists = (msgs.isNotEmpty) &&
RegExp( msgs.last.content.contains('name="$name"');
r'<details\s+type=\"tool_calls\"[^>]*\bname=\"' +
RegExp.escape(name) +
r'\"',
multiLine: true,
).hasMatch(msgs.last.content);
if (!exists) { if (!exists) {
final status = final status =
'\n<details type="tool_calls" done="false" name="$name"><summary>Executing...</summary>\n</details>\n'; '\n<details type="tool_calls" done="false" name="$name"><summary>Executing...</summary>\n</details>\n';
@@ -552,14 +548,9 @@ ActiveSocketStream attachUnifiedChunkedStreaming({
: null; : null;
if (name is String && name.isNotEmpty) { if (name is String && name.isNotEmpty) {
final msgs = getMessages(); final msgs = getMessages();
final exists = // Quick string check before expensive regex
(msgs.isNotEmpty) && final exists = (msgs.isNotEmpty) &&
RegExp( msgs.last.content.contains('name="$name"');
r'<details\s+type=\"tool_calls\"[^>]*\bname=\"' +
RegExp.escape(name) +
r'\"',
multiLine: true,
).hasMatch(msgs.last.content);
if (!exists) { if (!exists) {
final status = final status =
'\n<details type="tool_calls" done="false" name="$name"><summary>Executing...</summary>\n</details>\n'; '\n<details type="tool_calls" done="false" name="$name"><summary>Executing...</summary>\n</details>\n';

View File

@@ -1,3 +1,7 @@
// Pre-compiled regex patterns for markdown syntax detection (performance optimization).
// The formatter counts matches of each pattern; an odd count is treated as an
// unclosed marker and a synthetic closing token is appended to the preview.

// Matches every `**` bold delimiter.
final _boldPattern = RegExp(r'\*\*');
// Matches a lone `*` that is neither preceded nor followed by another `*`,
// so the asterisks of `**` are not counted as italic markers.
final _italicPattern = RegExp(r'(?<!\*)\*(?!\*)');
/// Maintains a raw markdown buffer for streaming content and generates /// Maintains a raw markdown buffer for streaming content and generates
/// preview-safe output by appending synthetic closing tokens when necessary. /// preview-safe output by appending synthetic closing tokens when necessary.
class MarkdownStreamFormatter { class MarkdownStreamFormatter {
@@ -40,12 +44,12 @@ class MarkdownStreamFormatter {
buffer.writeln('```'); buffer.writeln('```');
} }
final boldCount = RegExp(r'\*\*').allMatches(content).length; final boldCount = _boldPattern.allMatches(content).length;
if (boldCount.isOdd) { if (boldCount.isOdd) {
buffer.write('**'); buffer.write('**');
} }
final italicCount = RegExp(r'(?<!\*)\*(?!\*)').allMatches(content).length; final italicCount = _italicPattern.allMatches(content).length;
if (italicCount.isOdd) { if (italicCount.isOdd) {
buffer.write('*'); buffer.write('*');
} }

View File

@@ -23,6 +23,25 @@ import '../../../core/utils/debug_logger.dart';
import 'sources/openwebui_sources.dart'; import 'sources/openwebui_sources.dart';
import '../providers/assistant_response_builder_provider.dart'; import '../providers/assistant_response_builder_provider.dart';
// Pre-compiled regex patterns for TTS sanitization (performance optimization).
// They are top-level finals, so each RegExp is constructed once and reused
// across sanitizer calls.

// Triple-backtick code fence marker.
final _ttsCodeBlockPattern = RegExp(r'```');
// Any single backtick (inline code marker).
final _ttsInlineCodePattern = RegExp(r'`');
// Markdown image `![alt](target)`; group 1 is the alt text, group 2 the target.
// NOTE(review): the sanitizer passes r'$1' to String.replaceAll, which inserts
// the replacement string literally rather than expanding group references --
// confirm whether replaceAllMapped was intended for this and the link pattern.
final _ttsImagePattern = RegExp(r'!\[(.*?)\]\((.*?)\)');
// Markdown link `[text](target)`; group 1 is the link text, group 2 the target.
final _ttsLinkPattern = RegExp(r'\[(.*?)\]\((.*?)\)');
// Bold delimiters: `**` and `__`.
final _ttsBoldPattern1 = RegExp(r'\*\*');
final _ttsBoldPattern2 = RegExp(r'__');
// Italic delimiters: any single `*` and any single `_`.
final _ttsItalicPattern1 = RegExp(r'\*');
final _ttsItalicPattern2 = RegExp(r'_');
// Any single `~` (covers the tildes of `~~strikethrough~~`).
final _ttsStrikePattern = RegExp(r'~');
// A `-`, `*` or `+` bullet plus following whitespace at the start of any line.
final _ttsListPattern = RegExp(r'^[-*+]\s+', multiLine: true);
// A blockquote `>` and at most one following whitespace at the start of any line.
final _ttsQuotePattern = RegExp(r'^>\s?', multiLine: true);
// Runs of two or more spaces/tabs.
final _ttsMultiSpacePattern = RegExp(r'[ \t]{2,}');
// Runs of three or more newlines.
final _ttsMultiNewlinePattern = RegExp(r'\n{3,}');
// Pre-compiled regex patterns for image processing (performance optimization).

// Inline base64 image data URI: `data:image/<subtype>;base64,<payload>` with
// optional `=` padding. The subtype class only excludes `;`.
final _base64ImagePattern = RegExp(r'data:image/[^;]+;base64,[A-Za-z0-9+/]+=*');
// Captures (group 1) the path segment between `/api/v1/files/` and `/content`.
final _fileIdPattern = RegExp(r'/api/v1/files/([^/]+)/content');
class AssistantMessageWidget extends ConsumerStatefulWidget { class AssistantMessageWidget extends ConsumerStatefulWidget {
final dynamic message; final dynamic message;
final bool isStreaming; final bool isStreaming;
@@ -270,23 +289,24 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
} }
var text = input; var text = input;
text = text.replaceAll(RegExp(r'```'), ' '); // Use pre-compiled regex patterns for better performance
text = text.replaceAll(RegExp(r'`'), ''); text = text.replaceAll(_ttsCodeBlockPattern, ' ');
text = text.replaceAll(RegExp(r'!\[(.*?)\]\((.*?)\)'), r'$1'); text = text.replaceAll(_ttsInlineCodePattern, '');
text = text.replaceAll(RegExp(r'\[(.*?)\]\((.*?)\)'), r'$1'); text = text.replaceAll(_ttsImagePattern, r'$1');
text = text.replaceAll(RegExp(r'\*\*'), ''); text = text.replaceAll(_ttsLinkPattern, r'$1');
text = text.replaceAll(RegExp(r'__'), ''); text = text.replaceAll(_ttsBoldPattern1, '');
text = text.replaceAll(RegExp(r'\*'), ''); text = text.replaceAll(_ttsBoldPattern2, '');
text = text.replaceAll(RegExp(r'_'), ''); text = text.replaceAll(_ttsItalicPattern1, '');
text = text.replaceAll(RegExp(r'~'), ''); text = text.replaceAll(_ttsItalicPattern2, '');
text = text.replaceAll(RegExp(r'^[-*+]\s+', multiLine: true), ''); text = text.replaceAll(_ttsStrikePattern, '');
text = text.replaceAll(RegExp(r'^>\s?', multiLine: true), ''); text = text.replaceAll(_ttsListPattern, '');
text = text.replaceAll(_ttsQuotePattern, '');
text = text.replaceAll('&nbsp;', ' '); text = text.replaceAll('&nbsp;', ' ');
text = text.replaceAll('&amp;', '&'); text = text.replaceAll('&amp;', '&');
text = text.replaceAll('&lt;', '<'); text = text.replaceAll('&lt;', '<');
text = text.replaceAll('&gt;', '>'); text = text.replaceAll('&gt;', '>');
text = text.replaceAll(RegExp(r'[ \t]{2,}'), ' '); text = text.replaceAll(_ttsMultiSpacePattern, ' ');
text = text.replaceAll(RegExp(r'\n{3,}'), '\n\n'); text = text.replaceAll(_ttsMultiNewlinePattern, '\n\n');
return text.trim(); return text.trim();
} }
@@ -771,18 +791,17 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
// Check if content contains image markdown or base64 data URLs // Check if content contains image markdown or base64 data URLs
// This ensures images generated by AI are properly formatted // This ensures images generated by AI are properly formatted
// Pattern to detect base64 images that might not be in markdown format // Quick check: only process if we have base64 images and no markdown
final base64Pattern = RegExp(r'data:image/[^;]+;base64,[A-Za-z0-9+/]+=*'); if (!content.contains('data:image/') || content.contains('![')) {
return content;
}
// If we find base64 images not wrapped in markdown, wrap them // If we find base64 images not wrapped in markdown, wrap them
if (base64Pattern.hasMatch(content) && !content.contains('![')) { if (_base64ImagePattern.hasMatch(content)) {
content = content.replaceAllMapped(base64Pattern, (match) { content = content.replaceAllMapped(_base64ImagePattern, (match) {
final imageData = match.group(0)!; final imageData = match.group(0)!;
// Check if this image is already in markdown format // Check if this image is already in markdown format (simple string check)
final markdownCheck = RegExp( if (!content.contains('![$imageData)')) {
r'!\[.*?\]\(' + RegExp.escape(imageData) + r'\)',
);
if (!markdownCheck.hasMatch(content)) {
return '\n![Generated Image]($imageData)\n'; return '\n![Generated Image]($imageData)\n';
} }
return imageData; return imageData;
@@ -951,9 +970,7 @@ class _AssistantMessageWidgetState extends ConsumerState<AssistantMessageWidget>
String attachmentId = fileUrl; String attachmentId = fileUrl;
if (fileUrl.contains('/api/v1/files/') && if (fileUrl.contains('/api/v1/files/') &&
fileUrl.contains('/content')) { fileUrl.contains('/content')) {
final fileIdMatch = RegExp( final fileIdMatch = _fileIdPattern.firstMatch(fileUrl);
r'/api/v1/files/([^/]+)/content',
).firstMatch(fileUrl);
if (fileIdMatch != null) { if (fileIdMatch != null) {
attachmentId = fileIdMatch.group(1)!; attachmentId = fileIdMatch.group(1)!;
} }

View File

@@ -4,6 +4,9 @@ import '../../theme/theme_extensions.dart';
import 'markdown_config.dart'; import 'markdown_config.dart';
import 'markdown_preprocessor.dart'; import 'markdown_preprocessor.dart';
// Pre-compiled regex for mermaid diagram detection (performance optimization).
// Matches a fenced block opened with ```mermaid; group 1 lazily captures
// everything (including newlines) up to the next ``` fence.
// NOTE(review): the pattern has no `^`/`$` anchors, so `multiLine: true`
// should have no effect here -- confirm before relying on it.
final _mermaidRegex = RegExp(r'```mermaid\s*([\s\S]*?)```', multiLine: true);
class StreamingMarkdownWidget extends StatelessWidget { class StreamingMarkdownWidget extends StatelessWidget {
const StreamingMarkdownWidget({ const StreamingMarkdownWidget({
super.key, super.key,
@@ -23,8 +26,7 @@ class StreamingMarkdownWidget extends StatelessWidget {
} }
final normalized = ConduitMarkdownPreprocessor.normalize(content); final normalized = ConduitMarkdownPreprocessor.normalize(content);
final mermaidRegex = RegExp(r'```mermaid\s*([\s\S]*?)```', multiLine: true); final matches = _mermaidRegex.allMatches(normalized).toList();
final matches = mermaidRegex.allMatches(normalized).toList();
Widget buildMarkdown(String data) { Widget buildMarkdown(String data) {
return ConduitMarkdown.buildBlock( return ConduitMarkdown.buildBlock(