refactor(markdown): remove deprecated stream formatter and enhance preprocessor
This commit is contained in:
@@ -1,18 +0,0 @@
|
||||
/// HTML entity utilities for parsing content.
|
||||
///
|
||||
/// Reference: openwebui-src/src/lib/utils/index.ts (unescapeHtml)
|
||||
library;
|
||||
|
||||
import 'package:html_unescape/html_unescape.dart';
|
||||
|
||||
/// Namespace for HTML character-reference helpers.
class HtmlUtils {
  /// Decoder instance shared by every call to [unescapeHtml].
  static final _unescape = HtmlUnescape();

  /// Returns [s] with its HTML entities decoded.
  ///
  /// Named, decimal, and hexadecimal character references are all handled
  /// by the underlying `HtmlUnescape` converter.
  static String unescapeHtml(String s) {
    final decoded = _unescape.convert(s);
    return decoded;
  }
}
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
// Pre-compiled patterns for markdown delimiter detection, built once per
// library rather than on every preview.

/// Matches a `**` bold delimiter.
final _boldPattern = RegExp(r'\*\*');

/// Matches a single `*` that is not adjacent to another `*`.
final _italicPattern = RegExp(r'(?<!\*)\*(?!\*)');

/// Maintains a raw markdown buffer for streaming content and generates
/// preview-safe output by appending synthetic closing tokens when necessary.
///
/// The raw buffer is never modified by the synthetic tokens; they only
/// appear in the strings returned by [ingest], [replace] and [preview].
class MarkdownStreamFormatter {
  /// Delimiter that opens and closes a fenced code block.
  static const _fence = '```';

  final StringBuffer _raw = StringBuffer();

  /// Seeds the formatter with existing markdown [content], discarding
  /// anything accumulated so far.
  void seed(String content) {
    _raw
      ..clear()
      ..write(content);
  }

  /// Appends a streaming [chunk] to the internal buffer and returns a
  /// preview-ready string with any required synthetic closing markers.
  String ingest(String chunk) {
    if (chunk.isNotEmpty) {
      _raw.write(chunk);
    }
    return preview();
  }

  /// Replaces the current buffer with the provided [content] and returns
  /// its preview.
  String replace(String content) {
    seed(content);
    return preview();
  }

  /// Returns the preview-safe markdown string: the raw buffer followed by
  /// whatever closing tokens are needed to balance it.
  String preview() {
    final raw = _raw.toString();
    return raw + _syntheticClosures(raw);
  }

  /// Returns the raw markdown accumulated so far, without synthetic tokens.
  String finalize() => _raw.toString();

  /// Computes the closing tokens to append to [content].
  ///
  /// Emitted in this order: closing code fence, `**`, `*`, missing `]`,
  /// missing `)`.
  String _syntheticClosures(String content) {
    // Splitting on the fence yields alternating prose / code segments:
    // even indices are outside fenced code, odd indices are inside it.
    final segments = content.split(_fence);

    // k fences produce k + 1 segments, so an even segment count means an
    // odd number of fences, i.e. the last fence is still open.
    final insideFence = segments.length.isEven;

    // Only prose may contribute inline markers. Characters such as `*`,
    // `[` or `(` inside code are literal and must not trigger closers.
    // Joining with a newline keeps markers from neighbouring prose
    // segments from fusing into a different delimiter.
    final prose = [
      for (var i = 0; i < segments.length; i += 2) segments[i],
    ].join('\n');

    final buffer = StringBuffer();

    if (insideFence) {
      // A closing fence is only recognised at the start of a line, so make
      // sure it does not end up glued to the last line of code.
      if (!content.endsWith('\n')) {
        buffer.writeln();
      }
      buffer.writeln(_fence);
    }

    if (_boldPattern.allMatches(prose).length.isOdd) {
      buffer.write('**');
    }

    if (_italicPattern.allMatches(prose).length.isOdd) {
      buffer.write('*');
    }

    final missingBrackets =
        '['.allMatches(prose).length - ']'.allMatches(prose).length;
    if (missingBrackets > 0) {
      buffer.write(']' * missingBrackets);
    }

    final missingParens =
        '('.allMatches(prose).length - ')'.allMatches(prose).length;
    if (missingParens > 0) {
      buffer.write(')' * missingParens);
    }

    return buffer.toString();
  }
}
|
||||
@@ -1,160 +0,0 @@
|
||||
/// Converts markdown text to plain text suitable for text-to-speech.
///
/// Strips formatting while preserving the semantic meaning and readability
/// of the content for audio consumption.
class MarkdownToText {
  const MarkdownToText._();

  static final _thinkingBlockRegex = RegExp(
    r'<details\s+type="reasoning"[^>]*>.*?</details>',
    multiLine: true,
    dotAll: true,
  );
  static final _thinkTagRegex = RegExp(
    r'<think>.*?</think>',
    multiLine: true,
    dotAll: true,
  );
  static final _reasoningTagRegex = RegExp(
    r'<reasoning>.*?</reasoning>',
    multiLine: true,
    dotAll: true,
  );
  static final _emojiRegex = RegExp(
    r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]',
    unicode: true,
  );
  static final _codeBlockRegex = RegExp(
    r'```[^\n]*\n(.*?)```',
    multiLine: true,
    dotAll: true,
  );
  static final _inlineCodeRegex = RegExp(r'`([^`]+)`');
  static final _boldItalicRegex = RegExp(r'\*\*\*([^*]+)\*\*\*');
  static final _boldRegex = RegExp(r'\*\*([^*]+)\*\*');
  static final _italicRegex = RegExp(r'\*([^*]+)\*|_([^_]+)_');
  static final _strikethroughRegex = RegExp(r'~~([^~]+)~~');
  static final _linkRegex = RegExp(r'\[([^\]]+)\]\([^)]+\)');
  static final _imageRegex = RegExp(r'!\[([^\]]*)\]\([^)]+\)');
  static final _headingRegex = RegExp(r'^#{1,6}\s+(.+)$', multiLine: true);
  static final _listItemRegex = RegExp(r'^[\s]*[-*+]\s+(.+)$', multiLine: true);
  static final _orderedListRegex = RegExp(
    r'^[\s]*\d+\.\s+(.+)$',
    multiLine: true,
  );
  static final _blockquoteRegex = RegExp(r'^>\s*(.+)$', multiLine: true);
  static final _horizontalRuleRegex = RegExp(
    r'^[\s]*[-*_]{3,}[\s]*$',
    multiLine: true,
  );
  static final _htmlTagRegex = RegExp(r'<[^>]+>');
  static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;');
  static final _multipleSpacesRegex = RegExp(r' {2,}');

  /// Converts [markdown] to plain text suitable for TTS.
  ///
  /// - Removes thinking/reasoning blocks
  /// - Removes emojis
  /// - Replaces non-empty fenced code blocks with "(code block)"
  /// - Turns list items into sentences ending in ". "
  /// - Strips all formatting (bold, italic, strikethrough)
  /// - Replaces images with "(alt image)", or drops them when alt is empty
  /// - Converts links to just their text
  /// - Simplifies headings
  /// - Removes blockquote markers, horizontal rules and HTML tags
  /// - Decodes common named and numeric HTML entities
  /// - Flattens the result onto a single, single-spaced line
  ///
  /// Returns an empty string when [markdown] is empty or whitespace only.
  static String convert(String markdown) {
    if (markdown.trim().isEmpty) {
      return '';
    }

    var text = markdown;

    // Remove thinking/reasoning blocks (must be done before general HTML
    // tag removal, which would otherwise keep their inner text).
    text = text.replaceAll(_thinkingBlockRegex, '');
    text = text.replaceAll(_thinkTagRegex, '');
    text = text.replaceAll(_reasoningTagRegex, '');

    // Remove emojis.
    text = text.replaceAll(_emojiRegex, '');

    // Replace non-empty code blocks with descriptive text; drop empty ones.
    text = text.replaceAllMapped(_codeBlockRegex, (match) {
      final code = match[1]?.trim() ?? '';
      return code.isEmpty ? '' : ' (code block) ';
    });

    // Remove inline code backticks but keep the content.
    text = text.replaceAllMapped(_inlineCodeRegex, (match) => match[1] ?? '');

    // List markers are handled before emphasis: a `*` bullet would
    // otherwise be paired with the next line's bullet by the italic
    // pattern and both items would lose their marker and their pause.
    text = text.replaceAllMapped(_listItemRegex, (match) => '${match[1]}. ');
    text = text.replaceAllMapped(_orderedListRegex, (match) => '${match[1]}. ');

    // Strip bold/italic/strikethrough formatting, widest delimiter first.
    text = text.replaceAllMapped(_boldItalicRegex, (match) => match[1] ?? '');
    text = text.replaceAllMapped(_boldRegex, (match) => match[1] ?? '');
    text = text.replaceAllMapped(
      _italicRegex,
      (match) => match[1] ?? match[2] ?? '',
    );
    text = text.replaceAllMapped(
      _strikethroughRegex,
      (match) => match[1] ?? '',
    );

    // Images must be handled before links: the link pattern also matches
    // the `[alt](url)` tail of an image and would leave a stray `!alt`.
    text = text.replaceAllMapped(_imageRegex, (match) {
      final alt = match[1]?.trim() ?? '';
      return alt.isNotEmpty ? ' ($alt image) ' : '';
    });

    // Convert links to just their text.
    text = text.replaceAllMapped(_linkRegex, (match) => match[1] ?? '');

    // Simplify headings (remove # symbols, end with a full stop).
    text = text.replaceAllMapped(_headingRegex, (match) {
      final heading = match[1] ?? '';
      return '$heading.\n';
    });

    // Remove blockquote markers.
    text = text.replaceAllMapped(_blockquoteRegex, (match) => match[1] ?? '');

    // Remove horizontal rules.
    text = text.replaceAll(_horizontalRuleRegex, '');

    // Remove HTML tags.
    text = text.replaceAll(_htmlTagRegex, '');

    // Decode HTML entities (after tag removal so decoded `<` survives).
    text = text.replaceAllMapped(
      _htmlEntityRegex,
      (match) => _decodeEntity(match[0] ?? ''),
    );

    // Flatten to one line first, then collapse runs of spaces, so that a
    // newline next to a space cannot leave a double space behind.
    text = text.replaceAll('\n', ' ');
    text = text.replaceAll(_multipleSpacesRegex, ' ');

    return text.trim();
  }

  /// Decodes a single HTML [entity] such as `&amp;` or `&#8217;`.
  ///
  /// Unknown named entities and numeric references that are not valid
  /// Unicode scalar values are returned unchanged.
  static String _decodeEntity(String entity) {
    final named = switch (entity) {
      '&nbsp;' => ' ',
      '&amp;' => '&',
      '&lt;' => '<',
      '&gt;' => '>',
      '&quot;' => '"',
      '&apos;' => "'",
      _ => null,
    };
    if (named != null) return named;

    if (entity.startsWith('&#')) {
      final isHex = entity.startsWith('&#x');
      final digits = entity.substring(isHex ? 3 : 2, entity.length - 1);
      final code = int.tryParse(digits, radix: isHex ? 16 : 10);
      if (code != null && _isUnicodeScalar(code)) {
        return String.fromCharCode(code);
      }
    }

    return entity;
  }

  /// Whether [code] is a non-zero Unicode scalar value (not a surrogate).
  static bool _isUnicodeScalar(int code) {
    if (code <= 0 || code > 0x10FFFF) return false;
    return code < 0xD800 || code > 0xDFFF;
  }
}
|
||||
@@ -7,7 +7,12 @@
|
||||
/// Reference: openwebui-src/backend/open_webui/utils/middleware.py DEFAULT_REASONING_TAGS
|
||||
library;
|
||||
|
||||
import 'html_utils.dart';
|
||||
import 'package:html_unescape/html_unescape.dart';
|
||||
|
||||
/// Shared HTML entity decoder for this library.
final _htmlUnescape = HtmlUnescape();

/// Returns [s] with its HTML entities decoded.
///
/// Applied to reasoning text and summaries before they are stored.
String _unescapeHtml(String s) {
  return _htmlUnescape.convert(s);
}
|
||||
|
||||
/// All reasoning tag pairs supported by Open WebUI.
|
||||
/// Reference: DEFAULT_REASONING_TAGS in middleware.py
|
||||
@@ -181,9 +186,25 @@ class ReasoningParser {
|
||||
}
|
||||
|
||||
// Check for raw tag pairs
|
||||
// Supports tags with optional attributes like <think foo="bar">
|
||||
// Reference: openwebui-src/backend/open_webui/utils/middleware.py
|
||||
for (final pair in tagPairs) {
|
||||
final startTag = pair.$1;
|
||||
final idx = content.indexOf(startTag, index);
|
||||
int idx = -1;
|
||||
|
||||
// For XML-like tags (e.g., <think>), match with optional attributes
|
||||
if (startTag.startsWith('<') && startTag.endsWith('>')) {
|
||||
final tagName = startTag.substring(1, startTag.length - 1);
|
||||
final pattern = RegExp('<${RegExp.escape(tagName)}(\\s[^>]*)?>');
|
||||
final match = pattern.firstMatch(content.substring(index));
|
||||
if (match != null) {
|
||||
idx = index + match.start;
|
||||
}
|
||||
} else {
|
||||
// For non-XML tags (e.g., ◁think▷), use exact matching
|
||||
idx = content.indexOf(startTag, index);
|
||||
}
|
||||
|
||||
if (idx != -1 && (nextRawIdx == -1 || idx < nextRawIdx)) {
|
||||
nextRawIdx = idx;
|
||||
matchedRawPair = pair;
|
||||
@@ -336,8 +357,8 @@ class ReasoningParser {
|
||||
|
||||
return _DetailsResult(
|
||||
entry: ReasoningEntry(
|
||||
reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining),
|
||||
summary: HtmlUtils.unescapeHtml(summaryResult.summary),
|
||||
reasoning: _unescapeHtml(summaryResult.remaining),
|
||||
summary: _unescapeHtml(summaryResult.summary),
|
||||
duration: effectiveDuration,
|
||||
isDone: false,
|
||||
blockType: blockType,
|
||||
@@ -368,8 +389,8 @@ class ReasoningParser {
|
||||
|
||||
return _DetailsResult(
|
||||
entry: ReasoningEntry(
|
||||
reasoning: HtmlUtils.unescapeHtml(summaryResult.remaining),
|
||||
summary: HtmlUtils.unescapeHtml(summaryResult.summary),
|
||||
reasoning: _unescapeHtml(summaryResult.remaining),
|
||||
summary: _unescapeHtml(summaryResult.summary),
|
||||
duration: effectiveDuration,
|
||||
isDone: isDone,
|
||||
blockType: blockType,
|
||||
@@ -381,20 +402,47 @@ class ReasoningParser {
|
||||
}
|
||||
|
||||
/// Parse a raw reasoning tag pair (e.g., `<think>...</think>`).
|
||||
/// Supports tags with optional attributes like `<think foo="bar">`.
|
||||
///
|
||||
/// Reference: openwebui-src/backend/open_webui/utils/middleware.py
|
||||
static _ReasoningResult _parseRawReasoning(
|
||||
String content,
|
||||
int startIdx,
|
||||
String startTag,
|
||||
String endTag,
|
||||
) {
|
||||
final endIdx = content.indexOf(endTag, startIdx + startTag.length);
|
||||
// Find the actual end of the opening tag (handles attributes)
|
||||
int contentStartIdx;
|
||||
if (startTag.startsWith('<') && startTag.endsWith('>')) {
|
||||
// For XML-like tags, find the closing '>' to skip any attributes
|
||||
final tagCloseIdx = content.indexOf('>', startIdx);
|
||||
if (tagCloseIdx == -1) {
|
||||
// Incomplete opening tag
|
||||
return _ReasoningResult(
|
||||
entry: ReasoningEntry(
|
||||
reasoning: '',
|
||||
summary: '',
|
||||
duration: 0,
|
||||
isDone: false,
|
||||
),
|
||||
endIndex: content.length,
|
||||
isComplete: false,
|
||||
);
|
||||
}
|
||||
contentStartIdx = tagCloseIdx + 1;
|
||||
} else {
|
||||
// For non-XML tags, use exact tag length
|
||||
contentStartIdx = startIdx + startTag.length;
|
||||
}
|
||||
|
||||
final endIdx = content.indexOf(endTag, contentStartIdx);
|
||||
|
||||
if (endIdx == -1) {
|
||||
// Incomplete block (streaming)
|
||||
final innerContent = content.substring(startIdx + startTag.length);
|
||||
final innerContent = content.substring(contentStartIdx);
|
||||
return _ReasoningResult(
|
||||
entry: ReasoningEntry(
|
||||
reasoning: HtmlUtils.unescapeHtml(innerContent.trim()),
|
||||
reasoning: _unescapeHtml(innerContent.trim()),
|
||||
summary: '',
|
||||
duration: 0,
|
||||
isDone: false,
|
||||
@@ -405,10 +453,10 @@ class ReasoningParser {
|
||||
}
|
||||
|
||||
// Complete block
|
||||
final innerContent = content.substring(startIdx + startTag.length, endIdx);
|
||||
final innerContent = content.substring(contentStartIdx, endIdx);
|
||||
return _ReasoningResult(
|
||||
entry: ReasoningEntry(
|
||||
reasoning: HtmlUtils.unescapeHtml(innerContent.trim()),
|
||||
reasoning: _unescapeHtml(innerContent.trim()),
|
||||
summary: '',
|
||||
duration: 0,
|
||||
isDone: true,
|
||||
@@ -533,23 +581,33 @@ class ReasoningParser {
|
||||
}
|
||||
|
||||
/// Formats a duration given in whole [seconds] as a short human-readable
/// phrase.
///
/// Intended to mirror Open WebUI's
/// `dayjs.duration(seconds, 'seconds').humanize()`:
/// - below 1: "less than a second"
/// - below 60: "X second(s)"
/// - below 90: "a minute"
/// - below 45 minutes: "X minutes" (rounded)
/// - below 90 minutes: "about an hour"
/// - below 22 hours: "X hours" (rounded)
/// - below 36 hours: "a day"
/// - otherwise: "X days" (rounded)
///
/// Reference: openwebui-src/src/lib/components/common/Collapsible.svelte
static String formatDuration(int seconds) {
  if (seconds < 1) return 'less than a second';
  if (seconds < 60) return '$seconds second${seconds == 1 ? '' : 's'}';

  // Humanize thresholds.
  // Reference: https://day.js.org/docs/en/durations/humanize
  if (seconds < 90) return 'a minute';

  if (seconds < 2700) {
    // Up to 45 minutes. The smallest input here is 90 s, which rounds to
    // 2, so the plural form is always correct.
    final minutes = (seconds / 60).round();
    return '$minutes minutes';
  }

  if (seconds < 5400) return 'about an hour'; // up to 90 minutes

  if (seconds < 79200) {
    // Up to 22 hours; the smallest input rounds to 2 hours.
    final hours = (seconds / 3600).round();
    return '$hours hours';
  }

  if (seconds < 129600) return 'a day'; // up to 36 hours

  // The smallest input here is 36 hours, which rounds to 2 days.
  final days = (seconds / 86400).round();
  return '$days days';
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import 'dart:convert';
|
||||
|
||||
import '../../shared/widgets/markdown/markdown_preprocessor.dart';
|
||||
|
||||
/// Parsed representation of one tool call emitted as a `<details type="tool_calls" ...>` block
|
||||
class ToolCallEntry {
|
||||
final String id;
|
||||
@@ -255,18 +257,8 @@ class ToolCallsParser {
|
||||
static String sanitizeForApi(String content) {
|
||||
if (content.isEmpty) return content;
|
||||
|
||||
// Remove blocks we never want to include in conversation context
|
||||
final removeTypes = ['reasoning', 'code_interpreter'];
|
||||
for (final t in removeTypes) {
|
||||
content = content.replaceAll(
|
||||
RegExp(
|
||||
'<details\\s+type="$t"[^>]*>[\\s\\S]*?</details>',
|
||||
multiLine: true,
|
||||
dotAll: true,
|
||||
),
|
||||
'',
|
||||
);
|
||||
}
|
||||
// Remove annotations and reasoning blocks
|
||||
content = ConduitMarkdownPreprocessor.sanitize(content);
|
||||
|
||||
if (!content.contains('<details')) return content.trim();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user