refactor: Enhance markdown processing for text-to-speech conversion
- Introduced new regex patterns to remove thinking and reasoning blocks from markdown input.
- Added functionality to strip emojis from the text, improving clarity for TTS.
- Implemented HTML entity decoding to ensure proper text representation.
- Replaced the existing sanitization method with a more comprehensive markdown-to-text conversion approach, enhancing performance and maintainability.
This commit is contained in:
@@ -5,6 +5,25 @@
|
||||
class MarkdownToText {
|
||||
const MarkdownToText._();
|
||||
|
||||
static final _thinkingBlockRegex = RegExp(
|
||||
r'<details\s+type="reasoning"[^>]*>.*?</details>',
|
||||
multiLine: true,
|
||||
dotAll: true,
|
||||
);
|
||||
static final _thinkTagRegex = RegExp(
|
||||
r'<think>.*?</think>',
|
||||
multiLine: true,
|
||||
dotAll: true,
|
||||
);
|
||||
static final _reasoningTagRegex = RegExp(
|
||||
r'<reasoning>.*?</reasoning>',
|
||||
multiLine: true,
|
||||
dotAll: true,
|
||||
);
|
||||
static final _emojiRegex = RegExp(
|
||||
r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]',
|
||||
unicode: true,
|
||||
);
|
||||
static final _codeBlockRegex = RegExp(
|
||||
r'```[^\n]*\n(.*?)```',
|
||||
multiLine: true,
|
||||
@@ -29,18 +48,21 @@ class MarkdownToText {
|
||||
multiLine: true,
|
||||
);
|
||||
static final _htmlTagRegex = RegExp(r'<[^>]+>');
|
||||
static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;');
|
||||
static final _multipleNewlinesRegex = RegExp(r'\n{3,}');
|
||||
static final _multipleSpacesRegex = RegExp(r' {2,}');
|
||||
|
||||
/// Converts markdown text to plain text suitable for TTS.
|
||||
///
|
||||
/// - Removes thinking/reasoning blocks
|
||||
/// - Removes emojis
|
||||
/// - Removes code blocks (replaces with descriptive text)
|
||||
/// - Strips all formatting (bold, italic, strikethrough)
|
||||
/// - Converts links to just their text
|
||||
/// - Removes images (or converts to alt text)
|
||||
/// - Simplifies headings
|
||||
/// - Preserves list structure with natural pauses
|
||||
/// - Removes HTML tags
|
||||
/// - Removes HTML tags and entities
|
||||
/// - Normalizes whitespace
|
||||
static String convert(String markdown) {
|
||||
if (markdown.trim().isEmpty) {
|
||||
@@ -49,13 +71,20 @@ class MarkdownToText {
|
||||
|
||||
var text = markdown;
|
||||
|
||||
// Remove thinking/reasoning blocks (must be done before general HTML tag removal)
|
||||
text = text.replaceAll(_thinkingBlockRegex, '');
|
||||
text = text.replaceAll(_thinkTagRegex, '');
|
||||
text = text.replaceAll(_reasoningTagRegex, '');
|
||||
|
||||
// Remove emojis
|
||||
text = text.replaceAll(_emojiRegex, '');
|
||||
|
||||
// Remove or replace code blocks with descriptive text
|
||||
text = text.replaceAllMapped(_codeBlockRegex, (match) {
|
||||
final code = match[1]?.trim() ?? '';
|
||||
if (code.isEmpty) {
|
||||
return '';
|
||||
}
|
||||
// For TTS, skip code blocks or use a brief description
|
||||
return ' (code block) ';
|
||||
});
|
||||
|
||||
@@ -86,7 +115,6 @@ class MarkdownToText {
|
||||
// Simplify headings (remove # symbols)
|
||||
text = text.replaceAllMapped(_headingRegex, (match) {
|
||||
final heading = match[1] ?? '';
|
||||
// Add a pause after headings for natural speech flow
|
||||
return '$heading.\n';
|
||||
});
|
||||
|
||||
@@ -103,6 +131,20 @@ class MarkdownToText {
|
||||
// Remove HTML tags
|
||||
text = text.replaceAll(_htmlTagRegex, '');
|
||||
|
||||
// Decode HTML entities
|
||||
text = text.replaceAllMapped(_htmlEntityRegex, (match) {
|
||||
final entity = match[0] ?? '';
|
||||
return switch (entity) {
|
||||
'&nbsp;' => ' ',
|
||||
'&amp;' => '&',
|
||||
'&lt;' => '<',
|
||||
'&gt;' => '>',
|
||||
'&quot;' => '"',
|
||||
'&#39;' => "'",
|
||||
_ => entity,
|
||||
};
|
||||
});
|
||||
|
||||
// Normalize whitespace
|
||||
text = text.replaceAll(_multipleNewlinesRegex, '\n\n');
|
||||
text = text.replaceAll(_multipleSpacesRegex, ' ');
|
||||
|
||||
Reference in New Issue
Block a user