2025-10-09 00:20:36 +05:30
|
|
|
/// Converts markdown text to plain text suitable for text-to-speech.
///
/// Strips formatting while preserving the semantic meaning and readability
/// of the content for audio consumption.
///
/// The class is a static-only namespace: the private constructor prevents
/// instantiation and all work happens in [convert].
class MarkdownToText {
  const MarkdownToText._();

  // --- Model "thinking" wrappers -------------------------------------------
  // These match the whole element (open tag, body, close tag) so the body is
  // dropped too. `dotAll` lets `.` cross line breaks; `.*?` keeps each match
  // confined to the nearest closing tag.
  static final _thinkingBlockRegex = RegExp(
    r'<details\s+type="reasoning"[^>]*>.*?</details>',
    multiLine: true,
    dotAll: true,
  );
  static final _thinkTagRegex = RegExp(
    r'<think>.*?</think>',
    multiLine: true,
    dotAll: true,
  );
  static final _reasoningTagRegex = RegExp(
    r'<reasoning>.*?</reasoning>',
    multiLine: true,
    dotAll: true,
  );

  // Code-point ranges treated as emoji / pictographs / combining marks.
  // `unicode: true` is required for the `\u{...}` escapes.
  static final _emojiRegex = RegExp(
    r'[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|[\u{1F900}-\u{1F9FF}]|[\u{1FA00}-\u{1FA6F}]|[\u{1FA70}-\u{1FAFF}]|[\u{FE00}-\u{FE0F}]|[\u{1F018}-\u{1F270}]|[\u{238C}-\u{2454}]|[\u{20D0}-\u{20FF}]',
    unicode: true,
  );

  // --- Markdown constructs --------------------------------------------------
  // Fenced code block: opening fence with optional info string, body in
  // group 1, closing fence.
  static final _codeBlockRegex = RegExp(
    r'```[^\n]*\n(.*?)```',
    multiLine: true,
    dotAll: true,
  );
  static final _inlineCodeRegex = RegExp(r'`([^`]+)`');
  static final _boldItalicRegex = RegExp(r'\*\*\*([^*]+)\*\*\*');
  static final _boldRegex = RegExp(r'\*\*([^*]+)\*\*');
  // Group 1 is the `*text*` form, group 2 the `_text_` form.
  static final _italicRegex = RegExp(r'\*([^*]+)\*|_([^_]+)_');
  static final _strikethroughRegex = RegExp(r'~~([^~]+)~~');
  // Group 1 is the link text; the destination is discarded.
  static final _linkRegex = RegExp(r'\[([^\]]+)\]\([^)]+\)');
  // Group 1 is the (possibly empty) alt text.
  static final _imageRegex = RegExp(r'!\[([^\]]*)\]\([^)]+\)');
  static final _headingRegex = RegExp(r'^#{1,6}\s+(.+)$', multiLine: true);
  static final _listItemRegex = RegExp(r'^[\s]*[-*+]\s+(.+)$', multiLine: true);
  static final _orderedListRegex = RegExp(
    r'^[\s]*\d+\.\s+(.+)$',
    multiLine: true,
  );
  static final _blockquoteRegex = RegExp(r'^>\s*(.+)$', multiLine: true);
  static final _horizontalRuleRegex = RegExp(
    r'^[\s]*[-*_]{3,}[\s]*$',
    multiLine: true,
  );

  // --- HTML -----------------------------------------------------------------
  static final _htmlTagRegex = RegExp(r'<[^>]+>');
  // Named entities, decimal numeric entities and lowercase-hex numeric
  // entities. Only a handful are actually decoded in [convert]; the rest are
  // matched but passed through unchanged.
  static final _htmlEntityRegex = RegExp(r'&[a-z]+;|&#\d+;|&#x[0-9a-f]+;');

  // --- Whitespace -----------------------------------------------------------
  static final _multipleNewlinesRegex = RegExp(r'\n{3,}');
  static final _multipleSpacesRegex = RegExp(r' {2,}');

  /// Converts markdown text to plain text suitable for TTS.
  ///
  /// Returns an empty string when [markdown] is empty or whitespace-only.
  /// Otherwise the following passes run in this order:
  ///
  /// - Removes thinking/reasoning blocks
  /// - Removes emojis
  /// - Removes code blocks (replaces with descriptive text)
  /// - Strips all formatting (bold, italic, strikethrough)
  /// - Removes images (or converts to alt text)
  /// - Converts links to just their text
  /// - Simplifies headings
  /// - Preserves list structure with natural pauses
  /// - Removes HTML tags and entities
  /// - Normalizes whitespace
  ///
  /// The result is a single line: every newline becomes a space, runs of
  /// spaces are collapsed to one, and the ends are trimmed.
  static String convert(String markdown) {
    if (markdown.trim().isEmpty) {
      return '';
    }

    var text = markdown;

    // Remove thinking/reasoning blocks (must be done before general HTML tag
    // removal, otherwise only the tags would go and the reasoning text would
    // be read aloud).
    text = text.replaceAll(_thinkingBlockRegex, '');
    text = text.replaceAll(_thinkTagRegex, '');
    text = text.replaceAll(_reasoningTagRegex, '');

    // Remove emojis
    text = text.replaceAll(_emojiRegex, '');

    // Replace non-empty code blocks with a short spoken placeholder; drop
    // empty ones entirely.
    text = text.replaceAllMapped(_codeBlockRegex, (match) {
      final code = match[1]?.trim() ?? '';
      if (code.isEmpty) {
        return '';
      }
      return ' (code block) ';
    });

    // Remove inline code backticks but keep the content
    text = text.replaceAllMapped(_inlineCodeRegex, (match) => match[1] ?? '');

    // Strip bold/italic/strikethrough formatting. Longest marker first so
    // `***x***` is not half-consumed by the bold or italic pattern.
    text = text.replaceAllMapped(_boldItalicRegex, (match) => match[1] ?? '');
    text = text.replaceAllMapped(_boldRegex, (match) => match[1] ?? '');
    text = text.replaceAllMapped(
      _italicRegex,
      (match) => match[1] ?? match[2] ?? '',
    );
    text = text.replaceAllMapped(
      _strikethroughRegex,
      (match) => match[1] ?? '',
    );

    // Images must be handled BEFORE links: an image is a link prefixed with
    // `!`, so the link pattern would otherwise consume `[alt](url)` and leave
    // a stray `!alt` behind, and the image pattern would never match.
    text = text.replaceAllMapped(_imageRegex, (match) {
      final alt = match[1]?.trim() ?? '';
      return alt.isNotEmpty ? ' ($alt image) ' : '';
    });

    // Convert links to just their text
    text = text.replaceAllMapped(_linkRegex, (match) => match[1] ?? '');

    // Simplify headings: drop the # markers and end with a period so the
    // heading is spoken as its own sentence.
    text = text.replaceAllMapped(_headingRegex, (match) {
      final heading = match[1] ?? '';
      return '$heading.\n';
    });

    // Preserve list items with natural pauses
    text = text.replaceAllMapped(_listItemRegex, (match) => '${match[1]}. ');
    text = text.replaceAllMapped(_orderedListRegex, (match) => '${match[1]}. ');

    // Remove blockquote markers
    text = text.replaceAllMapped(_blockquoteRegex, (match) => match[1] ?? '');

    // Remove horizontal rules
    text = text.replaceAll(_horizontalRuleRegex, '');

    // Remove HTML tags. This runs before entity decoding on purpose, so that
    // an escaped angle bracket in the input is not mistaken for a tag.
    text = text.replaceAll(_htmlTagRegex, '');

    // Decode the common HTML entities; anything unrecognised is left as-is.
    text = text.replaceAllMapped(_htmlEntityRegex, (match) {
      final entity = match[0] ?? '';
      return switch (entity) {
        '&nbsp;' => ' ',
        '&amp;' => '&',
        '&lt;' => '<',
        '&gt;' => '>',
        '&quot;' => '"',
        '&#39;' => "'",
        _ => entity,
      };
    });

    // Normalize whitespace. Newlines are folded into spaces BEFORE runs of
    // spaces are collapsed; doing it the other way round leaves double spaces
    // wherever a newline sat next to an existing space.
    text = text.replaceAll(_multipleNewlinesRegex, '\n\n');
    text = text.replaceAll('\n', ' ');
    text = text.replaceAll(_multipleSpacesRegex, ' ');

    // Final cleanup
    return text.trim();
  }
}
|