diff --git a/crates/ruma-events/CHANGELOG.md b/crates/ruma-events/CHANGELOG.md index 7d5f8412..f024b5b3 100644 --- a/crates/ruma-events/CHANGELOG.md +++ b/crates/ruma-events/CHANGELOG.md @@ -7,6 +7,8 @@ Bug fixes: - Fix serialization of `room::message::Relation` and `room::encrypted::Relation` which could cause duplicate `rel_type` keys. - `Restricted` no longer fails to deserialize when the `allow` field is missing +- Markdown text constructors now also detect markdown syntax like backslash + escapes and entity references to decide if the text should be sent as HTML. Improvements: diff --git a/crates/ruma-events/src/room/message.rs b/crates/ruma-events/src/room/message.rs index bdc1fd14..39f7475a 100644 --- a/crates/ruma-events/src/room/message.rs +++ b/crates/ruma-events/src/room/message.rs @@ -858,11 +858,12 @@ pub struct CustomEventContent { #[cfg(feature = "markdown")] pub(crate) fn parse_markdown(text: &str) -> Option { - use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd}; + use pulldown_cmark::{CowStr, Event, Options, Parser, Tag, TagEnd}; const OPTIONS: Options = Options::ENABLE_TABLES.union(Options::ENABLE_STRIKETHROUGH); let mut found_first_paragraph = false; + let mut previous_event_was_text = false; let parser_events: Vec<_> = Parser::new_ext(text, OPTIONS) .map(|event| match event { @@ -871,8 +872,29 @@ pub(crate) fn parse_markdown(text: &str) -> Option { }) .collect(); let has_markdown = parser_events.iter().any(|ref event| { - let is_text = matches!(event, Event::Text(_)); + // Numeric references should be replaced by their UTF-8 equivalent, so encountering a + // non-borrowed string means that there is markdown syntax. 
+ let is_borrowed_text = matches!(event, Event::Text(CowStr::Borrowed(_))); + + if is_borrowed_text { + if previous_event_was_text { + // The text was split, so a character was likely removed, like in the case of + // backslash escapes, or replaced by a static string, like for entity references, so + // there is markdown syntax. + return true; + } else { + previous_event_was_text = true; + } + } else { + previous_event_was_text = false; + } + + // A hard break happens when a newline is encountered, which is not necessarily markdown + // syntax. let is_break = matches!(event, Event::HardBreak); + + // The parser always wraps the string into a paragraph, so the first paragraph should be + // ignored, it is not due to markdown syntax. let is_first_paragraph_start = if matches!(event, Event::Start(Tag::Paragraph)) { if found_first_paragraph { false @@ -885,7 +907,7 @@ pub(crate) fn parse_markdown(text: &str) -> Option { }; let is_paragraph_end = matches!(event, Event::End(TagEnd::Paragraph)); - !is_text && !is_break && !is_first_paragraph_start && !is_paragraph_end + !is_borrowed_text && !is_break && !is_first_paragraph_start && !is_paragraph_end }); if !has_markdown { @@ -897,3 +919,41 @@ pub(crate) fn parse_markdown(text: &str) -> Option { Some(html_body) } + +#[cfg(all(test, feature = "markdown"))] +mod tests { + use assert_matches2::assert_matches; + + use super::parse_markdown; + + #[test] + fn detect_markdown() { + // Simple single-line text. + let text = "Hello world."; + assert_matches!(parse_markdown(text), None); + + // Simple double-line text. + let text = "Hello\nworld."; + assert_matches!(parse_markdown(text), None); + + // With new paragraph. + let text = "Hello\n\nworld."; + assert_matches!(parse_markdown(text), Some(_)); + + // With tagged element. + let text = "Hello **world**."; + assert_matches!(parse_markdown(text), Some(_)); + + // With backslash escapes. 
+ let text = r#"Hello \."#; + assert_matches!(parse_markdown(text), Some(_)); + + // With entity reference. + let text = r#"Hello &lt;world&gt;."#; + assert_matches!(parse_markdown(text), Some(_)); + + // With numeric reference. + let text = "Hello w&#8853;rld."; + assert_matches!(parse_markdown(text), Some(_)); + } +} diff --git a/crates/ruma-events/tests/it/room_message.rs b/crates/ruma-events/tests/it/room_message.rs index f8e942a8..acc2422b 100644 --- a/crates/ruma-events/tests/it/room_message.rs +++ b/crates/ruma-events/tests/it/room_message.rs @@ -201,9 +201,9 @@ fn markdown_detection() { let formatted_body = FormattedBody::markdown("A message\nwith\n\nmultiple\n\nparagraphs"); formatted_body.unwrap(); - // HTML entities don't trigger markdown. + // "Less than" symbol triggers markdown. let formatted_body = FormattedBody::markdown("A message with & HTML < entities"); - assert_matches!(formatted_body, None); + assert_matches!(formatted_body, Some(_)); // HTML triggers markdown. let formatted_body = FormattedBody::markdown("An HTML message");