diff --git a/crates/ruma-events/CHANGELOG.md b/crates/ruma-events/CHANGELOG.md index 3fc1c0b6..58054070 100644 --- a/crates/ruma-events/CHANGELOG.md +++ b/crates/ruma-events/CHANGELOG.md @@ -2,6 +2,18 @@ Bug fixes: +- Markdown text constructors now detect even more markdown syntax like removed + whitespace at the end or beginning of a line to decide if the text should be + sent as HTML. + +Improvements: + +- Add unstable support for MSC4059 for bundling link previews in chat text messages + +# 0.29.0 + +Bug fixes: + - Fix missing `relates_to` field on `StickerEventContent` - Fix deserialization of `AnyGlobalAccountDataEvent` for variants with a type fragment. diff --git a/crates/ruma-events/src/room/message.rs b/crates/ruma-events/src/room/message.rs index 84a98414..776d4062 100644 --- a/crates/ruma-events/src/room/message.rs +++ b/crates/ruma-events/src/room/message.rs @@ -863,82 +863,86 @@ pub struct CustomEventContent { #[cfg(feature = "markdown")] pub(crate) fn parse_markdown(text: &str) -> Option { - use pulldown_cmark::{CowStr, Event, Options, Parser, Tag, TagEnd}; + use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd}; const OPTIONS: Options = Options::ENABLE_TABLES.union(Options::ENABLE_STRIKETHROUGH); - let mut found_first_paragraph = false; - let mut previous_event_was_text = false; - let parser_events: Vec<_> = Parser::new_ext(text, OPTIONS) .map(|event| match event { Event::SoftBreak => Event::HardBreak, _ => event, }) .collect(); - let has_markdown = parser_events.iter().any(|ref event| { - // Numeric references should be replaced by their UTF-8 equivalent, so encountering a - // non-borrowed string means that there is markdown syntax. - let is_borrowed_text = matches!(event, Event::Text(CowStr::Borrowed(_))); - if is_borrowed_text { - if previous_event_was_text { - // The text was split, so a character was likely removed, like in the case of - // backslash escapes, or replaced by a static string, like for entity references, so - // there is markdown syntax. - return true; - } else { - previous_event_was_text = true; - } - } else { - previous_event_was_text = false; - } - - // A hard break happens when a newline is encountered, which is not necessarily markdown - // syntax. - let is_break = matches!(event, Event::HardBreak); - - // The parser always wraps the string into a paragraph, so the first paragraph should be - // ignored, it is not due to markdown syntax. - let is_first_paragraph_start = if matches!(event, Event::Start(Tag::Paragraph)) { - if found_first_paragraph { - false - } else { - found_first_paragraph = true; - true - } - } else { - false - }; - let is_paragraph_end = matches!(event, Event::End(TagEnd::Paragraph)); - - !is_borrowed_text && !is_break && !is_first_paragraph_start && !is_paragraph_end - }); - - if !has_markdown { - return None; - } - - // If the markdown contains only inline tags, i.e. if there is only one paragraph, remove the - // wrapping paragraph, as instructed by the spec. + // Text that does not contain markdown syntax is always inline because when we encounter several + // blocks we convert them to HTML. Inline text is always wrapped by a single paragraph. let first_event_is_paragraph_start = parser_events.first().is_some_and(|event| matches!(event, Event::Start(Tag::Paragraph))); let last_event_is_paragraph_end = parser_events.last().is_some_and(|event| matches!(event, Event::End(TagEnd::Paragraph))); - let mut has_several_blocks = true; + let mut is_inline = first_event_is_paragraph_start && last_event_is_paragraph_end; + let mut has_markdown = !is_inline; - if first_event_is_paragraph_start && last_event_is_paragraph_end { - has_several_blocks = parser_events.iter().skip(1).any(|event| { - if let Event::Start(tag) = event { - is_block_tag(tag) - } else { - false + if !has_markdown { + // Check whether the events contain other blocks and whether they contain inline markdown + // syntax. + let mut pos = 0; + + for event in parser_events.iter().skip(1) { + match event { + Event::Text(s) => { + // If the string does not contain markdown, the only modification that should + // happen is that newlines are converted to hardbreaks. It means that we should + // find all the other characters from the original string in the text events. + // Let's check that by walking the original string. + if text[pos..].starts_with(s.as_ref()) { + pos += s.len(); + continue; + } + } + Event::HardBreak => { + // A hard break happens when a newline is encountered, which is not necessarily + // markdown syntax. Skip the newline in the original string for the walking + // above to work. + if text[pos..].starts_with("\r\n") { + pos += 2; + continue; + } else if text[pos..].starts_with(['\r', '\n']) { + pos += 1; + continue; + } + } + // A paragraph end is fine because we would detect markdown from the paragraph + // start. + Event::End(TagEnd::Paragraph) => continue, + // Any other event means there is markdown syntax. + Event::Start(tag) => { + is_inline &= !is_block_tag(tag); + } + _ => {} } - }); + + has_markdown = true; + + // Stop when we also know that there are several blocks. + if !is_inline { + break; + } + } + + // If we are not at the end of the string, some characters were removed. + has_markdown |= pos != text.len(); + } + + // If the string does not contain markdown, don't generate HTML. + if !has_markdown { + return None; } let mut events_iter = parser_events.into_iter(); - if !has_several_blocks { + + // If the content is inline, remove the wrapping paragraph, as instructed by the Matrix spec. + if is_inline { events_iter.next(); events_iter.next_back(); } @@ -1000,10 +1004,14 @@ mod tests { let text = "Hello **world**."; assert_eq!(parse_markdown(text).as_deref(), Some("Hello world.")); - // With backslash escapes. + // Containing backslash escapes. let text = r#"Hello \."#; assert_eq!(parse_markdown(text).as_deref(), Some("Hello <world>.")); + // Starting with backslash escape. + let text = r#"\> Hello world."#; + assert_eq!(parse_markdown(text).as_deref(), Some("> Hello world.")); + // With entity reference. let text = r#"Hello <world>."#; assert_eq!(parse_markdown(text).as_deref(), Some("Hello <world>.")); @@ -1012,4 +1020,78 @@ mod tests { let text = "Hello w⊕rld."; assert_eq!(parse_markdown(text).as_deref(), Some("Hello w⊕rld.")); } + + #[test] + fn detect_commonmark() { + // Examples from the CommonMark spec. + + let text = r#"\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~"#; + assert_eq!( + parse_markdown(text).as_deref(), + Some(r##"!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"##) + ); + + let text = r#"\→\A\a\ \3\φ\«"#; + assert_eq!(parse_markdown(text).as_deref(), None); + + let text = r#"\*not emphasized*"#; + assert_eq!(parse_markdown(text).as_deref(), Some("*not emphasized*")); + + let text = r#"\
not a tag"#; + assert_eq!(parse_markdown(text).as_deref(), Some("<br/> not a tag")); + + let text = r#"\[not a link](/foo)"#; + assert_eq!(parse_markdown(text).as_deref(), Some("[not a link](/foo)")); + + let text = r#"\`not code`"#; + assert_eq!(parse_markdown(text).as_deref(), Some("`not code`")); + + let text = r#"1\. not a list"#; + assert_eq!(parse_markdown(text).as_deref(), Some("1. not a list")); + + let text = r#"\* not a list"#; + assert_eq!(parse_markdown(text).as_deref(), Some("* not a list")); + + let text = r#"\# not a heading"#; + assert_eq!(parse_markdown(text).as_deref(), Some("# not a heading")); + + let text = r#"\[foo]: /url "not a reference""#; + assert_eq!(parse_markdown(text).as_deref(), Some(r#"[foo]: /url "not a reference""#)); + + let text = r#"\ö not a character entity"#; + assert_eq!(parse_markdown(text).as_deref(), Some("&ouml; not a character entity")); + + let text = r#"\\*emphasis*"#; + assert_eq!(parse_markdown(text).as_deref(), Some(r#"\emphasis"#)); + + let text = "foo\\\nbar"; + assert_eq!(parse_markdown(text).as_deref(), Some("foo
\nbar")); + + let text = " ***\n ***\n ***"; + assert_eq!(parse_markdown(text).as_deref(), Some("
\n
\n
\n")); + + let text = "Foo\n***\nbar"; + assert_eq!(parse_markdown(text).as_deref(), Some("

Foo

\n
\n

bar

\n")); + + let text = "\n*foo*"; + assert_eq!(parse_markdown(text).as_deref(), Some("\n*foo*")); + + let text = "
\n*foo*\n\n*bar*"; + assert_eq!(parse_markdown(text).as_deref(), Some("
\n*foo*\n

bar

\n")); + + let text = "aaa\nbbb\n\nccc\nddd"; + assert_eq!( + parse_markdown(text).as_deref(), + Some("

aaa
\nbbb

\n

ccc
\nddd

\n") + ); + + let text = " aaa\n bbb"; + assert_eq!(parse_markdown(text).as_deref(), Some("aaa
\nbbb")); + + let text = "aaa\n bbb\n ccc"; + assert_eq!(parse_markdown(text).as_deref(), Some("aaa
\nbbb
\nccc")); + + let text = "aaa \nbbb "; + assert_eq!(parse_markdown(text).as_deref(), Some("aaa
\nbbb")); + } } diff --git a/crates/ruma-events/tests/it/room_message.rs b/crates/ruma-events/tests/it/room_message.rs index 24f1f41f..37404c70 100644 --- a/crates/ruma-events/tests/it/room_message.rs +++ b/crates/ruma-events/tests/it/room_message.rs @@ -205,9 +205,9 @@ fn markdown_detection() { "

A message
\nwith

\n

multiple

\n

paragraphs

\n" ); - // "Less than" symbol triggers markdown. - let formatted_body = FormattedBody::markdown("A message with & HTML < entities").unwrap(); - assert_eq!(formatted_body.body, "A message with & HTML < entities"); + // HTML reserved symbols do not trigger markdown. + let formatted_body = FormattedBody::markdown("A message with & HTML < entities"); + assert_matches!(formatted_body, None); // HTML triggers markdown. let formatted_body = FormattedBody::markdown("An HTML message").unwrap();