events: Improve markdown syntax detection

We also detect backslash escapes and entity references.
This commit is contained in:
Kévin Commaille 2024-09-05 12:46:23 +02:00 committed by Kévin Commaille
parent 0ea496b138
commit dac38e4e17
3 changed files with 67 additions and 5 deletions

View File

@ -7,6 +7,8 @@ Bug fixes:
- Fix serialization of `room::message::Relation` and `room::encrypted::Relation`
which could cause duplicate `rel_type` keys.
- `Restricted` no longer fails to deserialize when the `allow` field is missing
- Markdown text constructors now also detect markdown syntax like backslash
escapes and entity references to decide if the text should be sent as HTML.
Improvements:

View File

@ -858,11 +858,12 @@ pub struct CustomEventContent {
#[cfg(feature = "markdown")]
pub(crate) fn parse_markdown(text: &str) -> Option<String> {
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
use pulldown_cmark::{CowStr, Event, Options, Parser, Tag, TagEnd};
const OPTIONS: Options = Options::ENABLE_TABLES.union(Options::ENABLE_STRIKETHROUGH);
let mut found_first_paragraph = false;
let mut previous_event_was_text = false;
let parser_events: Vec<_> = Parser::new_ext(text, OPTIONS)
.map(|event| match event {
@ -871,8 +872,29 @@ pub(crate) fn parse_markdown(text: &str) -> Option<String> {
})
.collect();
let has_markdown = parser_events.iter().any(|ref event| {
let is_text = matches!(event, Event::Text(_));
// Numeric references should be replaced by their UTF-8 equivalent, so encountering a
// non-borrowed string means that there is markdown syntax.
let is_borrowed_text = matches!(event, Event::Text(CowStr::Borrowed(_)));
if is_borrowed_text {
if previous_event_was_text {
// The text was split, so a character was likely removed, like in the case of
// backslash escapes, or replaced by a static string, like for entity references, so
// there is markdown syntax.
return true;
} else {
previous_event_was_text = true;
}
} else {
previous_event_was_text = false;
}
// A hard break happens when a newline is encountered, which is not necessarily markdown
// syntax.
let is_break = matches!(event, Event::HardBreak);
// The parser always wraps the string into a paragraph, so the first paragraph should be
// ignored, it is not due to markdown syntax.
let is_first_paragraph_start = if matches!(event, Event::Start(Tag::Paragraph)) {
if found_first_paragraph {
false
@ -885,7 +907,7 @@ pub(crate) fn parse_markdown(text: &str) -> Option<String> {
};
let is_paragraph_end = matches!(event, Event::End(TagEnd::Paragraph));
!is_text && !is_break && !is_first_paragraph_start && !is_paragraph_end
!is_borrowed_text && !is_break && !is_first_paragraph_start && !is_paragraph_end
});
if !has_markdown {
@ -897,3 +919,41 @@ pub(crate) fn parse_markdown(text: &str) -> Option<String> {
Some(html_body)
}
#[cfg(all(test, feature = "markdown"))]
mod tests {
use assert_matches2::assert_matches;
use super::parse_markdown;
#[test]
fn detect_markdown() {
// Simple single-line text.
let text = "Hello world.";
assert_matches!(parse_markdown(text), None);
// Simple double-line text.
let text = "Hello\nworld.";
assert_matches!(parse_markdown(text), None);
// With new paragraph.
let text = "Hello\n\nworld.";
assert_matches!(parse_markdown(text), Some(_));
// With tagged element.
let text = "Hello **world**.";
assert_matches!(parse_markdown(text), Some(_));
// With backslash escapes.
let text = r#"Hello \<world\>."#;
assert_matches!(parse_markdown(text), Some(_));
// With entity reference.
let text = r#"Hello &lt;world&gt;."#;
assert_matches!(parse_markdown(text), Some(_));
// With numeric reference.
let text = "Hello w&#8853;rld.";
assert_matches!(parse_markdown(text), Some(_));
}
}

View File

@ -201,9 +201,9 @@ fn markdown_detection() {
let formatted_body = FormattedBody::markdown("A message\nwith\n\nmultiple\n\nparagraphs");
formatted_body.unwrap();
// HTML entities don't trigger markdown.
// "Less than" symbol triggers markdown.
let formatted_body = FormattedBody::markdown("A message with & HTML < entities");
assert_matches!(formatted_body, None);
assert_matches!(formatted_body, Some(_));
// HTML triggers markdown.
let formatted_body = FormattedBody::markdown("<span>An HTML message</span>");