events: Improve markdown syntax detection

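Backslash escapes and entity references (named and numeric) are now also detected as markdown syntax when deciding whether the text should be sent as HTML.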
This commit is contained in:
Kévin Commaille 2024-09-05 12:46:23 +02:00 committed by Kévin Commaille
parent 0ea496b138
commit dac38e4e17
3 changed files with 67 additions and 5 deletions

View File

@ -7,6 +7,8 @@ Bug fixes:
- Fix serialization of `room::message::Relation` and `room::encrypted::Relation` - Fix serialization of `room::message::Relation` and `room::encrypted::Relation`
which could cause duplicate `rel_type` keys. which could cause duplicate `rel_type` keys.
- `Restricted` no longer fails to deserialize when the `allow` field is missing - `Restricted` no longer fails to deserialize when the `allow` field is missing
- Markdown text constructors now also detect markdown syntax like backslash
escapes and entity references to decide if the text should be sent as HTML.
Improvements: Improvements:

View File

@ -858,11 +858,12 @@ pub struct CustomEventContent {
#[cfg(feature = "markdown")] #[cfg(feature = "markdown")]
pub(crate) fn parse_markdown(text: &str) -> Option<String> { pub(crate) fn parse_markdown(text: &str) -> Option<String> {
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd}; use pulldown_cmark::{CowStr, Event, Options, Parser, Tag, TagEnd};
const OPTIONS: Options = Options::ENABLE_TABLES.union(Options::ENABLE_STRIKETHROUGH); const OPTIONS: Options = Options::ENABLE_TABLES.union(Options::ENABLE_STRIKETHROUGH);
let mut found_first_paragraph = false; let mut found_first_paragraph = false;
let mut previous_event_was_text = false;
let parser_events: Vec<_> = Parser::new_ext(text, OPTIONS) let parser_events: Vec<_> = Parser::new_ext(text, OPTIONS)
.map(|event| match event { .map(|event| match event {
@ -871,8 +872,29 @@ pub(crate) fn parse_markdown(text: &str) -> Option<String> {
}) })
.collect(); .collect();
let has_markdown = parser_events.iter().any(|ref event| { let has_markdown = parser_events.iter().any(|ref event| {
let is_text = matches!(event, Event::Text(_)); // Numeric references should be replaced by their UTF-8 equivalent, so encountering a
// non-borrowed string means that there is markdown syntax.
let is_borrowed_text = matches!(event, Event::Text(CowStr::Borrowed(_)));
if is_borrowed_text {
if previous_event_was_text {
// The text was split, so a character was likely removed, like in the case of
// backslash escapes, or replaced by a static string, like for entity references, so
// there is markdown syntax.
return true;
} else {
previous_event_was_text = true;
}
} else {
previous_event_was_text = false;
}
// A hard break happens when a newline is encountered, which is not necessarily markdown
// syntax.
let is_break = matches!(event, Event::HardBreak); let is_break = matches!(event, Event::HardBreak);
// The parser always wraps the string into a paragraph, so the first paragraph should be
// ignored, it is not due to markdown syntax.
let is_first_paragraph_start = if matches!(event, Event::Start(Tag::Paragraph)) { let is_first_paragraph_start = if matches!(event, Event::Start(Tag::Paragraph)) {
if found_first_paragraph { if found_first_paragraph {
false false
@ -885,7 +907,7 @@ pub(crate) fn parse_markdown(text: &str) -> Option<String> {
}; };
let is_paragraph_end = matches!(event, Event::End(TagEnd::Paragraph)); let is_paragraph_end = matches!(event, Event::End(TagEnd::Paragraph));
!is_text && !is_break && !is_first_paragraph_start && !is_paragraph_end !is_borrowed_text && !is_break && !is_first_paragraph_start && !is_paragraph_end
}); });
if !has_markdown { if !has_markdown {
@ -897,3 +919,41 @@ pub(crate) fn parse_markdown(text: &str) -> Option<String> {
Some(html_body) Some(html_body)
} }
#[cfg(all(test, feature = "markdown"))]
mod tests {
    use assert_matches2::assert_matches;

    use super::parse_markdown;

    /// Checks which inputs `parse_markdown` reports as containing markdown syntax.
    #[test]
    fn detect_markdown() {
        // Inputs expected to yield `None`: a single line, then two lines separated by a
        // single newline.
        for plain in ["Hello world.", "Hello\nworld."] {
            assert_matches!(parse_markdown(plain), None);
        }

        // Inputs expected to yield `Some(_)`.
        let with_markdown = [
            // A blank line starts a new paragraph.
            "Hello\n\nworld.",
            // A tagged element.
            "Hello **world**.",
            // Backslash escapes.
            r#"Hello \<world\>."#,
            // Entity references.
            r#"Hello &lt;world&gt;."#,
            // A numeric reference.
            "Hello w&#8853;rld.",
        ];
        for text in with_markdown {
            assert_matches!(parse_markdown(text), Some(_));
        }
    }
}

View File

@ -201,9 +201,9 @@ fn markdown_detection() {
let formatted_body = FormattedBody::markdown("A message\nwith\n\nmultiple\n\nparagraphs"); let formatted_body = FormattedBody::markdown("A message\nwith\n\nmultiple\n\nparagraphs");
formatted_body.unwrap(); formatted_body.unwrap();
// HTML entities don't trigger markdown. // "Less than" symbol triggers markdown.
let formatted_body = FormattedBody::markdown("A message with & HTML < entities"); let formatted_body = FormattedBody::markdown("A message with & HTML < entities");
assert_matches!(formatted_body, None); assert_matches!(formatted_body, Some(_));
// HTML triggers markdown. // HTML triggers markdown.
let formatted_body = FormattedBody::markdown("<span>An HTML message</span>"); let formatted_body = FormattedBody::markdown("<span>An HTML message</span>");