push: Allow wildcards when matching words

2022-06-23 15:57:54 +02:00 · 2022-06-23 15:57:54 +02:00 · d192184b3c
commit d192184b3c
parent 33e1a20c4b
3 changed files with 126 additions and 33 deletions
--- a/crates/ruma-common/CHANGELOG.md
+++ b/crates/ruma-common/CHANGELOG.md
@ -4,6 +4,9 @@ Bug fixes:

 * Expose `MatrixIdError`, `MatrixToError`, `MatrixUriError` and `MxcUriError` at
  the crate root
+* Allow wildcards for push conditions on `content.body`
+  * The spec clarified the behavior of the `event_match` condition:
+    <https://github.com/matrix-org/matrix-spec-proposals/pull/3690> 

 Breaking changes:

--- a/crates/ruma-common/Cargo.toml
+++ b/crates/ruma-common/Cargo.toml
@ -61,6 +61,7 @@ js_option = "0.1.0"
 percent-encoding = "2.1.0"
 pulldown-cmark = { version = "0.9.1", default-features = false, optional = true }
 rand_crate = { package = "rand", version = "0.8.3", optional = true }
+regex = { version = "1.5.6", default-features = false, features = ["std", "perf"] }
 ruma-identifiers-validation = { version = "0.8.1", path = "../ruma-identifiers-validation", default-features = false }
 ruma-macros = { version = "0.9.2", path = "../ruma-macros" }
 serde = { version = "1.0.118", features = ["derive"] }
--- a/crates/ruma-common/src/push/condition.rs
+++ b/crates/ruma-common/src/push/condition.rs
@ -1,6 +1,7 @@
 use std::{collections::BTreeMap, ops::RangeBounds, str::FromStr};

 use js_int::{Int, UInt};
+use regex::Regex;
 use serde::{Deserialize, Serialize};
 use serde_json::{to_value as to_json_value, value::Value as JsonValue};
 use tracing::{instrument, warn};
@ -12,6 +13,13 @@ mod room_member_count_is;

 pub use room_member_count_is::{ComparisonOperator, RoomMemberCountIs};

+/// The characters that are defined as a word boundary in the [Matrix spec].
+///
+/// Any character not in the sets `[A-Z]`, `[a-z]`, `[0-9]` or `_`.
+///
+/// [Matrix spec]: https://spec.matrix.org/v1.3/client-server-api/#conditions-1
+const WORD_BOUNDARY_CHARACTERS: &str = "[^A-Za-z0-9_]";
+
 /// A condition that must apply for an associated push rule's action to be taken.
 #[derive(Clone, Debug, Deserialize, Serialize)]
 #[cfg_attr(not(feature = "unstable-exhaustive-types"), non_exhaustive)]
@ -169,17 +177,27 @@ trait StrExt {

    /// Matches this string against `pattern`.
    ///
+    /// The pattern can be a glob with wildcards `*` and `?`.
+    ///
    /// The match is case insensitive.
    ///
-    /// If `match_words` is `true`, looks for `pattern` as a substring of `self`,
-    /// and checks that it is separated from other words. Otherwise, checks
-    /// `pattern` as a glob with wildcards `*` and `?`.
+    /// If `match_words` is `true`, checks that the pattern is separated from other words.
    fn matches_pattern(&self, pattern: &str, match_words: bool) -> bool;

    /// Matches this string against `pattern`, with word boundaries.
    ///
+    /// The pattern can be a glob with wildcards `*` and `?`.
+    ///
+    /// A word boundary is defined as the start or end of the value, or any character not in the
+    /// sets `[A-Z]`, `[a-z]`, `[0-9]` or `_`.
+    ///
    /// The match is case sensitive.
    fn matches_word(&self, pattern: &str) -> bool;
+
+    /// Translate the wildcards in `self` to a regex syntax.
+    ///
+    /// `self` must only contain wildcards.
+    fn wildcards_to_regex(&self) -> String;
 }

 impl StrExt for str {
@ -229,40 +247,92 @@ impl StrExt for str {
            return false;
        }

-        match self.find(pattern) {
-            Some(start) => {
-                let end = start + pattern.len();
+        let has_wildcards = pattern.contains(|c| matches!(c, '?' | '*'));

-                // Look if the match has word boundaries.
-                let word_boundary_start = !self.char_at(start).is_word_char()
-                    || self.find_prev_char(start).map_or(true, |c| !c.is_word_char());
+        if has_wildcards {
+            let mut chunks: Vec<String> = vec![];
+            let mut prev_wildcard = false;
+            let mut chunk_start = 0;

-                if word_boundary_start {
-                    let word_boundary_end = end == self.len()
-                        || !self.find_prev_char(end).unwrap().is_word_char()
-                        || !self.char_at(end).is_word_char();
-
-                    if word_boundary_end {
-                        return true;
+            for (i, c) in pattern.char_indices() {
+                if matches!(c, '?' | '*') && !prev_wildcard {
+                    if i != 0 {
+                        chunks.push(regex::escape(&pattern[chunk_start..i]));
+                        chunk_start = i;
                    }
+
+                    prev_wildcard = true;
+                } else if prev_wildcard {
+                    let chunk = &pattern[chunk_start..i];
+                    chunks.push(chunk.wildcards_to_regex());
+
+                    chunk_start = i;
+                    prev_wildcard = false;
                }
-
-                // Find next word.
-                let non_word_str = &self[start..];
-                let non_word = match non_word_str.find(|c: char| !c.is_word_char()) {
-                    Some(pos) => pos,
-                    None => return false,
-                };
-
-                let word_str = &non_word_str[non_word..];
-                let word = match word_str.find(|c: char| c.is_word_char()) {
-                    Some(pos) => pos,
-                    None => return false,
-                };
-
-                word_str[word..].matches_word(pattern)
            }
-            None => false,
+
+            let len = pattern.len();
+            if !prev_wildcard {
+                chunks.push(regex::escape(&pattern[chunk_start..len]));
+            } else if prev_wildcard {
+                let chunk = &pattern[chunk_start..len];
+                chunks.push(chunk.wildcards_to_regex());
+            }
+
+            let regex = format!(
+                "(?:^|{WORD_BOUNDARY_CHARACTERS}){}(?:{WORD_BOUNDARY_CHARACTERS}|$)",
+                chunks.concat()
+            );
+            Regex::new(&regex).ok().filter(|re| re.is_match(self)).is_some()
+        } else {
+            match self.find(pattern) {
+                Some(start) => {
+                    let end = start + pattern.len();
+
+                    // Look if the match has word boundaries.
+                    let word_boundary_start = !self.char_at(start).is_word_char()
+                        || self.find_prev_char(start).map_or(true, |c| !c.is_word_char());
+
+                    if word_boundary_start {
+                        let word_boundary_end = end == self.len()
+                            || !self.find_prev_char(end).unwrap().is_word_char()
+                            || !self.char_at(end).is_word_char();
+
+                        if word_boundary_end {
+                            return true;
+                        }
+                    }
+
+                    // Find next word.
+                    let non_word_str = &self[start..];
+                    let non_word = match non_word_str.find(|c: char| !c.is_word_char()) {
+                        Some(pos) => pos,
+                        None => return false,
+                    };
+
+                    let word_str = &non_word_str[non_word..];
+                    let word = match word_str.find(|c: char| c.is_word_char()) {
+                        Some(pos) => pos,
+                        None => return false,
+                    };
+
+                    word_str[word..].matches_word(pattern)
+                }
+                None => false,
+            }
+        }
+    }
+
+    fn wildcards_to_regex(&self) -> String {
+        // Simplify pattern to avoid performance issues:
+        // - The glob `?**?**?` is equivalent to the glob `???*`
+        // - The glob `???*` is equivalent to the regex `.{3,}`
+        let question_marks = self.matches('?').count();
+
+        if self.contains('*') {
+            format!(".{{{question_marks},}}")
+        } else {
+            format!(".{{{question_marks}}}")
        }
    }
 }
@ -443,6 +513,19 @@ mod tests {
        assert!("Ruma Dev👩‍💻".matches_word("Dev"));
        assert!("Ruma Dev👩‍💻".matches_word("👩‍💻"));
        assert!("Ruma Dev👩‍💻".matches_word("Dev👩‍💻"));
+
+        // Regex syntax is escaped
+        assert!(!"matrix".matches_word(r"\w*"));
+        assert!(r"\w".matches_word(r"\w*"));
+        assert!(!"matrix".matches_word("[a-z]*"));
+        assert!("[a-z] and [0-9]".matches_word("[a-z]*"));
+        assert!(!"m".matches_word("[[:alpha:]]?"));
+        assert!("[[:alpha:]]!".matches_word("[[:alpha:]]?"));
+
+        // From the spec: <https://spec.matrix.org/v1.3/client-server-api/#conditions-1>
+        assert!("An example event.".matches_word("ex*ple"));
+        assert!("exple".matches_word("ex*ple"));
+        assert!("An exciting triple-whammy".matches_word("ex*ple"));
    }

    #[test]
@ -451,7 +534,7 @@ mod tests {
        assert!("foo bar".matches_pattern("foo", true));
        assert!("Foo bar".matches_pattern("foo", true));
        assert!(!"foobar".matches_pattern("foo", true));
-        assert!(!"foo bar".matches_pattern("foo*", true));
+        assert!("foo bar".matches_pattern("foo*", true));
        assert!("".matches_pattern("", true));
        assert!(!"foo".matches_pattern("", true));

@ -467,6 +550,12 @@ mod tests {
        assert!("".matches_pattern("", false));
        assert!("".matches_pattern("*", false));
        assert!(!"foo".matches_pattern("", false));
+
+        // From the spec: <https://spec.matrix.org/v1.3/client-server-api/#conditions-1>
+        assert!("Lunch plans".matches_pattern("lunc?*", false));
+        assert!("LUNCH".matches_pattern("lunc?*", false));
+        assert!(!" lunch".matches_pattern("lunc?*", false));
+        assert!(!"lunc".matches_pattern("lunc?*", false));
    }

    #[test]