html: Add more settings to SanitizerConfig

Allows it to be used with a whitelist or a blacklist,
and to extend or reduce the lists from the Matrix spec.
This commit is contained in:
Kévin Commaille 2024-04-30 11:59:15 +02:00 committed by Kévin Commaille
parent e161a57eda
commit bd56071587
6 changed files with 1525 additions and 341 deletions

View File

@ -1,6 +1,6 @@
//! Convenience methods and types to sanitize HTML messages.
use crate::{Html, SanitizerConfig};
use crate::{Html, HtmlSanitizerMode, SanitizerConfig};
/// Sanitize the given HTML string.
///
@ -27,20 +27,6 @@ pub fn sanitize_html(
sanitize_inner(s, &config)
}
/// What HTML [tags and attributes] should be kept by the sanitizer.
///
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[allow(clippy::exhaustive_enums)]
pub enum HtmlSanitizerMode {
/// Keep only the tags and attributes listed in the Matrix specification.
Strict,
/// Like `Strict` mode, with additional tags and attributes that are not yet included in
/// the spec, but are reasonable to keep.
Compat,
}
/// Whether to remove the [rich reply fallback] while sanitizing.
///
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies

View File

@ -9,7 +9,9 @@ use ruma_common::{
IdParseError, MatrixToError, MatrixToUri, MatrixUri, MatrixUriError, MxcUri, OwnedMxcUri,
};
use crate::sanitizer_config::ALLOWED_SCHEMES_A_HREF_COMPAT;
use crate::sanitizer_config::clean::{
ALLOWED_SCHEMES_A_HREF_COMPAT, ALLOWED_SCHEMES_A_HREF_STRICT,
};
const CLASS_LANGUAGE_PREFIX: &str = "language-";
@ -417,8 +419,9 @@ impl AnchorUri {
let s = value.as_ref();
// Check if it starts with a supported scheme.
if !ALLOWED_SCHEMES_A_HREF_COMPAT.iter().any(|scheme| s.starts_with(&format!("{scheme}:")))
{
let mut allowed_schemes =
ALLOWED_SCHEMES_A_HREF_STRICT.iter().chain(ALLOWED_SCHEMES_A_HREF_COMPAT.iter());
if !allowed_schemes.any(|scheme| s.starts_with(&format!("{scheme}:"))) {
return None;
}

View File

@ -21,4 +21,27 @@ mod helpers;
mod html;
mod sanitizer_config;
pub use self::{helpers::*, html::*, sanitizer_config::SanitizerConfig};
pub use self::{helpers::*, html::*, sanitizer_config::*};
/// What [HTML elements and attributes] should be kept by the sanitizer.
///
/// A mode is the base of a [`SanitizerConfig`], see [`SanitizerConfig::with_mode()`].
///
/// [HTML elements and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[allow(clippy::exhaustive_enums)]
pub enum HtmlSanitizerMode {
/// Keep only the elements and attributes suggested in the Matrix specification.
///
/// In addition to filtering elements and attributes listed in the Matrix specification, it
/// also removes elements that are nested more than 100 levels deep.
///
/// Deprecated elements and attributes are also replaced when applicable.
Strict,
/// Like `Strict` mode, with additional elements and attributes that are not yet included in
/// the spec, but are reasonable to keep.
///
/// Differences with `Strict` mode:
///
/// * The `matrix` scheme is allowed in links.
Compat,
}

View File

@ -1,375 +1,498 @@
use html5ever::{tendril::StrTendril, Attribute, LocalName};
use phf::{phf_map, phf_set, Map, Set};
use wildmatch::WildMatch;
#![allow(clippy::disallowed_types)]
use crate::html::{ElementData, Html, NodeData};
use std::collections::{HashMap, HashSet};
/// Configuration to sanitize HTML tags and attributes.
pub(crate) mod clean;
use crate::HtmlSanitizerMode;
/// Configuration to sanitize HTML elements and attributes.
#[derive(Debug, Default, Clone)]
pub struct SanitizerConfig {
/// The allowed HTML tags.
/// The mode of the sanitizer, if any.
mode: Option<HtmlSanitizerMode>,
/// Change to the list of elements to replace.
///
/// If this is `None`, all tags are allowed.
allowed_tags: Option<&'static Set<&'static str>>,
/// The content is a map of element name to their replacement's element name.
replace_elements: Option<List<HashMap<&'static str, &'static str>>>,
/// The allowed deprecated HTML tags.
///
/// This is a map of allowed deprecated tag to their replacement tag.
deprecated_tags: Option<&'static Map<&'static str, &'static str>>,
/// Elements to remove.
remove_elements: Option<HashSet<&'static str>>,
/// The allowed attributes per tag.
///
/// If this is `None`, all attributes are allowed.
allowed_attrs: Option<&'static Map<&'static str, &'static Set<&'static str>>>,
/// The allowed deprecated attributes per tag.
///
/// This is a map of tag to a map of allowed deprecated attribute to their replacement
/// attribute.
deprecated_attrs: Option<&'static Map<&'static str, &'static Map<&'static str, &'static str>>>,
/// The allowed URI schemes per tag.
///
/// If this is `None`, all schemes are allowed.
allowed_schemes: Option<&'static Map<&'static str, &'static Set<&'static str>>>,
/// The allowed classes per tag.
///
/// If this is `None`, all classes are allowed.
allowed_classes: Option<&'static Map<&'static str, &'static Set<&'static str>>>,
/// The maximum nesting level of the tags.
max_depth: Option<u32>,
/// Whether to remove rich reply fallback.
/// Whether to remove the rich reply fallback.
remove_reply_fallback: bool,
/// Elements to ignore.
ignore_elements: Option<HashSet<&'static str>>,
/// Change to the list of elements to allow.
allow_elements: Option<List<HashSet<&'static str>>>,
/// Change to the list of attributes to replace per element.
///
/// The content is a map of element name to a map of attribute name to their replacement's
/// attribute name.
replace_attrs: Option<List<HashMap<&'static str, HashMap<&'static str, &'static str>>>>,
/// Removed attributes per element.
remove_attrs: Option<HashMap<&'static str, HashSet<&'static str>>>,
/// Change to the list of allowed attributes per element.
allow_attrs: Option<List<HashMap<&'static str, HashSet<&'static str>>>>,
/// Denied URI schemes per attribute per element.
///
/// The content is a map of element name to a map of attribute name to a set of schemes.
deny_schemes: Option<HashMap<&'static str, HashMap<&'static str, HashSet<&'static str>>>>,
/// Change to the list of allowed URI schemes per attribute per element.
///
/// The content is a map of element name to a map of attribute name to a set of schemes.
#[allow(clippy::type_complexity)]
allow_schemes:
Option<List<HashMap<&'static str, HashMap<&'static str, HashSet<&'static str>>>>>,
/// Removed classes per element.
///
/// The content is a map of element name to a set of classes.
remove_classes: Option<HashMap<&'static str, HashSet<&'static str>>>,
/// Change to the list of allowed classes per element.
///
/// The content is a map of element name to a set of classes.
allow_classes: Option<List<HashMap<&'static str, HashSet<&'static str>>>>,
/// Maximum nesting level of the elements.
max_depth: Option<u32>,
}
impl SanitizerConfig {
/// Constructs an empty `SanitizerConfig` that will not filter any tag or attribute.
/// Constructs an empty `SanitizerConfig` that will not filter any element or attribute.
///
/// The list of allowed and replaced elements can be changed with [`Self::allow_elements()`],
/// [`Self::replace_elements()`], [`Self::ignore_elements()`], [`Self::remove_elements()`],
/// [`Self::remove_reply_fallback()`].
///
/// The list of allowed and replaced attributes can be changed with
/// [`Self::allow_attributes()`], [`Self::replace_attributes()`],
/// [`Self::remove_attributes()`], [`Self::allow_schemes()`], [`Self::deny_schemes()`],
/// [`Self::allow_classes()`], [`Self::remove_classes()`].
pub fn new() -> Self {
Self::default()
}
/// Constructs a `SanitizerConfig` that will filter tags or attributes not [listed in the
/// Matrix specification].
/// Constructs a `SanitizerConfig` with the given mode for filtering elements and attributes.
///
/// Deprecated tags will be replaced with their non-deprecated equivalent.
/// The mode defines the basic list of allowed and replaced elements and attributes and the
/// maximum nesting level of elements.
///
/// It will not remove the reply fallback by default.
/// The list of allowed and replaced elements can be changed with [`Self::allow_elements()`],
/// [`Self::replace_elements()`], [`Self::ignore_elements()`], [`Self::remove_elements()`],
/// [`Self::remove_reply_fallback()`].
///
/// [listed in the Matrix specification]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
pub fn strict() -> Self {
Self {
allowed_tags: Some(&ALLOWED_TAGS_WITHOUT_REPLY_STRICT),
deprecated_tags: Some(&DEPRECATED_TAGS),
allowed_attrs: Some(&ALLOWED_ATTRIBUTES_STRICT),
deprecated_attrs: Some(&DEPRECATED_ATTRS),
allowed_schemes: Some(&ALLOWED_SCHEMES_STRICT),
allowed_classes: Some(&ALLOWED_CLASSES_STRICT),
max_depth: Some(MAX_DEPTH_STRICT),
remove_reply_fallback: false,
}
/// The list of allowed and replaced attributes can be changed with
/// [`Self::allow_attributes()`], [`Self::replace_attributes()`],
/// [`Self::remove_attributes()`], [`Self::allow_schemes()`], [`Self::deny_schemes()`],
/// [`Self::allow_classes()`], [`Self::remove_classes()`].
pub fn with_mode(mode: HtmlSanitizerMode) -> Self {
Self { mode: Some(mode), ..Default::default() }
}
/// Constructs a `SanitizerConfig` that will filter tags or attributes not [listed in the
/// Matrix specification], except a few for improved compatibility:
/// Constructs a `SanitizerConfig` that will filter elements and attributes not [suggested in
/// the Matrix specification].
///
/// - The `matrix` scheme is allowed in links.
/// The list of allowed and replaced elements can be changed with [`Self::allow_elements()`],
/// [`Self::replace_elements()`], [`Self::ignore_elements()`], [`Self::remove_elements()`],
/// [`Self::remove_reply_fallback()`].
///
/// Deprecated tags will be replaced with their non-deprecated equivalent.
/// The list of allowed and replaced attributes can be changed with
/// [`Self::allow_attributes()`], [`Self::replace_attributes()`],
/// [`Self::remove_attributes()`], [`Self::allow_schemes()`], [`Self::deny_schemes()`],
/// [`Self::allow_classes()`], [`Self::remove_classes()`].
///
/// It will not remove the reply fallback by default.
/// This is the same as calling `SanitizerConfig::with_mode(HtmlSanitizerMode::Strict)`.
///
/// [suggested in the Matrix specification]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
pub fn strict() -> Self {
Self::with_mode(HtmlSanitizerMode::Strict)
}
/// Constructs a `SanitizerConfig` that will filter elements and attributes not [suggested in
/// the Matrix specification], except a few for improved compatibility:
///
/// * The `matrix` scheme is allowed in links.
///
/// The list of allowed elements can be changed with [`Self::allow_elements()`],
/// [`Self::replace_elements()`], [`Self::ignore_elements()`], [`Self::remove_elements()`],
/// [`Self::remove_reply_fallback()`].
///
/// The list of allowed attributes can be changed with [`Self::allow_attributes()`],
/// [`Self::replace_attributes()`], [`Self::remove_attributes()`], [`Self::allow_schemes()`],
/// [`Self::deny_schemes()`], [`Self::allow_classes()`], [`Self::remove_classes()`].
///
/// This is the same as calling `SanitizerConfig::with_mode(HtmlSanitizerMode::Compat)`.
///
/// [suggested in the Matrix specification]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
pub fn compat() -> Self {
Self { allowed_schemes: Some(&ALLOWED_SCHEMES_COMPAT), ..Self::strict() }
Self::with_mode(HtmlSanitizerMode::Compat)
}
/// Change the list of replaced HTML elements.
///
/// The given list is added to or replaces the list of replacements of the current mode,
/// depending on the [`ListBehavior`].
///
/// The replacement occurs before the removal, so the replaced element should not be in
/// the allowed list of elements, but the replacement element should.
///
/// # Parameters
///
/// * `elements`: The list of element names replacements.
pub fn replace_elements(
mut self,
elements: impl IntoIterator<Item = NameReplacement>,
behavior: ListBehavior,
) -> Self {
let content = elements.into_iter().map(|r| r.to_tuple()).collect();
self.replace_elements = Some(List { content, behavior });
self
}
/// Set the HTML elements to remove.
///
/// A removed element is dropped together with its children. To drop an element but keep its
/// children, use [`SanitizerConfig::ignore_elements`] or [`SanitizerConfig::allow_elements`]
/// instead.
///
/// Removing has a higher priority than ignoring or allowing: an element in this list is
/// always removed.
///
/// # Parameters
///
/// * `elements`: The names of the elements to remove.
pub fn remove_elements(mut self, elements: impl IntoIterator<Item = &'static str>) -> Self {
    let removed: HashSet<&'static str> = elements.into_iter().collect();
    self.remove_elements = Some(removed);
    self
}
/// Remove the [rich reply fallback].
///
/// Calling this removes the `mx-reply` element in addition to the list of elements to remove.
///
/// Removing elements has a higher priority than ignoring or allowing. So if this setting is
/// set, `mx-reply` will always be removed.
///
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
pub fn remove_reply_fallback(mut self) -> Self {
    self.remove_reply_fallback = true;
    self
}
/// Clean the given HTML with this sanitizer.
pub(crate) fn clean(&self, html: &mut Html) {
let root = html.root();
let mut next_child = root.first_child;
while let Some(child) = next_child {
next_child = html.nodes[child].next_sibling;
self.clean_node(html, child, 0);
}
/// Ignore the given HTML elements.
///
/// When an element is ignored, the element is dropped and replaced by its children. If you want
/// to drop an element and its children, use [`SanitizerConfig::remove_elements`].
///
/// Ignoring elements has a lower priority than removing but a higher priority than allowing.
///
/// # Parameters
///
/// * `elements`: The list of element names to ignore.
pub fn ignore_elements(mut self, elements: impl IntoIterator<Item = &'static str>) -> Self {
    self.ignore_elements = Some(elements.into_iter().collect());
    self
}
fn clean_node(&self, html: &mut Html, node_id: usize, depth: u32) {
self.apply_deprecations(html, node_id);
let action = self.node_action(html, node_id, depth);
if action != NodeAction::Remove {
let mut next_child = html.nodes[node_id].first_child;
while let Some(child) = next_child {
next_child = html.nodes[child].next_sibling;
if action == NodeAction::Ignore {
html.insert_before(node_id, child);
}
self.clean_node(html, child, depth + 1);
}
}
if matches!(action, NodeAction::Ignore | NodeAction::Remove) {
html.detach(node_id);
} else if let Some(data) = html.nodes[node_id].as_element_mut() {
self.clean_element_attributes(data);
}
/// Set the HTML elements that are allowed.
///
/// Depending on the [`ListBehavior`], the given elements are added to the allowed elements of
/// the current mode, or they replace them.
///
/// An element that is not allowed is ignored. If no mode is set and no elements are explicitly
/// allowed, all elements are allowed.
///
/// # Parameters
///
/// * `elements`: The names of the allowed elements.
/// * `behavior`: Whether the given list is added to or overrides the list of the current mode.
pub fn allow_elements(
    mut self,
    elements: impl IntoIterator<Item = &'static str>,
    behavior: ListBehavior,
) -> Self {
    let allowed: HashSet<&'static str> = elements.into_iter().collect();
    self.allow_elements = Some(List { content: allowed, behavior });
    self
}
fn apply_deprecations(&self, html: &mut Html, node_id: usize) {
if let NodeData::Element(ElementData { name, attrs, .. }) = &mut html.nodes[node_id].data {
let tag: &str = &name.local;
if let Some(deprecated_attrs) =
self.deprecated_attrs.and_then(|deprecated_attrs| deprecated_attrs.get(tag))
{
*attrs = attrs
.clone()
.into_iter()
.map(|mut attr| {
let attr_name: &str = &attr.name.local;
let attr_replacement =
deprecated_attrs.get(attr_name).map(|s| LocalName::from(*s));
if let Some(attr_replacement) = attr_replacement {
attr.name.local = attr_replacement;
}
attr
})
.collect();
}
let tag_replacement = self
.deprecated_tags
.and_then(|deprecated_tags| deprecated_tags.get(tag))
.map(|s| LocalName::from(*s));
if let Some(tag_replacement) = tag_replacement {
name.local = tag_replacement;
}
}
/// Set the attributes that are replaced by another attribute, per HTML element.
///
/// Depending on the [`ListBehavior`], the given replacements are added to the replacements of
/// the current mode, or they replace them.
///
/// Attributes are replaced before attributes are removed, so the attribute to replace should
/// not be in the list of allowed attributes, but its replacement should. Attributes are also
/// replaced before elements are: to replace an attribute on an element that is itself set to
/// be replaced, use the name of the replaced element, not the name of its replacement.
///
/// # Parameters
///
/// * `attrs`: The attribute replacements per element.
/// * `behavior`: Whether the given list is added to or overrides the list of the current mode.
pub fn replace_attributes<'a>(
    mut self,
    attrs: impl IntoIterator<Item = ElementAttributesReplacement<'a>>,
    behavior: ListBehavior,
) -> Self {
    let replacements: HashMap<_, _> =
        attrs.into_iter().map(|replacement| replacement.to_tuple()).collect();
    self.replace_attrs = Some(List { content: replacements, behavior });
    self
}
fn node_action(&self, html: &Html, node_id: usize, depth: u32) -> NodeAction {
match &html.nodes[node_id].data {
NodeData::Element(ElementData { name, attrs, .. }) => {
let tag: &str = &name.local;
if (self.remove_reply_fallback && tag == RICH_REPLY_TAG)
|| self.max_depth.is_some_and(|max| depth >= max)
{
NodeAction::Remove
} else if self
.allowed_tags
.is_some_and(|allowed| tag != RICH_REPLY_TAG && !allowed.contains(tag))
{
NodeAction::Ignore
} else if let Some(allowed_schemes) = self.allowed_schemes {
for attr in attrs.iter() {
let value = &attr.value;
let attr: &str = &attr.name.local;
// Check if there is a (tag, attr) tuple entry.
if let Some(schemes) = allowed_schemes.get(&*format!("{tag}:{attr}")) {
// Check if the scheme is allowed.
if !schemes
.iter()
.any(|scheme| value.starts_with(&format!("{scheme}:")))
{
return NodeAction::Ignore;
}
}
}
NodeAction::None
} else {
NodeAction::None
}
}
NodeData::Text(_) => NodeAction::None,
_ => NodeAction::Remove,
}
/// Set the attributes to remove, per HTML element.
///
/// Removing has a higher priority than allowing: an attribute in this list is always removed.
///
/// # Parameters
///
/// * `attrs`: The attributes per element. `parent` is the name of the element and `properties`
///   contains the names of the attributes.
pub fn remove_attributes<'a>(
    mut self,
    attrs: impl IntoIterator<Item = PropertiesNames<'a>>,
) -> Self {
    let removed: HashMap<_, _> = attrs.into_iter().map(|names| names.to_tuple()).collect();
    self.remove_attrs = Some(removed);
    self
}
fn clean_element_attributes(&self, data: &mut ElementData) {
let ElementData { name, attrs } = data;
let tag: &str = &name.local;
/// Set the attributes that are allowed, per HTML element.
///
/// Depending on the [`ListBehavior`], the given attributes are added to the allowed attributes
/// of the current mode, or they replace them.
///
/// An attribute that is not allowed is removed. If no mode is set and no attributes are
/// explicitly allowed, all attributes are allowed.
///
/// # Parameters
///
/// * `attrs`: The attributes per element. `parent` is the name of the element and `properties`
///   contains the names of the attributes.
/// * `behavior`: Whether the given list is added to or overrides the list of the current mode.
pub fn allow_attributes<'a>(
    mut self,
    attrs: impl IntoIterator<Item = PropertiesNames<'a>>,
    behavior: ListBehavior,
) -> Self {
    let allowed: HashMap<_, _> = attrs.into_iter().map(|names| names.to_tuple()).collect();
    self.allow_attrs = Some(List { content: allowed, behavior });
    self
}
let actions: Vec<_> = attrs
.iter()
.filter_map(|attr| {
let value = &attr.value;
let name: &str = &attr.name.local;
/// Deny the given URI schemes per attribute per HTML element.
///
/// Denying schemes has a higher priority than allowing. So if a scheme is in
/// this list, it will always be denied.
///
/// If a scheme is denied, its element is removed, because it is deemed that the element will
/// not be usable without its URI.
///
/// # Parameters
///
/// * `schemes`: The list of schemes per attribute per element.
pub fn deny_schemes<'a>(
    mut self,
    schemes: impl IntoIterator<Item = ElementAttributesSchemes<'a>>,
) -> Self {
    // NOTE(review): the docs above say the element is "removed", while the docs of
    // `allow_schemes()` say an element with a denied scheme is "ignored" — confirm which one
    // the cleaning code implements and align the wording.
    self.deny_schemes = Some(schemes.into_iter().map(|s| s.to_tuple()).collect());
    self
}
if self
.allowed_attrs
.is_some_and(|m| !m.get(tag).is_some_and(|attrs| attrs.contains(name)))
{
return Some(AttributeAction::Remove(attr.to_owned()));
}
/// Change the list of allowed schemes per attribute per HTML element.
///
/// The given list is added to or replaces the list of allowed schemes of the current
/// mode, depending on the [`ListBehavior`].
///
/// If a scheme is not allowed, it is denied. If a scheme is denied, its element is ignored,
/// because it is deemed that the element will not be usable without its URI. If no mode is set
/// and no schemes are explicitly allowed, all schemes are allowed.
///
/// # Parameters
///
/// * `schemes`: The list of schemes per attribute per element.
/// * `behavior`: Whether the given list is added to or overrides the list of the current mode.
pub fn allow_schemes<'a>(
    mut self,
    schemes: impl IntoIterator<Item = ElementAttributesSchemes<'a>>,
    behavior: ListBehavior,
) -> Self {
    let content = schemes.into_iter().map(|s| s.to_tuple()).collect();
    self.allow_schemes = Some(List { content, behavior });
    self
}
if name == "class" {
if let Some(classes) = self.allowed_classes.and_then(|m| m.get(tag)) {
let mut changed = false;
let attr_classes = value.split_whitespace().filter(|attr_class| {
for class in classes.iter() {
if WildMatch::new(class).matches(attr_class) {
return true;
}
}
changed = true;
false
});
/// Remove the given classes per HTML element.
///
/// Removing classes has a higher priority than allowing. So if a class is in
/// this list, it will always be removed.
///
/// If all the classes of a `class` attribute are removed, the whole attribute is removed.
///
/// In the list of classes, the names must match the full class name. `*` can be used as a
/// wildcard for any number of characters. So `language` will only match a class named
/// `language`, and `language-*` will match any class name starting with `language-`.
///
/// # Parameters
///
/// * `classes`: The list of classes per element. The value of `parent` is the element name,
///   and `properties` contains classes.
pub fn remove_classes<'a>(
    mut self,
    classes: impl IntoIterator<Item = PropertiesNames<'a>>,
) -> Self {
    self.remove_classes = Some(classes.into_iter().map(|c| c.to_tuple()).collect());
    self
}
let folded_classes = attr_classes.fold(String::new(), |mut a, b| {
a.reserve(b.len() + 1);
a.push_str(b);
a.push('\n');
a
});
let final_classes = folded_classes.trim_end();
/// Change the list of allowed classes per HTML element.
///
/// The given list is added to or replaces the list of allowed classes of the current
/// mode, depending on the [`ListBehavior`].
///
/// If a class is not allowed, it is removed. If all the classes of a `class` attribute are
/// removed, the whole attribute is removed. If no mode is set and no classes are explicitly
/// allowed, all classes are allowed.
///
/// In the list of classes, the names must match the full class name. `*` can be used as a
/// wildcard for any number of characters. So `language` will only match a class named
/// `language`, and `language-*` will match any class name starting with `language-`.
///
/// # Parameters
///
/// * `classes`: The list of classes per element. The value of `parent` is the element name,
///   and `properties` contains classes.
/// * `behavior`: Whether the given list is added to or overrides the list of the current mode.
pub fn allow_classes<'a>(
    mut self,
    classes: impl IntoIterator<Item = PropertiesNames<'a>>,
    behavior: ListBehavior,
) -> Self {
    let content = classes.into_iter().map(|c| c.to_tuple()).collect();
    self.allow_classes = Some(List { content, behavior });
    self
}
if changed {
if final_classes.is_empty() {
return Some(AttributeAction::Remove(attr.to_owned()));
} else {
return Some(AttributeAction::ReplaceValue(
attr.to_owned(),
final_classes.to_owned().into(),
));
}
}
}
}
None
})
.collect();
for action in actions {
match action {
AttributeAction::ReplaceValue(attr, value) => {
if let Some(mut attr) = attrs.take(&attr) {
attr.value = value;
attrs.insert(attr);
}
}
AttributeAction::Remove(attr) => {
attrs.remove(&attr);
}
}
}
/// Set the maximum nesting level of HTML elements.
///
/// If a mode is set, this takes precedence over the maximum depth defined by the mode.
///
/// Every element that is deeper than the maximum depth is removed. If no mode is set and no
/// maximum depth is explicitly set, elements are not filtered by their nesting level.
///
/// # Parameters
///
/// * `depth`: The maximum nesting level allowed.
pub fn max_depth(mut self, depth: u32) -> Self {
    let limit = Some(depth);
    self.max_depth = limit;
    self
}
}
/// The possible actions to apply to an element node.
#[derive(Debug, PartialEq, Eq)]
enum NodeAction {
/// Don't do anything.
None,
/// A list with a behavior.
#[derive(Debug, Clone)]
struct List<T> {
/// The content of this list.
content: T,
/// Remove the element but keep its children.
Ignore,
/// Remove the element and its children.
Remove,
/// The behavior of this list.
behavior: ListBehavior,
}
/// The possible actions to apply to an attribute.
#[derive(Debug)]
enum AttributeAction {
/// Replace the value of the attribute.
ReplaceValue(Attribute, StrTendril),
/// Remove the attribute.
Remove(Attribute),
impl<T> List<T> {
    /// Returns `true` if the behavior of this list is [`ListBehavior::Override`].
    fn is_override(&self) -> bool {
        matches!(self.behavior, ListBehavior::Override)
    }
}
/// List of HTML tags allowed in the Matrix specification, without the rich reply fallback tag.
static ALLOWED_TAGS_WITHOUT_REPLY_STRICT: Set<&str> = phf_set! {
"del", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "p", "a",
"ul", "ol", "sup", "sub", "li", "b", "i", "u", "strong", "em", "s",
"code", "hr", "br", "div", "table", "thead", "tbody", "tr", "th", "td",
"caption", "pre", "span", "img", "details", "summary",
};
/// The behavior of the setting.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(clippy::exhaustive_enums)]
pub enum ListBehavior {
/// The list replaces the default list of the current mode, if one is set.
///
/// If no mode is set, this is the full allow list.
Override,
/// The HTML tag name for a rich reply fallback.
const RICH_REPLY_TAG: &str = "mx-reply";
/// The list is added to the default list of the current mode, if one is set.
///
/// If no mode is set, this is the full allow list.
Add,
}
/// HTML tags that were allowed in the Matrix specification, with their replacement.
static DEPRECATED_TAGS: Map<&str, &str> = phf_map! {
"font" => "span",
"strike" => "s",
};
/// The replacement of a name.
///
/// This is used for element names with [`SanitizerConfig::replace_elements()`], and for
/// attribute names as part of an [`ElementAttributesReplacement`].
#[derive(Debug, Clone, Copy)]
#[allow(clippy::exhaustive_structs)]
pub struct NameReplacement {
/// The name to replace.
pub old: &'static str,
/// The name of the replacement.
pub new: &'static str,
}
/// Allowed attributes per HTML tag according to the Matrix specification.
static ALLOWED_ATTRIBUTES_STRICT: Map<&str, &Set<&str>> = phf_map! {
"span" => &ALLOWED_ATTRIBUTES_SPAN_STRICT,
"a" => &ALLOWED_ATTRIBUTES_A_STRICT,
"img" => &ALLOWED_ATTRIBUTES_IMG_STRICT,
"ol" => &ALLOWED_ATTRIBUTES_OL_STRICT,
"code" => &ALLOWED_ATTRIBUTES_CODE_STRICT,
};
static ALLOWED_ATTRIBUTES_SPAN_STRICT: Set<&str> =
phf_set! { "data-mx-bg-color", "data-mx-color", "data-mx-spoiler" };
static ALLOWED_ATTRIBUTES_A_STRICT: Set<&str> = phf_set! { "name", "target", "href" };
static ALLOWED_ATTRIBUTES_IMG_STRICT: Set<&str> =
phf_set! { "width", "height", "alt", "title", "src" };
static ALLOWED_ATTRIBUTES_OL_STRICT: Set<&str> = phf_set! { "start" };
static ALLOWED_ATTRIBUTES_CODE_STRICT: Set<&str> = phf_set! { "class" };
impl NameReplacement {
    /// Converts this replacement into an `(old, new)` tuple.
    fn to_tuple(self) -> (&'static str, &'static str) {
        let Self { old, new } = self;
        (old, new)
    }
}
/// Attributes that were allowed on HTML tags according to the Matrix specification, with their
/// replacement.
static DEPRECATED_ATTRS: Map<&str, &Map<&str, &str>> = phf_map! {
"font" => &DEPRECATED_ATTRIBUTES_FONT,
};
static DEPRECATED_ATTRIBUTES_FONT: Map<&str, &str> = phf_map! { "color" => "data-mx-color" };
/// A list of properties names for a parent.
///
/// What the parent and the properties are depends on where this is used: for lists of
/// attributes or classes, the parent is an element name; inside an
/// [`ElementAttributesSchemes`], the parent is an attribute name and the properties are URI
/// schemes.
#[allow(clippy::exhaustive_structs)]
#[derive(Debug, Clone, Copy)]
pub struct PropertiesNames<'a> {
/// The name of the parent.
pub parent: &'static str,
/// The list of properties names.
pub properties: &'a [&'static str],
}
/// Allowed schemes of URIs per HTML tag and attribute tuple according to the Matrix specification.
static ALLOWED_SCHEMES_STRICT: Map<&str, &Set<&str>> = phf_map! {
"a:href" => &ALLOWED_SCHEMES_A_HREF_STRICT,
"img:src" => &ALLOWED_SCHEMES_IMG_SRC_STRICT,
};
static ALLOWED_SCHEMES_A_HREF_STRICT: Set<&str> =
phf_set! { "http", "https", "ftp", "mailto", "magnet" };
static ALLOWED_SCHEMES_IMG_SRC_STRICT: Set<&str> = phf_set! { "mxc" };
impl<'a> PropertiesNames<'a> {
    /// Converts this into a tuple of the parent name and the set of its properties names.
    fn to_tuple(self) -> (&'static str, HashSet<&'static str>) {
        let Self { parent, properties } = self;
        (parent, properties.iter().copied().collect())
    }
}
/// Extra allowed schemes of URIs per HTML tag and attribute tuple.
///
/// This is a convenience list to add schemes that can be encountered but are not listed in the
/// Matrix specification. It consists of:
///
/// * The `matrix` scheme for `a` tags (see [matrix-org/matrix-spec#1108]).
///
/// To get a complete list, add these to `ALLOWED_SCHEMES_STRICT`.
///
/// [matrix-org/matrix-spec#1108]: https://github.com/matrix-org/matrix-spec/issues/1108
static ALLOWED_SCHEMES_COMPAT: Map<&str, &Set<&str>> = phf_map! {
"a:href" => &ALLOWED_SCHEMES_A_HREF_COMPAT,
"img:src" => &ALLOWED_SCHEMES_IMG_SRC_STRICT,
};
pub(crate) static ALLOWED_SCHEMES_A_HREF_COMPAT: Set<&str> =
phf_set! { "http", "https", "ftp", "mailto", "magnet", "matrix" };
/// The replacement of an element's attributes.
///
/// This is the item type accepted by [`SanitizerConfig::replace_attributes()`].
#[allow(clippy::exhaustive_structs)]
#[derive(Debug, Clone, Copy)]
pub struct ElementAttributesReplacement<'a> {
/// The name of the element.
pub element: &'static str,
/// The list of attributes replacements.
///
/// For each replacement, `old` is the name of the attribute to replace and `new` is the name
/// of its replacement.
pub replacements: &'a [NameReplacement],
}
/// Allowed classes per HTML tag according to the Matrix specification.
static ALLOWED_CLASSES_STRICT: Map<&str, &Set<&str>> =
phf_map! { "code" => &ALLOWED_CLASSES_CODE_STRICT };
static ALLOWED_CLASSES_CODE_STRICT: Set<&str> = phf_set! { "language-*" };
impl<'a> ElementAttributesReplacement<'a> {
fn to_tuple(self) -> (&'static str, HashMap<&'static str, &'static str>) {
let map = self.replacements.iter().map(|r| r.to_tuple()).collect();
(self.element, map)
}
}
/// Max depth of nested HTML tags allowed by the Matrix specification.
const MAX_DEPTH_STRICT: u32 = 100;
/// An element's attributes' URI schemes.
///
/// This is the item type accepted by both [`SanitizerConfig::allow_schemes()`] and
/// [`SanitizerConfig::deny_schemes()`], so whether the schemes are allowed or denied depends
/// on the method it is passed to.
#[allow(clippy::exhaustive_structs)]
#[derive(Debug, Clone, Copy)]
pub struct ElementAttributesSchemes<'a> {
/// The name of the element.
pub element: &'static str,
/// The list of URI schemes per attribute name.
///
/// The value of the `parent` is the attribute name and the properties are schemes.
pub attr_schemes: &'a [PropertiesNames<'a>],
}
impl<'a> ElementAttributesSchemes<'a> {
    /// Converts this into a tuple of the element name and the map of schemes per attribute
    /// name.
    fn to_tuple(self) -> (&'static str, HashMap<&'static str, HashSet<&'static str>>) {
        let Self { element, attr_schemes } = self;
        let schemes_per_attr =
            attr_schemes.iter().copied().map(|names| names.to_tuple()).collect();
        (element, schemes_per_attr)
    }
}

View File

@ -0,0 +1,466 @@
use html5ever::{tendril::StrTendril, Attribute, LocalName};
use phf::{phf_map, phf_set, Map, Set};
use wildmatch::WildMatch;
use crate::{ElementData, Html, HtmlSanitizerMode, NodeData, SanitizerConfig};
/// HTML elements allowed in the Matrix specification.
///
/// This list includes `mx-reply`, the element of the rich reply fallback.
static ALLOWED_ELEMENTS_STRICT: Set<&str> = phf_set! {
"del", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "p", "a",
"ul", "ol", "sup", "sub", "li", "b", "i", "u", "strong", "em", "s",
"code", "hr", "br", "div", "table", "thead", "tbody", "tr", "th", "td",
"caption", "pre", "span", "img", "details", "summary", "mx-reply",
};
/// The HTML element name for a rich reply fallback.
const RICH_REPLY_ELEMENT_NAME: &str = "mx-reply";
/// HTML elements that were previously allowed in the Matrix specification, with their replacement.
///
/// The key is the name of the deprecated element and the value is the name of its replacement.
static DEPRECATED_ELEMENTS: Map<&str, &str> = phf_map! {
"font" => "span",
"strike" => "s",
};
/// Allowed attributes per HTML element according to the Matrix specification.
///
/// The key is the element name and the value is the set of attribute names allowed on it.
static ALLOWED_ATTRIBUTES_STRICT: Map<&str, &Set<&str>> = phf_map! {
"span" => &ALLOWED_ATTRIBUTES_SPAN_STRICT,
"a" => &ALLOWED_ATTRIBUTES_A_STRICT,
"img" => &ALLOWED_ATTRIBUTES_IMG_STRICT,
"ol" => &ALLOWED_ATTRIBUTES_OL_STRICT,
"code" => &ALLOWED_ATTRIBUTES_CODE_STRICT,
};
/// Attributes allowed on `span` elements.
static ALLOWED_ATTRIBUTES_SPAN_STRICT: Set<&str> =
phf_set! { "data-mx-bg-color", "data-mx-color", "data-mx-spoiler" };
/// Attributes allowed on `a` elements.
static ALLOWED_ATTRIBUTES_A_STRICT: Set<&str> = phf_set! { "name", "target", "href" };
/// Attributes allowed on `img` elements.
static ALLOWED_ATTRIBUTES_IMG_STRICT: Set<&str> =
phf_set! { "width", "height", "alt", "title", "src" };
/// Attributes allowed on `ol` elements.
static ALLOWED_ATTRIBUTES_OL_STRICT: Set<&str> = phf_set! { "start" };
/// Attributes allowed on `code` elements.
static ALLOWED_ATTRIBUTES_CODE_STRICT: Set<&str> = phf_set! { "class" };
/// Attributes that were previously allowed on HTML elements according to the Matrix specification,
/// with their replacement.
///
/// The key is the element name; the value maps a deprecated attribute name to its new name.
static DEPRECATED_ATTRS: Map<&str, &Map<&str, &str>> = phf_map! {
"font" => &DEPRECATED_ATTRIBUTES_FONT,
};
/// Deprecated attributes of the `font` element, with their replacement.
static DEPRECATED_ATTRIBUTES_FONT: Map<&str, &str> = phf_map! { "color" => "data-mx-color" };
/// Allowed schemes of URIs per attribute per HTML element according to the Matrix specification.
///
/// The outer key is the element name, the inner key is the attribute name and the value is the
/// set of allowed schemes, without the trailing `:`.
static ALLOWED_SCHEMES_STRICT: Map<&str, &Map<&str, &Set<&str>>> = phf_map! {
"a" => &ALLOWED_SCHEMES_A_STRICT,
"img" => &ALLOWED_SCHEMES_IMG_STRICT,
};
/// Allowed schemes per attribute of `a` elements.
static ALLOWED_SCHEMES_A_STRICT: Map<&str, &Set<&str>> = phf_map! {
"href" => &ALLOWED_SCHEMES_A_HREF_STRICT,
};
/// Allowed schemes for the `href` attribute of `a` elements.
pub(crate) static ALLOWED_SCHEMES_A_HREF_STRICT: Set<&str> =
phf_set! { "http", "https", "ftp", "mailto", "magnet" };
/// Allowed schemes per attribute of `img` elements.
static ALLOWED_SCHEMES_IMG_STRICT: Map<&str, &Set<&str>> = phf_map! {
"src" => &ALLOWED_SCHEMES_IMG_SRC_STRICT,
};
/// Allowed schemes for the `src` attribute of `img` elements.
static ALLOWED_SCHEMES_IMG_SRC_STRICT: Set<&str> = phf_set! { "mxc" };
/// Extra allowed schemes of URIs per attribute per HTML element.
///
/// This is a convenience list to add schemes that can be encountered but are not listed in the
/// Matrix specification. It consists of:
///
/// * The `matrix` scheme for `a` elements (see [matrix-org/matrix-spec#1108]).
///
/// To get a complete list, add these to `ALLOWED_SCHEMES_STRICT`.
///
/// [matrix-org/matrix-spec#1108]: https://github.com/matrix-org/matrix-spec/issues/1108
static ALLOWED_SCHEMES_COMPAT: Map<&str, &Map<&str, &Set<&str>>> = phf_map! {
"a" => &ALLOWED_SCHEMES_A_COMPAT,
};
/// Extra allowed schemes per attribute of `a` elements.
static ALLOWED_SCHEMES_A_COMPAT: Map<&str, &Set<&str>> = phf_map! {
"href" => &ALLOWED_SCHEMES_A_HREF_COMPAT,
};
/// Extra allowed schemes for the `href` attribute of `a` elements.
pub(crate) static ALLOWED_SCHEMES_A_HREF_COMPAT: Set<&str> = phf_set! { "matrix" };
/// Allowed classes per HTML element according to the Matrix specification.
///
/// The key is the element name and the value is the set of class patterns allowed on it.
static ALLOWED_CLASSES_STRICT: Map<&str, &Set<&str>> =
phf_map! { "code" => &ALLOWED_CLASSES_CODE_STRICT };
/// Class patterns allowed on `code` elements.
///
/// Patterns are matched against each class with `WildMatch`.
static ALLOWED_CLASSES_CODE_STRICT: Set<&str> = phf_set! { "language-*" };
/// Max depth of nested HTML elements allowed by the Matrix specification.
///
/// Used as the maximum depth when a mode is set and the config has no explicit `max_depth`.
const MAX_DEPTH_STRICT: u32 = 100;
impl SanitizerConfig {
/// Whether the current mode uses the values of the strict mode.
///
/// This is `true` as soon as a mode is set, whichever mode it is.
fn use_strict(&self) -> bool {
self.mode.is_some()
}
/// Whether the current mode uses the values of the compat mode.
///
/// This is `true` only when the mode is set to `HtmlSanitizerMode::Compat`.
fn use_compat(&self) -> bool {
    self.mode == Some(HtmlSanitizerMode::Compat)
}
/// The maximum nesting level allowed by the config.
///
/// An explicit `max_depth` wins; otherwise `MAX_DEPTH_STRICT` applies when a mode is set, and
/// there is no limit when no mode is set.
fn max_depth_value(&self) -> Option<u32> {
    match self.max_depth {
        Some(explicit) => Some(explicit),
        None if self.use_strict() => Some(MAX_DEPTH_STRICT),
        None => None,
    }
}
/// Clean the given HTML with this sanitizer.
///
/// Every child of the root is cleaned in turn, starting at depth `0`.
pub(crate) fn clean(&self, html: &mut Html) {
    let mut cursor = html.root().first_child;

    while let Some(current) = cursor {
        // Read the sibling link before cleaning, as cleaning can detach `current`.
        cursor = html.nodes[current].next_sibling;
        self.clean_node(html, current, 0);
    }
}
/// Clean the node `node_id`, located at nesting level `depth`, and its descendants.
///
/// Replacements are applied first, then the node is either kept (and its attributes cleaned),
/// ignored (detached, with its children moved in its place) or removed with its children.
fn clean_node(&self, html: &mut Html, node_id: usize, depth: u32) {
    self.apply_replacements(html, node_id);

    let action = self.node_action(html, node_id, depth);
    let hoist_children = action == NodeAction::Ignore;

    // The children of a removed node disappear with it, no need to visit them.
    if action != NodeAction::Remove {
        let mut cursor = html.nodes[node_id].first_child;

        while let Some(child_id) = cursor {
            // Read the sibling link before the child is moved or detached.
            cursor = html.nodes[child_id].next_sibling;

            if hoist_children {
                html.insert_before(node_id, child_id);
            }

            self.clean_node(html, child_id, depth + 1);
        }
    }

    match action {
        NodeAction::Ignore | NodeAction::Remove => {
            html.detach(node_id);
        }
        NodeAction::None => {
            if let Some(data) = html.nodes[node_id].as_element_mut() {
                self.clean_element_attributes(data);
            }
        }
    }
}
/// Rename the attributes and the name of the element `node_id` according to the config.
///
/// Replacements from the config's lists take precedence. The deprecated attributes and elements
/// of the Matrix specification are used as a fallback when a mode is set and the corresponding
/// list does not override them. Nodes that are not elements are left untouched.
fn apply_replacements(&self, html: &mut Html, node_id: usize) {
    let NodeData::Element(ElementData { name, attrs, .. }) = &mut html.nodes[node_id].data else {
        return;
    };
    let element_name = name.local.as_ref();

    // Replace attributes.
    let list_replacements =
        self.replace_attrs.as_ref().and_then(|list| list.content.get(element_name));
    let list_is_override = self.replace_attrs.as_ref().is_some_and(|list| list.is_override());
    let mode_replacements = (!list_is_override && self.use_strict())
        .then(|| DEPRECATED_ATTRS.get(element_name))
        .flatten();

    if list_replacements.is_some() || mode_replacements.is_some() {
        // Move the attributes out instead of cloning the whole set: the renamed attributes are
        // collected into a fresh set that replaces it right away.
        *attrs = std::mem::take(attrs)
            .into_iter()
            .map(|mut attr| {
                let attr_name = attr.name.local.as_ref();

                let attr_replacement = list_replacements
                    .and_then(|s| s.get(attr_name))
                    .or_else(|| mode_replacements.and_then(|s| s.get(attr_name)))
                    .copied();

                if let Some(attr_replacement) = attr_replacement {
                    attr.name.local = LocalName::from(attr_replacement);
                }

                attr
            })
            .collect();
    }

    // Replace element. This is done last so the attribute lookups above use the original name.
    let element_replacement = self
        .replace_elements
        .as_ref()
        .and_then(|list| list.content.get(element_name))
        .copied()
        .or_else(|| {
            let list_is_override =
                self.replace_elements.as_ref().is_some_and(|list| list.is_override());

            (!list_is_override && self.use_strict())
                .then(|| DEPRECATED_ELEMENTS.get(element_name))
                .flatten()
                .copied()
        });

    if let Some(element_replacement) = element_replacement {
        name.local = LocalName::from(element_replacement);
    }
}
/// Decide what to do with the node `node_id`, located at nesting level `depth`.
///
/// Text nodes are always kept and nodes that are neither text nor elements are always removed.
/// For elements, the checks are applied in this order:
///
/// 1. Removal: the element is in `remove_elements`, is the rich reply fallback while
///    `remove_reply_fallback` is set, or is nested at or beyond the maximum depth.
/// 2. Ignoring: the element is in `ignore_elements`, is not part of the allowed elements, or
///    one of its attributes uses a denied scheme or a scheme that is not allowed.
/// 3. Otherwise the element is kept.
fn node_action(&self, html: &Html, node_id: usize, depth: u32) -> NodeAction {
    let (name, attrs) = match &html.nodes[node_id].data {
        NodeData::Element(ElementData { name, attrs, .. }) => (name, attrs),
        NodeData::Text(_) => return NodeAction::None,
        _ => return NodeAction::Remove,
    };
    let element_name = name.local.as_ref();

    // Check if element should be removed.
    if self.remove_elements.as_ref().is_some_and(|set| set.contains(element_name)) {
        return NodeAction::Remove;
    }
    if self.remove_reply_fallback && element_name == RICH_REPLY_ELEMENT_NAME {
        return NodeAction::Remove;
    }
    if self.max_depth_value().is_some_and(|max| depth >= max) {
        return NodeAction::Remove;
    }

    // Check if element should be ignored.
    if self.ignore_elements.as_ref().is_some_and(|set| set.contains(element_name)) {
        return NodeAction::Ignore;
    }

    // Check if element should be allowed.
    if self.allow_elements.is_some() || self.use_strict() {
        let list_allowed = self
            .allow_elements
            .as_ref()
            .is_some_and(|list| list.content.contains(element_name));
        let list_is_override =
            self.allow_elements.as_ref().is_some_and(|list| list.is_override());
        let mode_allowed = !list_is_override
            && self.use_strict()
            && ALLOWED_ELEMENTS_STRICT.contains(element_name);

        if !list_allowed && !mode_allowed {
            return NodeAction::Ignore;
        }
    }

    // Check if element contains scheme that should be denied.
    if let Some(deny_schemes) = self.deny_schemes.as_ref().and_then(|map| map.get(element_name))
    {
        for attr in attrs.iter() {
            let value = &attr.value;
            let attr_name = attr.name.local.as_ref();

            if let Some(schemes) = deny_schemes.get(attr_name) {
                // Check if the scheme is denied.
                if schemes.iter().any(|scheme| value.starts_with(&format!("{scheme}:"))) {
                    return NodeAction::Ignore;
                }
            }
        }
    }

    if self.allow_schemes.is_none() && !self.use_strict() {
        // All schemes are allowed.
        return NodeAction::None;
    }

    // Check if element contains scheme that should be allowed.
    let list_element_schemes =
        self.allow_schemes.as_ref().and_then(|list| list.content.get(element_name));
    let list_is_override = self.allow_schemes.as_ref().is_some_and(|list| list.is_override());
    let strict_mode_element_schemes = (!list_is_override && self.use_strict())
        .then(|| ALLOWED_SCHEMES_STRICT.get(element_name))
        .flatten();
    let compat_mode_element_schemes = (!list_is_override && self.use_compat())
        .then(|| ALLOWED_SCHEMES_COMPAT.get(element_name))
        .flatten();

    if list_element_schemes.is_none()
        && strict_mode_element_schemes.is_none()
        && compat_mode_element_schemes.is_none()
    {
        // We don't check schemes for this element.
        return NodeAction::None;
    }

    for attr in attrs.iter() {
        let value = &attr.value;
        let attr_name = attr.name.local.as_ref();

        let list_attr_schemes = list_element_schemes.and_then(|map| map.get(attr_name));
        let strict_mode_attr_schemes =
            strict_mode_element_schemes.and_then(|map| map.get(attr_name));
        let compat_mode_attr_schemes =
            compat_mode_element_schemes.and_then(|map| map.get(attr_name));

        if list_attr_schemes.is_none()
            && strict_mode_attr_schemes.is_none()
            && compat_mode_attr_schemes.is_none()
        {
            // We don't check schemes for this attribute. Move on to the next one rather than
            // returning: the remaining attributes of the element must still be checked,
            // otherwise an unchecked attribute would shield e.g. a forbidden `href` or `src`.
            continue;
        }

        let mut allowed_schemes = list_attr_schemes
            .into_iter()
            .flatten()
            .chain(strict_mode_attr_schemes.map(|set| set.iter()).into_iter().flatten())
            .chain(compat_mode_attr_schemes.map(|set| set.iter()).into_iter().flatten());

        // Check if the scheme is allowed.
        if !allowed_schemes.any(|scheme| value.starts_with(&format!("{scheme}:"))) {
            return NodeAction::Ignore;
        }
    }

    NodeAction::None
}
/// Remove the attributes and classes of the given element that the config does not keep.
///
/// For each attribute, in order:
///
/// 1. It is removed if it is listed in `remove_attrs` for this element.
/// 2. If attributes are whitelisted (a list of allowed attributes or a mode is set), it is
///    removed unless the list or the mode allows it.
/// 3. If it is the `class` attribute, the classes matching `remove_classes` are dropped, then,
///    if classes are whitelisted, only the classes matching an allowed pattern are kept. The
///    attribute is removed if no class is left.
///
/// Class patterns are matched with `WildMatch`.
fn clean_element_attributes(&self, data: &mut ElementData) {
    let ElementData { name, attrs } = data;
    let element_name = name.local.as_ref();

    let list_remove_attrs = self.remove_attrs.as_ref().and_then(|map| map.get(element_name));

    let whitelist_attrs = self.allow_attrs.is_some() || self.use_strict();
    let list_allow_attrs =
        self.allow_attrs.as_ref().and_then(|list| list.content.get(element_name));
    let attrs_list_is_override =
        self.allow_attrs.as_ref().is_some_and(|list| list.is_override());
    let mode_allow_attrs = (!attrs_list_is_override && self.use_strict())
        .then(|| ALLOWED_ATTRIBUTES_STRICT.get(element_name))
        .flatten();

    let list_remove_classes =
        self.remove_classes.as_ref().and_then(|map| map.get(element_name));

    let whitelist_classes = self.allow_classes.is_some() || self.use_strict();
    let list_allow_classes =
        self.allow_classes.as_ref().and_then(|list| list.content.get(element_name));
    let classes_list_is_override =
        self.allow_classes.as_ref().is_some_and(|list| list.is_override());
    let mode_allow_classes = (!classes_list_is_override && self.use_strict())
        .then(|| ALLOWED_CLASSES_STRICT.get(element_name))
        .flatten();

    // The changes are collected first and applied afterwards, because the set of attributes
    // cannot be modified while it is iterated.
    let actions: Vec<_> = attrs
        .iter()
        .filter_map(|attr| {
            let value = &attr.value;
            let attr_name = attr.name.local.as_ref();

            // Check if the attribute should be removed.
            if list_remove_attrs.is_some_and(|set| set.contains(attr_name)) {
                return Some(AttributeAction::Remove(attr.to_owned()));
            }

            // Check if the attribute is allowed.
            if whitelist_attrs {
                let list_allowed = list_allow_attrs.is_some_and(|set| set.contains(attr_name));
                let mode_allowed = mode_allow_attrs.is_some_and(|set| set.contains(attr_name));

                if !list_allowed && !mode_allowed {
                    return Some(AttributeAction::Remove(attr.to_owned()));
                }
            }

            // Only the classes are left to filter.
            if attr_name != "class" {
                return None;
            }

            let mut classes = value.split_whitespace().collect::<Vec<_>>();
            let initial_len = classes.len();

            // Process classes to remove. The patterns are compiled once rather than once per
            // class.
            if let Some(remove_classes) = list_remove_classes {
                let remove_patterns: Vec<_> =
                    remove_classes.iter().map(|pattern| WildMatch::new(pattern)).collect();

                classes.retain(|class| {
                    !remove_patterns.iter().any(|pattern| pattern.matches(class))
                });
            }

            // Process classes to allow. The patterns are compiled once rather than once per
            // class.
            if whitelist_classes {
                let allow_patterns: Vec<_> = list_allow_classes
                    .map(|set| set.iter())
                    .into_iter()
                    .flatten()
                    .chain(mode_allow_classes.map(|set| set.iter()).into_iter().flatten())
                    .map(|pattern| WildMatch::new(pattern))
                    .collect();

                classes
                    .retain(|class| allow_patterns.iter().any(|pattern| pattern.matches(class)));
            }

            if classes.len() == initial_len {
                // The list has not changed, no action necessary.
                return None;
            }

            if classes.is_empty() {
                Some(AttributeAction::Remove(attr.to_owned()))
            } else {
                let new_class = classes.join(" ");
                Some(AttributeAction::ReplaceValue(attr.to_owned(), new_class.into()))
            }
        })
        .collect();

    for action in actions {
        match action {
            AttributeAction::ReplaceValue(attr, value) => {
                // Take the attribute out of the set to change its value, then put it back.
                if let Some(mut attr) = attrs.take(&attr) {
                    attr.value = value;
                    attrs.insert(attr);
                }
            }
            AttributeAction::Remove(attr) => {
                attrs.remove(&attr);
            }
        }
    }
}
}
/// The possible actions to apply to a node.
///
/// This is the result of `SanitizerConfig::node_action()`, applied by
/// `SanitizerConfig::clean_node()`.
#[derive(Debug, PartialEq, Eq)]
enum NodeAction {
/// Don't do anything.
///
/// The node is kept; if it is an element, its attributes are still cleaned.
None,
/// Remove the element but keep its children.
///
/// The children are moved before the element and cleaned in turn.
Ignore,
/// Remove the element and its children.
Remove,
}
/// The possible actions to apply to an attribute.
///
/// Each variant carries a copy of the attribute as found in the element, which is used to look
/// it up in the element's attributes.
#[derive(Debug)]
enum AttributeAction {
/// Replace the value of the attribute.
///
/// The second field is the new value.
ReplaceValue(Attribute, StrTendril),
/// Remove the attribute.
Remove(Attribute),
}

View File

@ -1,7 +1,10 @@
use ruma_html::{Html, SanitizerConfig};
use ruma_html::{
ElementAttributesReplacement, ElementAttributesSchemes, Html, ListBehavior, NameReplacement,
PropertiesNames, SanitizerConfig,
};
#[test]
fn valid_input() {
fn strict_mode_valid_input() {
let config = SanitizerConfig::strict().remove_reply_fallback();
let mut html = Html::parse(
"\
@ -25,7 +28,7 @@ fn valid_input() {
}
#[test]
fn tags_remove() {
fn strict_mode_elements_remove() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
"\
@ -61,7 +64,7 @@ fn tags_remove() {
}
#[test]
fn tags_remove_without_reply() {
fn strict_mode_elements_reply_remove() {
let config = SanitizerConfig::strict().remove_reply_fallback();
let mut html = Html::parse(
"\
@ -89,7 +92,7 @@ fn tags_remove_without_reply() {
}
#[test]
fn tags_remove_only_reply_fallback() {
fn remove_only_reply_fallback() {
let config = SanitizerConfig::new().remove_reply_fallback();
let mut html = Html::parse(
"\
@ -117,7 +120,7 @@ fn tags_remove_only_reply_fallback() {
}
#[test]
fn attrs_remove() {
fn strict_mode_attrs_remove() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
"\
@ -137,7 +140,7 @@ fn attrs_remove() {
}
#[test]
fn img_remove_scheme() {
fn strict_mode_img_remove_scheme() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
"\
@ -151,7 +154,7 @@ fn img_remove_scheme() {
}
#[test]
fn link_remove_scheme() {
fn strict_mode_link_remove_scheme() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
"\
@ -169,7 +172,7 @@ fn link_remove_scheme() {
}
#[test]
fn link_compat_scheme() {
fn compat_mode_link_remove_scheme() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
"\
@ -204,7 +207,7 @@ fn link_compat_scheme() {
}
#[test]
fn class_remove() {
fn strict_mode_class_remove() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
"\
@ -228,7 +231,7 @@ fn class_remove() {
}
#[test]
fn depth_remove() {
fn strict_mode_depth_remove() {
let config = SanitizerConfig::strict();
let deeply_nested_html: String = std::iter::repeat("<div>")
.take(100)
@ -248,7 +251,7 @@ fn depth_remove() {
}
#[test]
fn replace_deprecated() {
fn strict_mode_replace_deprecated() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
"\
@ -264,3 +267,583 @@ fn replace_deprecated() {
"
);
}
/// Without a mode, only the listed elements are kept; the others are replaced by their
/// children.
#[test]
fn allow_elements() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph with some color</p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        &lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;\
        ";

    let sanitizer =
        SanitizerConfig::new().allow_elements(["ul", "li", "p", "img"], ListBehavior::Add);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an overriding list of allowed elements replaces the list of the mode.
#[test]
fn override_elements() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph with some color</p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        &lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;\
        ";

    let sanitizer = SanitizerConfig::strict()
        .allow_elements(["ul", "li", "p", "img"], ListBehavior::Override);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an added list of allowed elements extends the list of the mode.
#[test]
fn add_elements() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        <keep-me>I was kept!</keep-me>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        <keep-me>I was kept!</keep-me>\
        ";

    let sanitizer = SanitizerConfig::strict().allow_elements(["keep-me"], ListBehavior::Add);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Elements to remove disappear along with their children.
#[test]
fn remove_elements() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph </p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        ";

    let sanitizer = SanitizerConfig::strict().remove_elements(["span", "code"]);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Elements to ignore are replaced by their children.
#[test]
fn ignore_elements() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph with some color</p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        &lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;\
        ";

    let sanitizer = SanitizerConfig::new().ignore_elements(["span", "code"]);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Without a mode, listed elements are renamed and everything else is left as is.
#[test]
fn replace_elements() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ol><li>This</li><li>has</li><li>no</li><li>tag</li></ol>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::new()
        .replace_elements([NameReplacement { old: "ul", new: "ol" }], ListBehavior::Add);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an overriding list of element replacements disables the replacements of the
/// mode: `strike` is not renamed, so it is not allowed and only its content is kept.
#[test]
fn replace_elements_override() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        <strike>This is wrong</strike>\
        ";
    let expected = "\
        <ol><li>This</li><li>has</li><li>no</li><li>tag</li></ol>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        This is wrong\
        ";

    let sanitizer = SanitizerConfig::strict()
        .replace_elements([NameReplacement { old: "ul", new: "ol" }], ListBehavior::Override);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an added list of element replacements is used along with the replacements
/// of the mode: both `ul` and `strike` are renamed.
#[test]
fn replace_elements_add() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        <strike>This is wrong</strike>\
        ";
    let expected = "\
        <ol><li>This</li><li>has</li><li>no</li><li>tag</li></ol>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        <s>This is wrong</s>\
        ";

    let sanitizer = SanitizerConfig::strict()
        .replace_elements([NameReplacement { old: "ul", new: "ol" }], ListBehavior::Add);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Without a mode, only the listed attributes are kept.
#[test]
fn allow_attributes() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span>with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code>&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::new().allow_attributes(
        [PropertiesNames { parent: "img", properties: &["src"] }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an overriding list of allowed attributes replaces the list of the mode.
#[test]
fn override_attributes() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span>with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code>&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict().allow_attributes(
        [PropertiesNames { parent: "img", properties: &["src"] }],
        ListBehavior::Override,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an added list of allowed attributes extends the list of the mode.
#[test]
fn add_attributes() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img id=\"my_image\" src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img id=\"my_image\" src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict().allow_attributes(
        [PropertiesNames { parent: "img", properties: &["id"] }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Attributes to remove are dropped even if the mode allows them.
#[test]
fn remove_attributes() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span>with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict()
        .remove_attributes([PropertiesNames { parent: "span", properties: &["data-mx-color"] }]);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Without a mode, listed attributes are renamed and everything else is left as is.
#[test]
fn replace_attributes() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-bg-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::new().replace_attributes(
        [ElementAttributesReplacement {
            element: "span",
            replacements: &[NameReplacement { old: "data-mx-color", new: "data-mx-bg-color" }],
        }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an overriding list of attribute replacements is used instead of the
/// replacements of the mode, while the `font` element is still renamed by the mode.
#[test]
fn replace_attributes_override() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <font color=\"green\">with some color</font></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-bg-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict().replace_attributes(
        [ElementAttributesReplacement {
            element: "font",
            replacements: &[NameReplacement { old: "color", new: "data-mx-bg-color" }],
        }],
        ListBehavior::Override,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an added list of attribute replacements is used along with the replacements
/// of the mode: both `alt` on `img` and `color` on `font` are renamed.
#[test]
fn replace_attributes_add() {
    let input = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <font color=\"green\">with some color</font></p>\
        <img alt=\"An image\" src=\"mxc://notareal.hs/abcdef\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
        <p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
        <img src=\"mxc://notareal.hs/abcdef\" title=\"An image\">\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict().replace_attributes(
        [ElementAttributesReplacement {
            element: "img",
            replacements: &[NameReplacement { old: "alt", new: "title" }],
        }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Without a mode, only the listed schemes are accepted for the listed attribute.
#[test]
fn allow_schemes() {
    let input = "\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <img src=\"https://notareal.hs/abcdef.png\">\
        ";
    let expected = "\
        <img src=\"mxc://notareal.hs/abcdef\">\
        ";

    let sanitizer = SanitizerConfig::new().allow_schemes(
        [ElementAttributesSchemes {
            element: "img",
            attr_schemes: &[PropertiesNames { parent: "src", properties: &["mxc"] }],
        }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an overriding list of allowed schemes replaces the list of the mode.
#[test]
fn override_schemes() {
    let input = "\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <img src=\"https://notareal.hs/abcdef.png\">\
        ";
    let expected = "\
        <img src=\"https://notareal.hs/abcdef.png\">\
        ";

    let sanitizer = SanitizerConfig::strict().allow_schemes(
        [ElementAttributesSchemes {
            element: "img",
            attr_schemes: &[PropertiesNames { parent: "src", properties: &["https"] }],
        }],
        ListBehavior::Override,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an added list of allowed schemes extends the list of the mode.
#[test]
fn add_schemes() {
    let input = "\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <img src=\"https://notareal.hs/abcdef.png\">\
        ";
    let expected = "\
        <img src=\"mxc://notareal.hs/abcdef\">\
        <img src=\"https://notareal.hs/abcdef.png\">\
        ";

    let sanitizer = SanitizerConfig::strict().allow_schemes(
        [ElementAttributesSchemes {
            element: "img",
            attr_schemes: &[PropertiesNames { parent: "src", properties: &["https"] }],
        }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// An element with an attribute using a denied scheme is replaced by its children, even if the
/// mode allows the scheme.
#[test]
fn deny_schemes() {
    let input = "\
        <a href=\"https://notareal.hs/abcdef.png\">Secure link to an image</a>\
        <a href=\"http://notareal.hs/abcdef.png\">Insecure link to an image</a>\
        ";
    let expected = "\
        <a href=\"https://notareal.hs/abcdef.png\">Secure link to an image</a>\
        Insecure link to an image\
        ";

    let sanitizer = SanitizerConfig::strict().deny_schemes([ElementAttributesSchemes {
        element: "a",
        attr_schemes: &[PropertiesNames { parent: "href", properties: &["http"] }],
    }]);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Without a mode, only the classes matching a listed pattern are kept, and a `class` attribute
/// left without classes is removed.
#[test]
fn allow_classes() {
    let input = "\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        <img class=\"custom-class custom-class-img img\" src=\"mxc://notareal.hs/abcdef\">\
        ";
    let expected = "\
        <code>&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        <img class=\"custom-class custom-class-img\" src=\"mxc://notareal.hs/abcdef\">\
        ";

    let sanitizer = SanitizerConfig::new().allow_classes(
        [PropertiesNames { parent: "img", properties: &["custom-class", "custom-class-*"] }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an overriding list of allowed classes replaces the list of the mode.
#[test]
fn override_classes() {
    let input = "\
        <code class=\"language-html custom-class custom-class-code code\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <code class=\"custom-class custom-class-code\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict().allow_classes(
        [PropertiesNames { parent: "code", properties: &["custom-class", "custom-class-*"] }],
        ListBehavior::Override,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// In strict mode, an added list of allowed classes extends the list of the mode.
#[test]
fn add_classes() {
    let input = "\
        <code class=\"language-html custom-class custom-class-code code\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <code class=\"language-html custom-class custom-class-code\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict().allow_classes(
        [PropertiesNames { parent: "code", properties: &["custom-class", "custom-class-*"] }],
        ListBehavior::Add,
    );
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}
/// Classes to remove are dropped even if the mode allows them.
#[test]
fn remove_classes() {
    let input = "\
        <code class=\"language-html language-rust\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";
    let expected = "\
        <code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
        ";

    let sanitizer = SanitizerConfig::strict()
        .remove_classes([PropertiesNames { parent: "code", properties: &["language-rust"] }]);
    let mut document = Html::parse(input);
    document.sanitize_with(&sanitizer);

    assert_eq!(document.to_string(), expected);
}