events: Move sanitize HTML features to new ruma-html crate

This commit is contained in:
Kévin Commaille
2023-08-24 13:41:46 +02:00
committed by Kévin Commaille
parent acfeb38e90
commit 24ce9d5e09
17 changed files with 262 additions and 195 deletions

View File

@@ -0,0 +1,3 @@
# 0.1.0 (unreleased)
Initial release

View File

@@ -0,0 +1,21 @@
[package]
name = "ruma-html"
version = "0.1.0"
description = "Opinionated HTML parsing and manipulating."
homepage = "https://www.ruma.io/"
keywords = ["matrix", "chat", "messaging", "ruma", "html", "parser"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/ruma/ruma"
edition = "2021"
rust-version = { workspace = true }
[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]
[dependencies]
html5ever = "0.26.0"
phf = { version = "0.11.1", features = ["macros"] }
tracing = { workspace = true, features = ["attributes"] }
wildmatch = "2.0.0"

View File

@@ -0,0 +1,12 @@
# ruma-html
[![crates.io page](https://img.shields.io/crates/v/ruma-html.svg)](https://crates.io/crates/ruma-html)
[![docs.rs page](https://docs.rs/ruma-html/badge.svg)](https://docs.rs/ruma-html/)
![license: MIT](https://img.shields.io/crates/l/ruma-html.svg)
Opinionated HTML parsing and manipulating library.
Like the rest of the Ruma crates, this crate is primarily meant to be used for
the Matrix protocol. It should be able to be used to interact with any HTML
content but will offer APIs focused on specificities of HTML in the Matrix
specification.

View File

@@ -0,0 +1,399 @@
use std::{collections::BTreeSet, fmt, io};
use html5ever::{
local_name, namespace_url, ns, parse_fragment,
serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope},
tendril::{StrTendril, TendrilSink},
tree_builder::{NodeOrText, TreeSink},
Attribute, ParseOpts, QualName,
};
use tracing::debug;
/// An HTML fragment.
///
/// The nodes of the fragment are stored in a flat list and refer to each other by their index in
/// that list.
///
/// To get the serialized HTML, use its `Display` implementation.
#[derive(Debug)]
pub struct Fragment {
/// All the nodes of this fragment; the ID of a node is its index in this list.
///
/// A fragment created with `Default` or `parse_html` holds the `NodeData::Document` node at
/// index 0.
pub(crate) nodes: Vec<Node>,
}
impl Fragment {
    /// Parse the given HTML string into a new `Fragment`.
    ///
    /// The HTML is parsed as the content of a `div` element.
    pub fn parse_html(html: &str) -> Self {
        let context = QualName::new(None, ns!(html), local_name!("div"));
        let mut parser =
            parse_fragment(Self::default(), ParseOpts::default(), context, Vec::new());

        parser.process(html.into());
        parser.finish()
    }

    /// Create a `Node` holding the given data and store it in this `Fragment`.
    ///
    /// The new node is not attached to any other node.
    ///
    /// Returns the index of the new node.
    pub fn new_node(&mut self, data: NodeData) -> usize {
        let id = self.nodes.len();
        self.nodes.push(Node::new(data));
        id
    }

    /// Make the given node the last child of the given parent in this `Fragment`.
    ///
    /// The node is detached from its previous position.
    pub fn append_node(&mut self, parent_id: usize, node_id: usize) {
        self.detach(node_id);
        self.nodes[node_id].parent = Some(parent_id);

        // Swap the parent's last child for the new node and link the former last child, if any.
        let former_last = self.nodes[parent_id].last_child.replace(node_id);
        match former_last {
            Some(last) => {
                self.nodes[node_id].prev_sibling = Some(last);
                self.nodes[last].next_sibling = Some(node_id);
            }
            // The parent had no children: the node is also its first child.
            None => self.nodes[parent_id].first_child = Some(node_id),
        }
    }

    /// Place the given node right before the given sibling in this `Fragment`.
    ///
    /// The node is detached from its previous position.
    pub fn insert_before(&mut self, sibling_id: usize, node_id: usize) {
        self.detach(node_id);

        let parent = self.nodes[sibling_id].parent;
        self.nodes[node_id].parent = parent;
        self.nodes[node_id].next_sibling = Some(sibling_id);

        // The node takes the place of the sibling's previous neighbour.
        let former_prev = self.nodes[sibling_id].prev_sibling.replace(node_id);
        match (former_prev, parent) {
            (Some(prev), _) => {
                self.nodes[node_id].prev_sibling = Some(prev);
                self.nodes[prev].next_sibling = Some(node_id);
            }
            // The sibling was the first child: the node becomes the first child instead.
            (None, Some(parent)) => self.nodes[parent].first_child = Some(node_id),
            (None, None) => {}
        }
    }

    /// Unlink the given node from its parent and siblings in this `Fragment`.
    ///
    /// The node keeps its own children and stays stored in the fragment.
    pub fn detach(&mut self, node_id: usize) {
        let node = &mut self.nodes[node_id];
        let parent = node.parent.take();
        let before = node.prev_sibling.take();
        let after = node.next_sibling.take();

        // Fix the backward link of what follows the node.
        match (after, parent) {
            (Some(after), _) => self.nodes[after].prev_sibling = before,
            (None, Some(parent)) => self.nodes[parent].last_child = before,
            (None, None) => {}
        }

        // Fix the forward link of what precedes the node.
        match (before, parent) {
            (Some(before), _) => self.nodes[before].next_sibling = after,
            (None, Some(parent)) => self.nodes[parent].first_child = after,
            (None, None) => {}
        }
    }
}
impl Default for Fragment {
    /// Creates a `Fragment` that only contains the document node, at index 0.
    fn default() -> Self {
        let document = Node::new(NodeData::Document);
        Self { nodes: vec![document] }
    }
}
impl TreeSink for Fragment {
    /// Nodes are identified by their index in `Fragment::nodes`.
    type Handle = usize;
    type Output = Self;

    fn finish(self) -> Self::Output {
        self
    }

    /// Parse errors are only logged, they never interrupt the parsing.
    fn parse_error(&mut self, msg: std::borrow::Cow<'static, str>) {
        debug!("HTML parse error: {msg}");
    }

    /// The document node is always the first node of the fragment.
    fn get_document(&mut self) -> Self::Handle {
        0
    }

    /// Get the name of the given element.
    ///
    /// # Panics
    ///
    /// Panics if the target is not an element.
    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> html5ever::ExpandedName<'a> {
        self.nodes[*target].as_element().expect("not an element").name.expanded()
    }

    fn create_element(
        &mut self,
        name: QualName,
        attrs: Vec<Attribute>,
        _flags: html5ever::tree_builder::ElementFlags,
    ) -> Self::Handle {
        self.new_node(NodeData::Element(ElementData { name, attrs: attrs.into_iter().collect() }))
    }

    /// The content of comments is not kept.
    fn create_comment(&mut self, _text: StrTendril) -> Self::Handle {
        self.new_node(NodeData::Other)
    }

    /// The content of processing instructions is not kept.
    fn create_pi(&mut self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
        self.new_node(NodeData::Other)
    }

    fn append(&mut self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
        match child {
            NodeOrText::AppendNode(index) => self.append_node(*parent, index),
            NodeOrText::AppendText(text) => {
                // If the previous sibling is also text, add this text to it.
                if let Some(sibling) =
                    self.nodes[*parent].last_child.and_then(|child| self.nodes[child].as_text_mut())
                {
                    sibling.push_tendril(&text);
                } else {
                    let index = self.new_node(NodeData::Text(text));
                    self.append_node(*parent, index);
                }
            }
        }
    }

    fn append_based_on_parent_node(
        &mut self,
        element: &Self::Handle,
        prev_element: &Self::Handle,
        child: NodeOrText<Self::Handle>,
    ) {
        if self.nodes[*element].parent.is_some() {
            self.append_before_sibling(element, child);
        } else {
            self.append(prev_element, child);
        }
    }

    /// Doctypes are dropped.
    fn append_doctype_to_document(
        &mut self,
        _name: StrTendril,
        _public_id: StrTendril,
        _system_id: StrTendril,
    ) {
    }

    /// Templates have no separate content: the template element itself is used.
    fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle {
        *target
    }

    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
        x == y
    }

    /// The quirks mode is not recorded.
    fn set_quirks_mode(&mut self, _mode: html5ever::tree_builder::QuirksMode) {}

    fn append_before_sibling(
        &mut self,
        sibling: &Self::Handle,
        new_node: NodeOrText<Self::Handle>,
    ) {
        match new_node {
            NodeOrText::AppendNode(index) => self.insert_before(*sibling, index),
            NodeOrText::AppendText(text) => {
                // If the previous sibling is also text, add this text to it.
                if let Some(prev_text) = self.nodes[*sibling]
                    .prev_sibling
                    .and_then(|prev| self.nodes[prev].as_text_mut())
                {
                    prev_text.push_tendril(&text);
                } else {
                    let index = self.new_node(NodeData::Text(text));
                    self.insert_before(*sibling, index);
                }
            }
        }
    }

    /// Add the given attributes to the target element, skipping those whose name is already
    /// present on the element.
    ///
    /// # Panics
    ///
    /// Panics if the target is not an element.
    fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec<Attribute>) {
        let target = self.nodes[*target].as_element_mut().expect("not an element");

        for attr in attrs {
            // The set is ordered by name AND value, so inserting blindly would keep two
            // attributes with the same name when their values differ.
            let already_present = target.attrs.iter().any(|existing| existing.name == attr.name);

            if !already_present {
                target.attrs.insert(attr);
            }
        }
    }

    fn remove_from_parent(&mut self, target: &Self::Handle) {
        self.detach(*target);
    }

    fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) {
        let mut next_child = self.nodes[*node].first_child;
        while let Some(child) = next_child {
            // Read the next sibling first, appending the child resets its sibling links.
            next_child = self.nodes[child].next_sibling;
            self.append_node(*new_parent, child);
        }
    }
}
impl Serialize for Fragment {
    /// Serialize the children of the root element of this fragment.
    ///
    /// The root element is the first child of the document node; it is not part of the output
    /// itself. Nothing is written for `TraversalScope::ChildrenOnly`.
    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
    where
        S: Serializer,
    {
        match traversal_scope {
            TraversalScope::IncludeNode => {
                // A fragment that only holds the document node (e.g. `Fragment::default()`) has
                // no root element: there is nothing to write, and it must not panic.
                let root = match self.nodes[0].first_child {
                    Some(root) => root,
                    None => return Ok(()),
                };

                let mut next_child = self.nodes[root].first_child;
                while let Some(child) = next_child {
                    let child = &self.nodes[child];
                    child.serialize(self, serializer)?;
                    next_child = child.next_sibling;
                }

                Ok(())
            }
            TraversalScope::ChildrenOnly(_) => Ok(()),
        }
    }
}
impl fmt::Display for Fragment {
    /// Writes the serialized HTML of this fragment.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let opts =
            SerializeOpts { traversal_scope: TraversalScope::IncludeNode, ..Default::default() };

        // Serialize into an in-memory buffer first, then hand the result to the formatter.
        let mut bytes = Vec::new();
        serialize(&mut bytes, self, opts).unwrap();
        let html = String::from_utf8(bytes).unwrap();

        f.write_str(&html)
    }
}
/// An HTML node.
///
/// The links to the other nodes are indices in the list of nodes of the `Fragment` that holds
/// this node.
#[derive(Debug)]
#[non_exhaustive]
pub struct Node {
/// The parent of this node, if any.
pub(crate) parent: Option<usize>,
/// The sibling right before this node, if any.
pub(crate) prev_sibling: Option<usize>,
/// The sibling right after this node, if any.
pub(crate) next_sibling: Option<usize>,
/// The first child of this node, if any.
pub(crate) first_child: Option<usize>,
/// The last child of this node, if any.
pub(crate) last_child: Option<usize>,
/// The data of this node.
pub(crate) data: NodeData,
}
impl Node {
    /// Creates a `Node` holding the given data, without any link to other nodes.
    pub fn new(data: NodeData) -> Self {
        Self {
            data,
            parent: None,
            prev_sibling: None,
            next_sibling: None,
            first_child: None,
            last_child: None,
        }
    }

    /// Borrows the `ElementData` of this `Node`, or returns `None` if it is not a
    /// `NodeData::Element`.
    pub fn as_element(&self) -> Option<&ElementData> {
        if let NodeData::Element(element) = &self.data {
            Some(element)
        } else {
            None
        }
    }

    /// Mutably borrows the `ElementData` of this `Node`, or returns `None` if it is not a
    /// `NodeData::Element`.
    pub fn as_element_mut(&mut self) -> Option<&mut ElementData> {
        if let NodeData::Element(element) = &mut self.data {
            Some(element)
        } else {
            None
        }
    }

    /// Mutably borrows the text content of this `Node`, or returns `None` if it is not a
    /// `NodeData::Text`.
    pub fn as_text_mut(&mut self) -> Option<&mut StrTendril> {
        if let NodeData::Text(text) = &mut self.data {
            Some(text)
        } else {
            None
        }
    }
}
impl Node {
pub(crate) fn serialize<S>(&self, fragment: &Fragment, serializer: &mut S) -> io::Result<()>
where
S: Serializer,
{
match &self.data {
NodeData::Element(data) => {
serializer.start_elem(
data.name.clone(),
data.attrs.iter().map(|attr| (&attr.name, &*attr.value)),
)?;
let mut next_child = self.first_child;
while let Some(child) = next_child {
let child = &fragment.nodes[child];
child.serialize(fragment, serializer)?;
next_child = child.next_sibling;
}
serializer.end_elem(data.name.clone())?;
Ok(())
}
NodeData::Document => {
let mut next_child = self.first_child;
while let Some(child) = next_child {
let child = &fragment.nodes[child];
child.serialize(fragment, serializer)?;
next_child = child.next_sibling;
}
Ok(())
}
NodeData::Text(text) => serializer.write_text(text),
_ => Ok(()),
}
}
}
/// The data of a `Node`.
#[derive(Debug)]
#[allow(clippy::exhaustive_enums)]
pub enum NodeData {
/// The root node of the `Fragment`.
Document,
/// A text node.
Text(StrTendril),
/// An HTML element (aka a tag).
Element(ElementData),
/// Other types (comment, processing instruction, …).
///
/// The content of these nodes is not kept and they are skipped when serializing.
Other,
}
/// The data of an HTML element.
#[derive(Debug)]
#[allow(clippy::exhaustive_structs)]
pub struct ElementData {
/// The qualified name of the element.
pub name: QualName,
/// The attributes of the element, kept in an ordered set.
pub attrs: BTreeSet<Attribute>,
}
#[cfg(test)]
mod tests {
use super::Fragment;
// Parsing a well-formed fragment and serializing it again must give back the same HTML, and an
// empty input must give an empty output.
#[test]
fn sanity() {
let html = "\
<h1>Title</h1>\
<div>\
<p>This is some <em>text</em></p>\
</div>\
";
assert_eq!(Fragment::parse_html(html).to_string(), html);
assert_eq!(Fragment::parse_html("").to_string(), "");
}
}

View File

@@ -0,0 +1,19 @@
#![doc(html_favicon_url = "https://www.ruma.io/favicon.ico")]
#![doc(html_logo_url = "https://www.ruma.io/images/logo.png")]
//! Opinionated HTML parsing and manipulating library.
//!
//! Like the rest of the Ruma crates, this crate is primarily meant to be used for
//! the Matrix protocol. It should be able to be used to interact with any HTML
//! document but will offer APIs focused on specificities of HTML in the Matrix
//! specification.
#![warn(missing_docs)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
mod html_fragment;
mod sanitize;
pub use self::{
html_fragment::{ElementData, Fragment, Node, NodeData},
sanitize::*,
};

View File

@@ -0,0 +1,157 @@
//! Convenience methods and types to sanitize HTML messages.
mod html_sanitizer;
pub use self::html_sanitizer::HtmlSanitizer;
/// Sanitize the given HTML string and return the resulting HTML.
///
/// The [tags and attributes] that are not listed in the Matrix specification are removed,
/// according to the given `mode`.
///
/// Depending on `remove_reply_fallback`, the [rich reply fallback] is removed too.
///
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
pub fn sanitize_html(
    s: &str,
    mode: HtmlSanitizerMode,
    remove_reply_fallback: RemoveReplyFallback,
) -> String {
    let cleaned = HtmlSanitizer::new(mode, remove_reply_fallback).clean(s);
    cleaned.to_string()
}
/// What HTML [tags and attributes] should be kept by the sanitizer.
///
/// This is one of the arguments of `sanitize_html` and `HtmlSanitizer::new`.
///
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[allow(clippy::exhaustive_enums)]
pub enum HtmlSanitizerMode {
/// Keep only the tags and attributes listed in the Matrix specification.
Strict,
/// Like `Strict` mode, with additional tags and attributes that are not yet included in
/// the spec, but are reasonable to keep.
Compat,
}
/// Whether to remove the [rich reply fallback] while sanitizing.
///
/// This is one of the arguments of `sanitize_html` and `HtmlSanitizer::new`.
///
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[allow(clippy::exhaustive_enums)]
pub enum RemoveReplyFallback {
/// Remove the rich reply fallback.
Yes,
/// Don't remove the rich reply fallback.
No,
}
/// Strip the [rich reply fallback] from the given HTML string and return the resulting HTML.
///
/// The other tags and attributes are left untouched. However, because the HTML is parsed and
/// serialized again, malformed HTML and comments are stripped from the output.
///
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
pub fn remove_html_reply_fallback(s: &str) -> String {
    let without_fallback = HtmlSanitizer::reply_fallback_remover().clean(s);
    without_fallback.to_string()
}
#[cfg(test)]
mod tests {
use super::{
remove_html_reply_fallback, sanitize_html, HtmlSanitizerMode, RemoveReplyFallback,
};
// In strict mode, with the reply fallback kept: the unknown `<removed>` tag is dropped but its
// text stays, and the `<mx-reply>` block is left as is.
#[test]
fn sanitize() {
let sanitized = sanitize_html(
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
<removed>This has no tag</removed>\
<p>But this is inside a tag</p>\
",
HtmlSanitizerMode::Strict,
RemoveReplyFallback::No,
);
assert_eq!(
sanitized,
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
This has no tag\
<p>But this is inside a tag</p>\
"
);
}
// Same input as above, but the `<mx-reply>` block and its content must be removed too.
#[test]
fn sanitize_without_reply() {
let sanitized = sanitize_html(
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
<removed>This has no tag</removed>\
<p>But this is inside a tag</p>\
",
HtmlSanitizerMode::Strict,
RemoveReplyFallback::Yes,
);
assert_eq!(
sanitized,
"\
This has no tag\
<p>But this is inside a tag</p>\
"
);
}
// Only the `<mx-reply>` block must be removed: the unknown `<keep-me>` tag is kept.
#[test]
fn remove_html_reply() {
let without_reply = remove_html_reply_fallback(
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
<keep-me>This keeps its tag</keep-me>\
<p>But this is inside a tag</p>\
",
);
assert_eq!(
without_reply,
"\
<keep-me>This keeps its tag</keep-me>\
<p>But this is inside a tag</p>\
"
);
}
}

View File

@@ -0,0 +1,540 @@
use html5ever::{tendril::StrTendril, Attribute};
use phf::{phf_map, phf_set, Map, Set};
use wildmatch::WildMatch;
use super::{HtmlSanitizerMode, RemoveReplyFallback};
use crate::{ElementData, Fragment, NodeData};
/// A sanitizer to filter [HTML tags and attributes] according to the Matrix specification.
///
/// [HTML tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[derive(Debug, Clone)]
pub struct HtmlSanitizer {
/// The mode of the HTML sanitizer.
///
/// It selects the list of URI schemes that are allowed, and is only used when
/// `filter_tags_attributes` is `true`.
mode: HtmlSanitizerMode,
/// Whether to filter HTML tags and attributes.
///
/// If this is `true`, tags and attributes that do not match the lists will be removed, but
/// the tags' children will still be present in the output.
///
/// If this is `false`, all the tags and attributes are allowed.
filter_tags_attributes: bool,
/// Whether to remove replies.
///
/// If this is `true`, the rich reply fallback will be removed.
///
/// If this is `false`, the rich reply tag will be allowed.
remove_replies: bool,
}
impl HtmlSanitizer {
    /// Constructs an `HtmlSanitizer` that will filter the tags and attributes according to the
    /// given mode.
    ///
    /// It can also optionally remove the [rich reply fallback].
    ///
    /// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
    pub fn new(mode: HtmlSanitizerMode, remove_reply_fallback: RemoveReplyFallback) -> Self {
        Self {
            mode,
            filter_tags_attributes: true,
            remove_replies: remove_reply_fallback == RemoveReplyFallback::Yes,
        }
    }

    /// Constructs an `HtmlSanitizer` instance that only removes the [rich reply fallback].
    ///
    /// All the other tags and attributes are kept.
    ///
    /// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
    pub fn reply_fallback_remover() -> Self {
        Self {
            mode: HtmlSanitizerMode::Strict,
            filter_tags_attributes: false,
            remove_replies: true,
        }
    }

    /// Clean the given HTML string with this sanitizer.
    ///
    /// Returns the parsed and cleaned `Fragment`; use its `Display` implementation to get the
    /// sanitized HTML.
    pub fn clean(&self, html: &str) -> Fragment {
        let mut fragment = Fragment::parse_html(html);

        // NOTE(review): relies on the parser always creating a root element, even for an empty
        // input (the tests clean an empty string without panicking).
        let root = fragment.nodes[0].first_child.unwrap();

        let mut next_child = fragment.nodes[root].first_child;
        while let Some(child) = next_child {
            // Read the next sibling first: cleaning the child can detach it.
            next_child = fragment.nodes[child].next_sibling;
            self.clean_node(&mut fragment, child, 0);
        }

        fragment
    }

    /// Clean the node with the given ID and its descendants.
    ///
    /// `depth` is the nesting level of the node; the top-level nodes have a depth of 0.
    fn clean_node(&self, fragment: &mut Fragment, node_id: usize, depth: u32) {
        let action = self.node_action(fragment, node_id, depth);

        if action != NodeAction::Remove {
            let mut next_child = fragment.nodes[node_id].first_child;
            while let Some(child) = next_child {
                // Read the next sibling first: the child can be moved or detached below.
                next_child = fragment.nodes[child].next_sibling;

                if action == NodeAction::Ignore {
                    // The node is going away: its children take its place.
                    fragment.insert_before(node_id, child);
                }

                self.clean_node(fragment, child, depth + 1);
            }
        }

        if matches!(action, NodeAction::Ignore | NodeAction::Remove) {
            fragment.detach(node_id);
        } else if self.filter_tags_attributes {
            if let Some(data) = fragment.nodes[node_id].as_element_mut() {
                self.clean_element_attributes(data);
            }
        }
    }

    /// Decide what to do with the node with the given ID, found at the given depth.
    fn node_action(&self, fragment: &Fragment, node_id: usize, depth: u32) -> NodeAction {
        match &fragment.nodes[node_id].data {
            NodeData::Element(ElementData { name, attrs, .. }) => {
                let tag: &str = &name.local;

                if (self.remove_replies && tag == RICH_REPLY_TAG)
                    || (self.filter_tags_attributes && depth >= MAX_DEPTH_STRICT)
                {
                    return NodeAction::Remove;
                }

                if !self.filter_tags_attributes {
                    return NodeAction::None;
                }

                if !ALLOWED_TAGS_WITHOUT_REPLY_STRICT.contains(tag) && tag != RICH_REPLY_TAG {
                    return NodeAction::Ignore;
                }

                let allowed_schemes = if self.mode == HtmlSanitizerMode::Strict {
                    &ALLOWED_SCHEMES_STRICT
                } else {
                    &ALLOWED_SCHEMES_COMPAT
                };

                for attr in attrs.iter() {
                    let value = &attr.value;
                    let attr: &str = &attr.name.local;

                    // Check if there is a (tag, attr) tuple entry.
                    if let Some(schemes) = allowed_schemes.get(&*format!("{tag}:{attr}")) {
                        // Check if the value starts with an allowed scheme followed by `:`.
                        let scheme_allowed = schemes.iter().any(|scheme| {
                            value
                                .strip_prefix(*scheme)
                                .map_or(false, |rest| rest.starts_with(':'))
                        });

                        if !scheme_allowed {
                            return NodeAction::Ignore;
                        }
                    }
                }

                NodeAction::None
            }
            NodeData::Text(_) => NodeAction::None,
            _ => NodeAction::Remove,
        }
    }

    /// Remove the attributes and the classes of the given element that are not in the allow
    /// lists.
    fn clean_element_attributes(&self, data: &mut ElementData) {
        let ElementData { name, attrs } = data;
        let tag: &str = &name.local;

        // The set cannot be changed while it is iterated over: collect the changes first.
        let actions: Vec<_> = attrs
            .iter()
            .filter_map(|attr| {
                let value = &attr.value;
                let name: &str = &attr.name.local;

                let is_allowed = ALLOWED_ATTRIBUTES_STRICT
                    .get(tag)
                    .map_or(false, |allowed| allowed.contains(name));
                if !is_allowed {
                    return Some(AttributeAction::Remove(attr.to_owned()));
                }

                if name == "class" {
                    if let Some(classes) = ALLOWED_CLASSES_STRICT.get(tag) {
                        let mut changed = false;
                        let kept_classes: Vec<&str> = value
                            .split_whitespace()
                            .filter(|attr_class| {
                                let allowed = classes
                                    .iter()
                                    .any(|class| WildMatch::new(class).matches(attr_class));
                                changed |= !allowed;
                                allowed
                            })
                            .collect();

                        // The value is only touched if at least one class was dropped.
                        if changed {
                            return Some(if kept_classes.is_empty() {
                                AttributeAction::Remove(attr.to_owned())
                            } else {
                                // Class names are separated by a single space in the output.
                                AttributeAction::ReplaceValue(
                                    attr.to_owned(),
                                    kept_classes.join(" ").into(),
                                )
                            });
                        }
                    }
                }

                None
            })
            .collect();

        for action in actions {
            match action {
                AttributeAction::ReplaceValue(attr, value) => {
                    // The value is part of the ordering of the set, so the attribute must be
                    // taken out and inserted again.
                    if let Some(mut attr) = attrs.take(&attr) {
                        attr.value = value;
                        attrs.insert(attr);
                    }
                }
                AttributeAction::Remove(attr) => {
                    attrs.remove(&attr);
                }
            }
        }
    }
}
/// The possible actions to apply to an element node.
#[derive(Debug, PartialEq, Eq)]
enum NodeAction {
/// Don't do anything.
None,
/// Remove the element but keep its children.
///
/// The children are moved in place of the element.
Ignore,
/// Remove the element and its children.
Remove,
}
/// The possible actions to apply to an attribute of an element.
#[derive(Debug)]
enum AttributeAction {
/// Replace the value of the given attribute with the given value.
ReplaceValue(Attribute, StrTendril),
/// Remove the given attribute.
Remove(Attribute),
}
/// List of HTML tags allowed in the Matrix specification, without the rich reply fallback tag.
static ALLOWED_TAGS_WITHOUT_REPLY_STRICT: Set<&str> = phf_set! {
"font", "del", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "p", "a",
"ul", "ol", "sup", "sub", "li", "b", "i", "u", "strong", "em", "strike",
"code", "hr", "br", "div", "table", "thead", "tbody", "tr", "th", "td",
"caption", "pre", "span", "img", "details", "summary",
};
/// The HTML tag name for a rich reply fallback.
const RICH_REPLY_TAG: &str = "mx-reply";
/// Allowed attributes per HTML tag according to the Matrix specification.
///
/// Tags that are not in this map are not allowed to have any attribute.
static ALLOWED_ATTRIBUTES_STRICT: Map<&str, &Set<&str>> = phf_map! {
"font" => &ALLOWED_ATTRIBUTES_FONT_STRICT,
"span" => &ALLOWED_ATTRIBUTES_SPAN_STRICT,
"a" => &ALLOWED_ATTRIBUTES_A_STRICT,
"img" => &ALLOWED_ATTRIBUTES_IMG_STRICT,
"ol" => &ALLOWED_ATTRIBUTES_OL_STRICT,
"code" => &ALLOWED_ATTRIBUTES_CODE_STRICT,
};
/// Allowed attributes for the `font` tag.
static ALLOWED_ATTRIBUTES_FONT_STRICT: Set<&str> =
phf_set! { "data-mx-bg-color", "data-mx-color", "color" };
/// Allowed attributes for the `span` tag.
static ALLOWED_ATTRIBUTES_SPAN_STRICT: Set<&str> =
phf_set! { "data-mx-bg-color", "data-mx-color", "data-mx-spoiler" };
/// Allowed attributes for the `a` tag.
static ALLOWED_ATTRIBUTES_A_STRICT: Set<&str> = phf_set! { "name", "target", "href" };
/// Allowed attributes for the `img` tag.
static ALLOWED_ATTRIBUTES_IMG_STRICT: Set<&str> =
phf_set! { "width", "height", "alt", "title", "src" };
/// Allowed attributes for the `ol` tag.
static ALLOWED_ATTRIBUTES_OL_STRICT: Set<&str> = phf_set! { "start" };
/// Allowed attributes for the `code` tag.
static ALLOWED_ATTRIBUTES_CODE_STRICT: Set<&str> = phf_set! { "class" };
/// Allowed schemes of URIs per HTML tag and attribute tuple according to the Matrix specification.
///
/// The keys have the form `tag:attribute`.
static ALLOWED_SCHEMES_STRICT: Map<&str, &Set<&str>> = phf_map! {
"a:href" => &ALLOWED_SCHEMES_A_HREF_STRICT,
"img:src" => &ALLOWED_SCHEMES_IMG_SRC_STRICT,
};
/// Allowed schemes for the `href` attribute of the `a` tag.
static ALLOWED_SCHEMES_A_HREF_STRICT: Set<&str> =
phf_set! { "http", "https", "ftp", "mailto", "magnet" };
/// Allowed schemes for the `src` attribute of the `img` tag.
static ALLOWED_SCHEMES_IMG_SRC_STRICT: Set<&str> = phf_set! { "mxc" };
/// Allowed schemes of URIs per HTML tag and attribute tuple, in compatibility mode.
///
/// This is the content of `ALLOWED_SCHEMES_STRICT` with additional schemes that can be
/// encountered but are not listed in the Matrix specification. The additions are:
///
/// * The `matrix` scheme for `a` tags (see [matrix-org/matrix-spec#1108]).
///
/// This list is complete, it does not need to be merged with `ALLOWED_SCHEMES_STRICT`.
///
/// [matrix-org/matrix-spec#1108]: https://github.com/matrix-org/matrix-spec/issues/1108
static ALLOWED_SCHEMES_COMPAT: Map<&str, &Set<&str>> = phf_map! {
"a:href" => &ALLOWED_SCHEMES_A_HREF_COMPAT,
"img:src" => &ALLOWED_SCHEMES_IMG_SRC_STRICT,
};
/// Allowed schemes for the `href` attribute of the `a` tag, in compatibility mode.
static ALLOWED_SCHEMES_A_HREF_COMPAT: Set<&str> =
phf_set! { "http", "https", "ftp", "mailto", "magnet", "matrix" };
/// Allowed classes per HTML tag according to the Matrix specification.
///
/// The class names are wildcard patterns.
static ALLOWED_CLASSES_STRICT: Map<&str, &Set<&str>> =
phf_map! { "code" => &ALLOWED_CLASSES_CODE_STRICT };
/// Allowed classes for the `code` tag.
static ALLOWED_CLASSES_CODE_STRICT: Set<&str> = phf_set! { "language-*" };
/// Max depth of nested HTML tags allowed by the Matrix specification.
///
/// When tags are filtered, elements found at this depth or deeper are removed with their
/// children. The top-level elements have a depth of 0.
const MAX_DEPTH_STRICT: u32 = 100;
#[cfg(test)]
mod tests {
use super::{HtmlSanitizer, HtmlSanitizerMode, RemoveReplyFallback};
// HTML that only uses allowed tags and attributes must come out unchanged, including an escaped
// `<mx-reply>` that is only text.
#[test]
fn valid_input() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::Yes);
let sanitized = sanitizer.clean(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
<img src=\"mxc://notareal.hs/abcdef\">\
<code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
<img src=\"mxc://notareal.hs/abcdef\">\
<code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
"
);
}
// An unknown tag is dropped but its content is kept; the reply fallback is kept when asked.
#[test]
fn tags_remove() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::No);
let sanitized = sanitizer.clean(
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
<removed>This has no tag</removed>\
<p>But this is inside a tag</p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
This has no tag\
<p>But this is inside a tag</p>\
"
);
}
// Same input as above, but the reply fallback must be removed with its content.
#[test]
fn tags_remove_without_reply() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::Yes);
let sanitized = sanitizer.clean(
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
<removed>This has no tag</removed>\
<p>But this is inside a tag</p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
This has no tag\
<p>But this is inside a tag</p>\
"
);
}
// The reply fallback remover must not filter the other tags: `<keep-me>` is kept.
#[test]
fn tags_remove_only_reply_fallback() {
let sanitizer = HtmlSanitizer::reply_fallback_remover();
let sanitized = sanitizer.clean(
"\
<mx-reply>\
<blockquote>\
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
<br>\
Previous message\
</blockquote>\
</mx-reply>\
<keep-me>This keeps its tag</keep-me>\
<p>But this is inside a tag</p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<keep-me>This keeps its tag</keep-me>\
<p>But this is inside a tag</p>\
"
);
}
// Attributes that are not allowed for their tag are removed, the allowed ones are kept.
#[test]
fn attrs_remove() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::No);
let sanitized = sanitizer.clean(
"\
<h1 id=\"anchor1\">Title for important stuff</h1>\
<p class=\"important\">Look at <font color=\"blue\" size=20>me!</font></p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<h1>Title for important stuff</h1>\
<p>Look at <font color=\"blue\">me!</font></p>\
"
);
}
// An image whose source does not use an allowed scheme is dropped.
#[test]
fn img_remove_scheme() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::No);
let sanitized = sanitizer.clean(
"\
<p>Look at that picture:</p>\
<img src=\"https://notareal.hs/abcdef\">\
",
);
assert_eq!(
sanitized.to_string(),
"\
<p>Look at that picture:</p>\
"
);
}
// A link whose target does not use an allowed scheme is dropped, but its text is kept.
#[test]
fn link_remove_scheme() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::No);
let sanitized = sanitizer.clean(
"\
<p>Go see <a href=\"file://local/file.html\">my local website</a></p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<p>Go see my local website</p>\
"
);
}
// The `matrix` scheme is refused in strict mode and allowed in compat mode.
#[test]
fn link_compat_scheme() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::No);
let sanitized = sanitizer.clean(
"\
<p>Join <a href=\"matrix:r/myroom:notareal.hs\">my room</a></p>\
<p>To talk about <a href=\"https://mycat.org\">my cat</a></p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<p>Join my room</p>\
<p>To talk about <a href=\"https://mycat.org\">my cat</a></p>\
"
);
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Compat, RemoveReplyFallback::No);
let sanitized = sanitizer.clean(
"\
<p>Join <a href=\"matrix:r/myroom:notareal.hs\">my room</a></p>\
<p>To talk about <a href=\"https://mycat.org\">my cat</a></p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<p>Join <a href=\"matrix:r/myroom:notareal.hs\">my room</a></p>\
<p>To talk about <a href=\"https://mycat.org\">my cat</a></p>\
"
);
}
// Classes that do not match the allowed patterns are removed, and the `class` attribute is
// removed entirely when no class is left.
#[test]
fn class_remove() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::No);
let sanitized = sanitizer.clean(
"\
<pre><code class=\"language-rust custom-class\">
type StringList = Vec&lt;String&gt;;
</code></pre>\
<p>What do you think of the name <code class=\"fake-language-rust\">StringList</code>?</p>\
",
);
assert_eq!(
sanitized.to_string(),
"\
<pre><code class=\"language-rust\">
type StringList = Vec&lt;String&gt;;
</code></pre>\
<p>What do you think of the name <code>StringList</code>?</p>\
"
);
}
// An element nested under 100 `div`s is removed with its content, while the text at the same
// level is kept.
#[test]
fn depth_remove() {
let sanitizer = HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::No);
let deeply_nested_html: String = std::iter::repeat("<div>")
.take(100)
.chain(Some(
"<span>I am in too deep!</span>\
I should be fine.",
))
.chain(std::iter::repeat("</div>").take(100))
.collect();
let sanitized = sanitizer.clean(&deeply_nested_html).to_string();
assert!(sanitized.contains("I should be fine."));
assert!(!sanitized.contains("I am in too deep!"));
}
}