html: Upgrade html5ever

Requires a complete change of the HTML tree, to have interior mutability
and be able to return borrowed data from a Handle.
It is now a Rc-based tree where each node has a weak reference to
its parent and strong references to its children.

This results ina few breaking changes to the public API.
This commit is contained in:
Kévin Commaille 2024-09-06 12:00:59 +02:00 committed by Kévin Commaille
parent dac38e4e17
commit 88f56b0e00
8 changed files with 303 additions and 320 deletions

View File

@ -100,7 +100,7 @@ impl fmt::Display for FormattedOrPlainBody<'_> {
if let Some(formatted_body) = self.formatted {
#[cfg(feature = "html")]
if self.is_reply {
let mut html = Html::parse(&formatted_body.body);
let html = Html::parse(&formatted_body.body);
html.sanitize();
write!(f, "{html}")

View File

@ -4,6 +4,12 @@ Breaking Changes:
- `MatrixElement::Div` is now a newtype variant.
- `AnchorData`'s `name` field was removed, according to MSC4159.
- html5ever was bumped to a new major version. A breaking change in the parsing
API required us to rewrite the `Html` type.
- `Html::sanitize()` and `Html::sanitize_with()` take a non-mutable reference.
- `NodeRef` and `Children` are now owned types and no longer implement `Copy`.
- `NodeData::Text`'s inner string and the `attrs` field of `ElementData` are
now wrapped in `RefCell`s.
Improvements:

View File

@ -18,7 +18,7 @@ matrix = ["dep:ruma-common"]
[dependencies]
as_variant = { workspace = true }
html5ever = "0.27.0"
html5ever = "0.28.0"
phf = { version = "0.11.1", features = ["macros"] }
ruma-common = { workspace = true, optional = true }
tracing = { workspace = true, features = ["attributes"] }

View File

@ -52,7 +52,7 @@ pub fn remove_html_reply_fallback(s: &str) -> String {
}
fn sanitize_inner(s: &str, config: &SanitizerConfig) -> String {
let mut html = Html::parse(s);
let html = Html::parse(s);
html.sanitize_with(config);
html.to_string()
}

View File

@ -1,4 +1,10 @@
use std::{collections::BTreeSet, fmt, io, iter::FusedIterator};
use std::{
cell::RefCell,
collections::BTreeSet,
fmt, io,
iter::FusedIterator,
rc::{Rc, Weak},
};
use as_variant::as_variant;
use html5ever::{
@ -6,7 +12,7 @@ use html5ever::{
serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope},
tendril::{StrTendril, TendrilSink},
tree_builder::{NodeOrText, TreeSink},
Attribute, ParseOpts, QualName,
Attribute, LocalName, ParseOpts, QualName,
};
use tracing::debug;
@ -21,7 +27,7 @@ use crate::SanitizerConfig;
/// parsed, note that malformed HTML and comments will be stripped from the output.
#[derive(Debug)]
pub struct Html {
pub(crate) nodes: Vec<Node>,
document: NodeRef,
}
impl Html {
@ -45,179 +51,116 @@ impl Html {
///
/// This is equivalent to calling [`Self::sanitize_with()`] with a `config` value of
/// `SanitizerConfig::compat().remove_reply_fallback()`.
pub fn sanitize(&mut self) {
pub fn sanitize(&self) {
let config = SanitizerConfig::compat().remove_reply_fallback();
self.sanitize_with(&config);
}
/// Sanitize this HTML according to the given configuration.
pub fn sanitize_with(&mut self, config: &SanitizerConfig) {
pub fn sanitize_with(&self, config: &SanitizerConfig) {
config.clean(self);
}
/// Construct a new `Node` with the given data and add it to this `Html`.
///
/// Returns the index of the new node.
pub(crate) fn new_node(&mut self, data: NodeData) -> usize {
self.nodes.push(Node::new(data));
self.nodes.len() - 1
}
/// Append the given node to the given parent in this `Html`.
///
/// The node is detached from its previous position.
pub(crate) fn append_node(&mut self, parent_id: usize, node_id: usize) {
self.detach(node_id);
self.nodes[node_id].parent = Some(parent_id);
if let Some(last_child) = self.nodes[parent_id].last_child.take() {
self.nodes[node_id].prev_sibling = Some(last_child);
self.nodes[last_child].next_sibling = Some(node_id);
} else {
self.nodes[parent_id].first_child = Some(node_id);
}
self.nodes[parent_id].last_child = Some(node_id);
}
/// Insert the given node before the given sibling in this `Html`.
///
/// The node is detached from its previous position.
pub(crate) fn insert_before(&mut self, sibling_id: usize, node_id: usize) {
self.detach(node_id);
self.nodes[node_id].parent = self.nodes[sibling_id].parent;
self.nodes[node_id].next_sibling = Some(sibling_id);
if let Some(prev_sibling) = self.nodes[sibling_id].prev_sibling.take() {
self.nodes[node_id].prev_sibling = Some(prev_sibling);
self.nodes[prev_sibling].next_sibling = Some(node_id);
} else if let Some(parent) = self.nodes[sibling_id].parent {
self.nodes[parent].first_child = Some(node_id);
}
self.nodes[sibling_id].prev_sibling = Some(node_id);
}
/// Detach the given node from this `Html`.
pub(crate) fn detach(&mut self, node_id: usize) {
let (parent, prev_sibling, next_sibling) = {
let node = &mut self.nodes[node_id];
(node.parent.take(), node.prev_sibling.take(), node.next_sibling.take())
};
if let Some(next_sibling) = next_sibling {
self.nodes[next_sibling].prev_sibling = prev_sibling;
} else if let Some(parent) = parent {
self.nodes[parent].last_child = prev_sibling;
}
if let Some(prev_sibling) = prev_sibling {
self.nodes[prev_sibling].next_sibling = next_sibling;
} else if let Some(parent) = parent {
self.nodes[parent].first_child = next_sibling;
}
}
/// Get the ID of the root node of the HTML.
pub(crate) fn root_id(&self) -> usize {
self.nodes[0].first_child.expect("html should always have a root node")
}
/// Get the root node of the HTML.
pub(crate) fn root(&self) -> &Node {
&self.nodes[self.root_id()]
fn root(&self) -> NodeRef {
self.document.first_child().expect("html should always have a root node")
}
/// Whether the root node of the HTML has children.
pub fn has_children(&self) -> bool {
self.root().first_child.is_some()
self.root().has_children()
}
/// The first child node of the root node of the HTML.
///
/// Returns `None` if the root node has no children.
pub fn first_child(&self) -> Option<NodeRef<'_>> {
self.root().first_child.map(|id| NodeRef::new(self, id))
pub fn first_child(&self) -> Option<NodeRef> {
self.root().first_child()
}
/// The last child node of the root node of the HTML .
///
/// Returns `None` if the root node has no children.
pub fn last_child(&self) -> Option<NodeRef<'_>> {
self.root().last_child.map(|id| NodeRef::new(self, id))
pub fn last_child(&self) -> Option<NodeRef> {
self.root().last_child()
}
/// Iterate through the children of the root node of the HTML.
pub fn children(&self) -> Children<'_> {
pub fn children(&self) -> Children {
Children::new(self.first_child())
}
}
impl Default for Html {
fn default() -> Self {
Self { nodes: vec![Node::new(NodeData::Document)] }
Self { document: NodeRef::new(NodeData::Document) }
}
}
impl TreeSink for Html {
type Handle = usize;
type Handle = NodeRef;
type Output = Self;
fn finish(self) -> Self::Output {
self
}
fn parse_error(&mut self, msg: std::borrow::Cow<'static, str>) {
fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
debug!("HTML parse error: {msg}");
}
fn get_document(&mut self) -> Self::Handle {
0
fn get_document(&self) -> Self::Handle {
self.document.clone()
}
fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> html5ever::ExpandedName<'a> {
self.nodes[*target].as_element().expect("not an element").name.expanded()
target.as_element().expect("not an element").name.expanded()
}
fn create_element(
&mut self,
&self,
name: QualName,
attrs: Vec<Attribute>,
_flags: html5ever::tree_builder::ElementFlags,
) -> Self::Handle {
self.new_node(NodeData::Element(ElementData { name, attrs: attrs.into_iter().collect() }))
NodeRef::new(NodeData::Element(ElementData {
name,
attrs: RefCell::new(attrs.into_iter().collect()),
}))
}
fn create_comment(&mut self, _text: StrTendril) -> Self::Handle {
self.new_node(NodeData::Other)
fn create_comment(&self, _text: StrTendril) -> Self::Handle {
NodeRef::new(NodeData::Other)
}
fn create_pi(&mut self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
self.new_node(NodeData::Other)
fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
NodeRef::new(NodeData::Other)
}
fn append(&mut self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
match child {
NodeOrText::AppendNode(index) => self.append_node(*parent, index),
NodeOrText::AppendNode(node) => parent.append_child(node),
NodeOrText::AppendText(text) => {
// If the previous sibling is also text, add this text to it.
if let Some(sibling) =
self.nodes[*parent].last_child.and_then(|child| self.nodes[child].as_text_mut())
if let Some(prev_text) =
parent.last_child().as_ref().and_then(|sibling| sibling.as_text())
{
sibling.push_tendril(&text);
prev_text.borrow_mut().push_tendril(&text);
} else {
let index = self.new_node(NodeData::Text(text));
self.append_node(*parent, index);
let node = NodeRef::new(NodeData::Text(text.into()));
parent.append_child(node);
}
}
}
}
fn append_based_on_parent_node(
&mut self,
&self,
element: &Self::Handle,
prev_element: &Self::Handle,
child: NodeOrText<Self::Handle>,
) {
if self.nodes[*element].parent.is_some() {
if element.0.parent.borrow().is_some() {
self.append_before_sibling(element, child);
} else {
self.append(prev_element, child);
@ -225,59 +168,53 @@ impl TreeSink for Html {
}
fn append_doctype_to_document(
&mut self,
&self,
_name: StrTendril,
_public_id: StrTendril,
_system_id: StrTendril,
) {
}
fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle {
*target
fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
target.clone()
}
fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
x == y
Rc::ptr_eq(&x.0, &y.0)
}
fn set_quirks_mode(&mut self, _mode: html5ever::tree_builder::QuirksMode) {}
fn set_quirks_mode(&self, _mode: html5ever::tree_builder::QuirksMode) {}
fn append_before_sibling(
&mut self,
sibling: &Self::Handle,
new_node: NodeOrText<Self::Handle>,
) {
fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
match new_node {
NodeOrText::AppendNode(index) => self.insert_before(*sibling, index),
NodeOrText::AppendNode(node) => node.insert_before_sibling(sibling),
NodeOrText::AppendText(text) => {
// If the previous sibling is also text, add this text to it.
if let Some(prev_text) = self.nodes[*sibling]
.prev_sibling
.and_then(|prev| self.nodes[prev].as_text_mut())
if let Some(prev_text) =
sibling.prev_sibling().as_ref().and_then(|prev_sibling| prev_sibling.as_text())
{
prev_text.push_tendril(&text);
prev_text.borrow_mut().push_tendril(&text);
} else {
let index = self.new_node(NodeData::Text(text));
self.insert_before(*sibling, index);
let node = NodeRef::new(NodeData::Text(text.into()));
node.insert_before_sibling(sibling);
}
}
}
}
fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec<Attribute>) {
let target = self.nodes[*target].as_element_mut().unwrap();
target.attrs.extend(attrs);
fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
let element = target.as_element().unwrap();
element.attrs.borrow_mut().extend(attrs);
}
fn remove_from_parent(&mut self, target: &Self::Handle) {
self.detach(*target);
fn remove_from_parent(&self, target: &Self::Handle) {
target.detach();
}
fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) {
let mut next_child = self.nodes[*node].first_child;
while let Some(child) = next_child {
next_child = self.nodes[child].next_sibling;
self.append_node(*new_parent, child);
fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
for child in node.0.children.take() {
child.0.parent.take();
new_parent.append_child(child);
}
}
}
@ -289,13 +226,8 @@ impl Serialize for Html {
{
match traversal_scope {
TraversalScope::IncludeNode => {
let root = self.root();
let mut next_child = root.first_child;
while let Some(child) = next_child {
let child = &self.nodes[child];
child.serialize(self, serializer)?;
next_child = child.next_sibling;
for child in self.children() {
child.serialize(serializer)?;
}
Ok(())
@ -324,85 +256,37 @@ impl fmt::Display for Html {
/// An HTML node.
#[derive(Debug)]
#[non_exhaustive]
pub(crate) struct Node {
pub(crate) parent: Option<usize>,
pub(crate) prev_sibling: Option<usize>,
pub(crate) next_sibling: Option<usize>,
pub(crate) first_child: Option<usize>,
pub(crate) last_child: Option<usize>,
pub(crate) data: NodeData,
struct Node {
parent: RefCell<Option<Weak<Node>>>,
children: RefCell<Vec<NodeRef>>,
data: NodeData,
}
impl Node {
/// Constructs a new `Node` with the given data.
/// Constructs a new `NodeRef` with the given data.
fn new(data: NodeData) -> Self {
Self {
parent: None,
prev_sibling: None,
next_sibling: None,
first_child: None,
last_child: None,
data,
}
Self { parent: Default::default(), children: Default::default(), data }
}
/// Returns the data of this `Node` if it is an Element (aka an HTML tag).
pub(crate) fn as_element(&self) -> Option<&ElementData> {
fn as_element(&self) -> Option<&ElementData> {
as_variant!(&self.data, NodeData::Element)
}
/// Returns the mutable `ElementData` of this `Node` if it is a `NodeData::Element`.
pub(crate) fn as_element_mut(&mut self) -> Option<&mut ElementData> {
as_variant!(&mut self.data, NodeData::Element)
}
/// Returns the text content of this `Node`, if it is a `NodeData::Text`.
fn as_text(&self) -> Option<&StrTendril> {
fn as_text(&self) -> Option<&RefCell<StrTendril>> {
as_variant!(&self.data, NodeData::Text)
}
/// Returns the mutable text content of this `Node`, if it is a `NodeData::Text`.
fn as_text_mut(&mut self) -> Option<&mut StrTendril> {
as_variant!(&mut self.data, NodeData::Text)
}
}
impl Node {
pub(crate) fn serialize<S>(&self, fragment: &Html, serializer: &mut S) -> io::Result<()>
where
S: Serializer,
{
match &self.data {
NodeData::Element(data) => {
serializer.start_elem(
data.name.clone(),
data.attrs.iter().map(|attr| (&attr.name, &*attr.value)),
)?;
let mut next_child = self.first_child;
while let Some(child) = next_child {
let child = &fragment.nodes[child];
child.serialize(fragment, serializer)?;
next_child = child.next_sibling;
/// Whether this is the root node of the HTML document.
fn is_root(&self) -> bool {
// The root node is the `html` element.
matches!(&self.data, NodeData::Element(element_data) if element_data.name.local.as_bytes() == b"html")
}
serializer.end_elem(data.name.clone())?;
Ok(())
}
NodeData::Document => {
let mut next_child = self.first_child;
while let Some(child) = next_child {
let child = &fragment.nodes[child];
child.serialize(fragment, serializer)?;
next_child = child.next_sibling;
}
Ok(())
}
NodeData::Text(text) => serializer.write_text(text),
_ => Ok(()),
}
/// The parent of this node, if any.
fn parent(&self) -> Option<NodeRef> {
self.parent.borrow().as_ref()?.upgrade().map(NodeRef)
}
}
@ -414,7 +298,7 @@ pub enum NodeData {
Document,
/// A text node.
Text(StrTendril),
Text(RefCell<StrTendril>),
/// An HTML element (aka a tag).
Element(ElementData),
@ -431,7 +315,7 @@ pub struct ElementData {
pub name: QualName,
/// The attributes of the element.
pub attrs: BTreeSet<Attribute>,
pub attrs: RefCell<BTreeSet<Attribute>>,
}
impl ElementData {
@ -440,126 +324,215 @@ impl ElementData {
/// [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[cfg(feature = "matrix")]
pub fn to_matrix(&self) -> matrix::MatrixElementData {
matrix::MatrixElementData::parse(&self.name, &self.attrs)
matrix::MatrixElementData::parse(&self.name, &self.attrs.borrow())
}
}
/// A reference to an HTML node.
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct NodeRef<'a> {
/// The `Html` struct containing the nodes.
pub(crate) html: &'a Html,
/// The referenced node.
pub(crate) node: &'a Node,
}
pub struct NodeRef(Rc<Node>);
impl<'a> NodeRef<'a> {
/// Construct a new `NodeRef` for the given HTML and node ID.
fn new(html: &'a Html, id: usize) -> Self {
Self { html, node: &html.nodes[id] }
impl NodeRef {
/// Constructs a new `NodeRef` with the given data.
fn new(data: NodeData) -> Self {
Self(Node::new(data).into())
}
/// Construct a new `NodeRef` from the same HTML as this node with the given node ID.
fn with_id(&self, id: usize) -> Self {
let html = self.html;
Self::new(html, id)
/// Detach this node from the tree, if it has a parent.
pub(crate) fn detach(&self) {
if let Some((parent, index)) = self.parent_and_index() {
parent.0.children.borrow_mut().remove(index);
self.0.parent.take();
}
}
/// Append the given child node to this node.
///
/// The child node is detached from its previous position.
fn append_child(&self, child: NodeRef) {
child.detach();
child.0.parent.replace(Some(Rc::downgrade(&self.0)));
self.0.children.borrow_mut().push(child);
}
/// If this node has a parent, get it and the node's position in the parent's children.
fn parent_and_index(&self) -> Option<(NodeRef, usize)> {
let parent = self.0.parent()?;
let i = parent
.0
.children
.borrow()
.iter()
.position(|child| Rc::ptr_eq(&child.0, &self.0))
.expect("child should be in parent's children");
Some((parent, i))
}
/// Insert this node before the given sibling.
///
/// This node is detached from its previous position.
pub(crate) fn insert_before_sibling(&self, sibling: &NodeRef) {
self.detach();
let (parent, index) = sibling.parent_and_index().expect("sibling should have parent");
self.0.parent.replace(Some(Rc::downgrade(&parent.0)));
parent.0.children.borrow_mut().insert(index, self.clone());
}
/// Constructs a new element `NodeRef` with the same data as this one, but with a different
/// element name and use it to replace this one in the parent.
///
/// Panics if this node is not in the tree and is not an element node.
pub(crate) fn replace_with_element_name(self, name: LocalName) -> NodeRef {
let mut element_data = self.as_element().unwrap().clone();
element_data.name.local = name;
let new_node = NodeRef::new(NodeData::Element(element_data));
for child in self.children() {
new_node.append_child(child);
}
new_node.insert_before_sibling(&self);
self.detach();
new_node
}
/// The data of the node.
pub fn data(&self) -> &'a NodeData {
&self.node.data
pub fn data(&self) -> &NodeData {
&self.0.data
}
/// Returns the data of this node if it is a `NodeData::Element`.
pub fn as_element(&self) -> Option<&'a ElementData> {
self.node.as_element()
/// Returns the data of this `Node` if it is an Element (aka an HTML tag).
pub fn as_element(&self) -> Option<&ElementData> {
self.0.as_element()
}
/// Returns the text content of this node, if it is a `NodeData::Text`.
pub fn as_text(&self) -> Option<&'a StrTendril> {
self.node.as_text()
/// Returns the text content of this `Node`, if it is a `NodeData::Text`.
pub fn as_text(&self) -> Option<&RefCell<StrTendril>> {
self.0.as_text()
}
/// The parent node of this node.
///
/// Returns `None` if the parent is the root node.
pub fn parent(&self) -> Option<NodeRef<'a>> {
let parent_id = self.node.parent?;
pub fn parent(&self) -> Option<NodeRef> {
let parent = self.0.parent()?;
// We don't want users to be able to navigate to the root.
if parent_id == self.html.root_id() {
if parent.0.is_root() {
return None;
}
Some(self.with_id(parent_id))
Some(parent)
}
/// The next sibling node of this node.
///
/// Returns `None` if this is the last of its siblings.
pub fn next_sibling(&self) -> Option<NodeRef<'a>> {
Some(self.with_id(self.node.next_sibling?))
pub fn next_sibling(&self) -> Option<NodeRef> {
let (parent, index) = self.parent_and_index()?;
let index = index.checked_add(1)?;
let sibling = parent.0.children.borrow().get(index).cloned();
sibling
}
/// The previous sibling node of this node.
///
/// Returns `None` if this is the first of its siblings.
pub fn prev_sibling(&self) -> Option<NodeRef<'a>> {
Some(self.with_id(self.node.prev_sibling?))
pub fn prev_sibling(&self) -> Option<NodeRef> {
let (parent, index) = self.parent_and_index()?;
let index = index.checked_sub(1)?;
let sibling = parent.0.children.borrow().get(index).cloned();
sibling
}
/// Whether this node has children.
pub fn has_children(&self) -> bool {
self.node.first_child.is_some()
!self.0.children.borrow().is_empty()
}
/// The first child node of this node.
///
/// Returns `None` if this node has no children.
pub fn first_child(&self) -> Option<NodeRef<'a>> {
Some(self.with_id(self.node.first_child?))
pub fn first_child(&self) -> Option<NodeRef> {
self.0.children.borrow().first().cloned()
}
/// The last child node of this node.
///
/// Returns `None` if this node has no children.
pub fn last_child(&self) -> Option<NodeRef<'a>> {
Some(self.with_id(self.node.last_child?))
pub fn last_child(&self) -> Option<NodeRef> {
self.0.children.borrow().last().cloned()
}
/// Get an iterator through the children of this node.
pub fn children(&self) -> Children<'a> {
pub fn children(&self) -> Children {
Children::new(self.first_child())
}
pub(crate) fn serialize<S>(&self, serializer: &mut S) -> io::Result<()>
where
S: Serializer,
{
match self.data() {
NodeData::Element(data) => {
serializer.start_elem(
data.name.clone(),
data.attrs.borrow().iter().map(|attr| (&attr.name, &*attr.value)),
)?;
for child in self.children() {
child.serialize(serializer)?;
}
serializer.end_elem(data.name.clone())?;
Ok(())
}
NodeData::Document => {
for child in self.children() {
child.serialize(serializer)?;
}
Ok(())
}
NodeData::Text(text) => serializer.write_text(&text.borrow()),
_ => Ok(()),
}
}
}
/// An iterator through the children of a node.
///
/// Can be constructed with [`Html::children()`] or [`NodeRef::children()`].
#[derive(Debug, Clone, Copy)]
pub struct Children<'a> {
next: Option<NodeRef<'a>>,
#[derive(Debug, Clone)]
pub struct Children {
next: Option<NodeRef>,
}
impl<'a> Children<'a> {
impl Children {
/// Construct a `Children` starting from the given node.
fn new(start_node: Option<NodeRef<'a>>) -> Self {
fn new(start_node: Option<NodeRef>) -> Self {
Self { next: start_node }
}
}
impl<'a> Iterator for Children<'a> {
type Item = NodeRef<'a>;
impl Iterator for Children {
type Item = NodeRef;
fn next(&mut self) -> Option<Self::Item> {
let next = self.next?;
let next = self.next.take()?;
self.next = next.next_sibling();
Some(next)
}
}
impl<'a> FusedIterator for Children<'a> {}
impl FusedIterator for Children {}
#[cfg(test)]
mod tests {

View File

@ -2,7 +2,7 @@ use html5ever::{tendril::StrTendril, Attribute, LocalName};
use phf::{phf_map, phf_set, Map, Set};
use wildmatch::WildMatch;
use crate::{ElementData, Html, HtmlSanitizerMode, NodeData, SanitizerConfig};
use crate::{ElementData, Html, HtmlSanitizerMode, NodeData, NodeRef, SanitizerConfig};
/// HTML elements allowed in the Matrix specification.
static ALLOWED_ELEMENTS_STRICT: Set<&str> = phf_set! {
@ -104,43 +104,41 @@ impl SanitizerConfig {
}
/// Clean the given HTML with this sanitizer.
pub(crate) fn clean(&self, html: &mut Html) {
let root = html.root();
let mut next_child = root.first_child;
while let Some(child) = next_child {
next_child = html.nodes[child].next_sibling;
self.clean_node(html, child, 0);
pub(crate) fn clean(&self, html: &Html) {
for child in html.children() {
self.clean_node(child, 0);
}
}
fn clean_node(&self, html: &mut Html, node_id: usize, depth: u32) {
self.apply_replacements(html, node_id);
fn clean_node(&self, node: NodeRef, depth: u32) {
let node = self.apply_replacements(node);
let action = self.node_action(html, node_id, depth);
let action = self.node_action(&node, depth);
if action != NodeAction::Remove {
let mut next_child = html.nodes[node_id].first_child;
while let Some(child) = next_child {
next_child = html.nodes[child].next_sibling;
for child in node.children() {
if action == NodeAction::Ignore {
html.insert_before(node_id, child);
child.insert_before_sibling(&node);
}
self.clean_node(html, child, depth + 1);
self.clean_node(child, depth + 1);
}
}
if matches!(action, NodeAction::Ignore | NodeAction::Remove) {
html.detach(node_id);
} else if let Some(data) = html.nodes[node_id].as_element_mut() {
node.detach();
} else if let Some(data) = node.as_element() {
self.clean_element_attributes(data);
}
}
fn apply_replacements(&self, html: &mut Html, node_id: usize) {
if let NodeData::Element(ElementData { name, attrs, .. }) = &mut html.nodes[node_id].data {
/// Apply the attributes and element name replacements to the given node.
///
/// This might return a different node than the one provided.
fn apply_replacements(&self, node: NodeRef) -> NodeRef {
let mut element_replacement = None;
if let NodeData::Element(ElementData { name, attrs, .. }) = node.data() {
let element_name = name.local.as_ref();
// Replace attributes.
@ -153,6 +151,7 @@ impl SanitizerConfig {
.flatten();
if list_replacements.is_some() || mode_replacements.is_some() {
let mut attrs = attrs.borrow_mut();
*attrs = attrs
.clone()
.into_iter()
@ -174,7 +173,7 @@ impl SanitizerConfig {
}
// Replace element.
let mut element_replacement = self
element_replacement = self
.replace_elements
.as_ref()
.and_then(|list| list.content.get(element_name))
@ -191,17 +190,20 @@ impl SanitizerConfig {
.flatten()
.copied();
}
}
if let Some(element_replacement) = element_replacement {
name.local = LocalName::from(element_replacement);
}
node.replace_with_element_name(LocalName::from(element_replacement))
} else {
node
}
}
fn node_action(&self, html: &Html, node_id: usize, depth: u32) -> NodeAction {
match &html.nodes[node_id].data {
fn node_action(&self, node: &NodeRef, depth: u32) -> NodeAction {
match node.data() {
NodeData::Element(ElementData { name, attrs, .. }) => {
let element_name = name.local.as_ref();
let attrs = attrs.borrow();
// Check if element should be removed.
if self.remove_elements.as_ref().is_some_and(|set| set.contains(element_name)) {
@ -321,9 +323,10 @@ impl SanitizerConfig {
}
}
fn clean_element_attributes(&self, data: &mut ElementData) {
fn clean_element_attributes(&self, data: &ElementData) {
let ElementData { name, attrs } = data;
let element_name = name.local.as_ref();
let mut attrs = attrs.borrow_mut();
let list_remove_attrs = self.remove_attrs.as_ref().and_then(|map| map.get(element_name));

View File

@ -21,7 +21,7 @@ fn navigate_tree() {
let h1_element = h1_node.as_element().unwrap();
assert_eq!(&h1_element.name.local, "h1");
assert!(h1_element.attrs.is_empty());
assert!(h1_element.attrs.borrow().is_empty());
assert!(h1_node.parent().is_none());
assert!(h1_node.next_sibling().is_some());
@ -35,7 +35,7 @@ fn navigate_tree() {
// Text of `<h1>` element.
let h1_text_node = h1_children.next().unwrap();
let h1_text = h1_text_node.as_text().unwrap();
assert_eq!(h1_text.as_ref(), "Title");
assert_eq!(h1_text.borrow().as_ref(), "Title");
assert!(h1_text_node.parent().is_some());
assert!(h1_text_node.next_sibling().is_none());
@ -54,8 +54,9 @@ fn navigate_tree() {
let div_element = div_node.as_element().unwrap();
assert_eq!(&div_element.name.local, "div");
assert_eq!(div_element.attrs.len(), 1);
let class_attr = div_element.attrs.first().unwrap();
let attrs = div_element.attrs.borrow();
assert_eq!(attrs.len(), 1);
let class_attr = attrs.first().unwrap();
assert_eq!(&class_attr.name.local, "class");
assert_eq!(class_attr.value.as_ref(), "text");
@ -73,7 +74,7 @@ fn navigate_tree() {
let p_element = p_node.as_element().unwrap();
assert_eq!(&p_element.name.local, "p");
assert!(p_element.attrs.is_empty());
assert!(p_element.attrs.borrow().is_empty());
assert!(p_node.parent().is_some());
assert!(p_node.next_sibling().is_none());
@ -87,7 +88,7 @@ fn navigate_tree() {
// Text of `<p>` element.
let p_text_node = p_children.next().unwrap();
let p_text = p_text_node.as_text().unwrap();
assert_eq!(p_text.as_ref(), "This is some ");
assert_eq!(p_text.borrow().as_ref(), "This is some ");
assert!(p_text_node.parent().is_some());
assert!(p_text_node.next_sibling().is_some());
@ -104,7 +105,7 @@ fn navigate_tree() {
let em_element = em_node.as_element().unwrap();
assert_eq!(&em_element.name.local, "em");
assert!(em_element.attrs.is_empty());
assert!(em_element.attrs.borrow().is_empty());
assert!(em_node.parent().is_some());
assert!(em_node.next_sibling().is_none());
@ -118,7 +119,7 @@ fn navigate_tree() {
// Text of `<em>` element.
let em_text_node = em_children.next().unwrap();
let em_text = em_text_node.as_text().unwrap();
assert_eq!(em_text.as_ref(), "text");
assert_eq!(em_text.borrow().as_ref(), "text");
assert!(em_text_node.parent().is_some());
assert!(em_text_node.next_sibling().is_none());

View File

@ -6,7 +6,7 @@ use ruma_html::{
#[test]
fn strict_mode_valid_input() {
let config = SanitizerConfig::strict().remove_reply_fallback();
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -30,7 +30,7 @@ fn strict_mode_valid_input() {
#[test]
fn strict_mode_elements_remove() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
let html = Html::parse(
"\
<mx-reply>\
<blockquote>\
@ -66,7 +66,7 @@ fn strict_mode_elements_remove() {
#[test]
fn strict_mode_elements_reply_remove() {
let config = SanitizerConfig::strict().remove_reply_fallback();
let mut html = Html::parse(
let html = Html::parse(
"\
<mx-reply>\
<blockquote>\
@ -94,7 +94,7 @@ fn strict_mode_elements_reply_remove() {
#[test]
fn remove_only_reply_fallback() {
let config = SanitizerConfig::new().remove_reply_fallback();
let mut html = Html::parse(
let html = Html::parse(
"\
<mx-reply>\
<blockquote>\
@ -122,7 +122,7 @@ fn remove_only_reply_fallback() {
#[test]
fn strict_mode_attrs_remove() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
let html = Html::parse(
"\
<h1 id=\"anchor1\">Title for important stuff</h1>\
<p class=\"important\">Look at <span data-mx-color=\"#0000ff\" size=20>me!</span></p>\
@ -142,7 +142,7 @@ fn strict_mode_attrs_remove() {
#[test]
fn strict_mode_img_remove_scheme() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
let html = Html::parse(
"\
<p>Look at that picture:</p>\
<img src=\"https://notareal.hs/abcdef\">\
@ -156,7 +156,7 @@ fn strict_mode_img_remove_scheme() {
#[test]
fn strict_mode_link_remove_scheme() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
let html = Html::parse(
"\
<p>Go see <a href=\"file://local/file.html\">my local website</a></p>\
",
@ -174,7 +174,7 @@ fn strict_mode_link_remove_scheme() {
#[test]
fn compat_mode_link_remove_scheme() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
let html = Html::parse(
"\
<p>Join <a href=\"matrix:r/myroom:notareal.hs\">my room</a></p>\
<p>To talk about <a href=\"https://mycat.org\">my cat</a></p>\
@ -190,7 +190,7 @@ fn compat_mode_link_remove_scheme() {
);
let config = SanitizerConfig::compat();
let mut html = Html::parse(
let html = Html::parse(
"\
<p>Join <a href=\"matrix:r/myroom:notareal.hs\">my room</a></p>\
<p>To talk about <a href=\"https://mycat.org\">my cat</a></p>\
@ -209,7 +209,7 @@ fn compat_mode_link_remove_scheme() {
#[test]
fn strict_mode_class_remove() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
let html = Html::parse(
"\
<pre><code class=\"language-rust custom-class\">
type StringList = Vec&lt;String&gt;;
@ -242,7 +242,7 @@ fn strict_mode_depth_remove() {
.chain(std::iter::repeat("</div>").take(100))
.collect();
let mut html = Html::parse(&deeply_nested_html);
let html = Html::parse(&deeply_nested_html);
html.sanitize_with(&config);
let res = html.to_string();
@ -253,7 +253,7 @@ fn strict_mode_depth_remove() {
#[test]
fn strict_mode_replace_deprecated() {
let config = SanitizerConfig::strict();
let mut html = Html::parse(
let html = Html::parse(
"\
<p>Look at <strike>you </strike><font data-mx-bg-color=\"#ff0000\" color=\"#0000ff\">me!</span></p>\
",
@ -271,7 +271,7 @@ fn strict_mode_replace_deprecated() {
#[test]
fn allow_elements() {
let config = SanitizerConfig::new().allow_elements(["ul", "li", "p", "img"], ListBehavior::Add);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -296,7 +296,7 @@ fn allow_elements() {
fn override_elements() {
let config =
SanitizerConfig::strict().allow_elements(["ul", "li", "p", "img"], ListBehavior::Override);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -320,7 +320,7 @@ fn override_elements() {
#[test]
fn add_elements() {
let config = SanitizerConfig::strict().allow_elements(["keep-me"], ListBehavior::Add);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -346,7 +346,7 @@ fn add_elements() {
#[test]
fn remove_elements() {
let config = SanitizerConfig::strict().remove_elements(["span", "code"]);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -369,7 +369,7 @@ fn remove_elements() {
#[test]
fn ignore_elements() {
let config = SanitizerConfig::new().ignore_elements(["span", "code"]);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -394,7 +394,7 @@ fn ignore_elements() {
fn replace_elements() {
let config = SanitizerConfig::new()
.replace_elements([NameReplacement { old: "ul", new: "ol" }], ListBehavior::Add);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -419,7 +419,7 @@ fn replace_elements() {
fn replace_elements_override() {
let config = SanitizerConfig::strict()
.replace_elements([NameReplacement { old: "ul", new: "ol" }], ListBehavior::Override);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -446,7 +446,7 @@ fn replace_elements_override() {
fn replace_elements_add() {
let config = SanitizerConfig::strict()
.replace_elements([NameReplacement { old: "ul", new: "ol" }], ListBehavior::Add);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -475,7 +475,7 @@ fn allow_attributes() {
[PropertiesNames { parent: "img", properties: &["src"] }],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -502,7 +502,7 @@ fn override_attributes() {
[PropertiesNames { parent: "img", properties: &["src"] }],
ListBehavior::Override,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -529,7 +529,7 @@ fn add_attributes() {
[PropertiesNames { parent: "img", properties: &["id"] }],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -554,7 +554,7 @@ fn add_attributes() {
fn remove_attributes() {
let config = SanitizerConfig::strict()
.remove_attributes([PropertiesNames { parent: "span", properties: &["data-mx-color"] }]);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -584,7 +584,7 @@ fn replace_attributes() {
}],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <span data-mx-color=\"green\">with some color</span></p>\
@ -614,7 +614,7 @@ fn replace_attributes_override() {
}],
ListBehavior::Override,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <font color=\"green\">with some color</font></p>\
@ -644,7 +644,7 @@ fn replace_attributes_add() {
}],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<ul><li>This</li><li>has</li><li>no</li><li>tag</li></ul>\
<p>This is a paragraph <font color=\"green\">with some color</font></p>\
@ -674,7 +674,7 @@ fn allow_schemes() {
}],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<img src=\"mxc://notareal.hs/abcdef\">\
<img src=\"https://notareal.hs/abcdef.png\">\
@ -699,7 +699,7 @@ fn override_schemes() {
}],
ListBehavior::Override,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<img src=\"mxc://notareal.hs/abcdef\">\
<img src=\"https://notareal.hs/abcdef.png\">\
@ -724,7 +724,7 @@ fn add_schemes() {
}],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<img src=\"mxc://notareal.hs/abcdef\">\
<img src=\"https://notareal.hs/abcdef.png\">\
@ -747,7 +747,7 @@ fn deny_schemes() {
element: "a",
attr_schemes: &[PropertiesNames { parent: "href", properties: &["http"] }],
}]);
let mut html = Html::parse(
let html = Html::parse(
"\
<a href=\"https://notareal.hs/abcdef.png\">Secure link to an image</a>\
<a href=\"http://notareal.hs/abcdef.png\">Insecure link to an image</a>\
@ -770,7 +770,7 @@ fn allow_classes() {
[PropertiesNames { parent: "img", properties: &["custom-class", "custom-class-*"] }],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<code class=\"language-html\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
<img class=\"custom-class custom-class-img img\" src=\"mxc://notareal.hs/abcdef\">\
@ -793,7 +793,7 @@ fn override_classes() {
[PropertiesNames { parent: "code", properties: &["custom-class", "custom-class-*"] }],
ListBehavior::Override,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<code class=\"language-html custom-class custom-class-code code\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
",
@ -814,7 +814,7 @@ fn add_classes() {
[PropertiesNames { parent: "code", properties: &["custom-class", "custom-class-*"] }],
ListBehavior::Add,
);
let mut html = Html::parse(
let html = Html::parse(
"\
<code class=\"language-html custom-class custom-class-code code\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
",
@ -833,7 +833,7 @@ fn add_classes() {
fn remove_classes() {
let config = SanitizerConfig::strict()
.remove_classes([PropertiesNames { parent: "code", properties: &["language-rust"] }]);
let mut html = Html::parse(
let html = Html::parse(
"\
<code class=\"language-html language-rust\">&lt;mx-reply&gt;This is a fake reply&lt;/mx-reply&gt;</code>\
",