html: Add support for deprecated HTML tags

According to Matrix 1.10
This commit is contained in:
Kévin Commaille 2024-03-24 11:30:18 +01:00 committed by Kévin Commaille
parent 5a0faa81b1
commit da1df75619
3 changed files with 95 additions and 8 deletions

View File

@ -1,5 +1,9 @@
# [unreleased]
Improvements:
- Add support for deprecated HTML tags, according to Matrix 1.10
# 0.1.0
Initial release

View File

@ -1,4 +1,4 @@
use html5ever::{tendril::StrTendril, Attribute};
use html5ever::{tendril::StrTendril, Attribute, LocalName};
use phf::{phf_map, phf_set, Map, Set};
use wildmatch::WildMatch;
@ -12,11 +12,22 @@ pub struct SanitizerConfig {
/// If this is `None`, all tags are allowed.
allowed_tags: Option<&'static Set<&'static str>>,
/// The allowed deprecated HTML tags.
///
/// This is a map of allowed deprecated tags to their replacement tag.
deprecated_tags: Option<&'static Map<&'static str, &'static str>>,
/// The allowed attributes per tag.
///
/// If this is `None`, all attributes are allowed.
allowed_attrs: Option<&'static Map<&'static str, &'static Set<&'static str>>>,
/// The allowed deprecated attributes per tag.
///
/// This is a map of tags to a map of allowed deprecated attributes to their replacement
/// attribute.
deprecated_attrs: Option<&'static Map<&'static str, &'static Map<&'static str, &'static str>>>,
/// The allowed URI schemes per tag.
///
/// If this is `None`, all schemes are allowed.
@ -43,13 +54,17 @@ impl SanitizerConfig {
/// Constructs a `SanitizerConfig` that will filter tags or attributes not [listed in the
/// Matrix specification].
///
/// Deprecated tags and attributes will be replaced with their non-deprecated equivalents.
///
/// It will not remove the reply fallback by default.
///
/// [listed in the Matrix specification]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
pub fn strict() -> Self {
Self {
allowed_tags: Some(&ALLOWED_TAGS_WITHOUT_REPLY_STRICT),
deprecated_tags: Some(&DEPRECATED_TAGS),
allowed_attrs: Some(&ALLOWED_ATTRIBUTES_STRICT),
deprecated_attrs: Some(&DEPRECATED_ATTRS),
allowed_schemes: Some(&ALLOWED_SCHEMES_STRICT),
allowed_classes: Some(&ALLOWED_CLASSES_STRICT),
max_depth: Some(MAX_DEPTH_STRICT),
@ -62,6 +77,8 @@ impl SanitizerConfig {
///
/// - The `matrix` scheme is allowed in links.
///
/// Deprecated tags will be replaced with their non-deprecated equivalent.
///
/// It will not remove the reply fallback by default.
///
/// [listed in the Matrix specification]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
@ -89,6 +106,8 @@ impl SanitizerConfig {
}
fn clean_node(&self, html: &mut Html, node_id: usize, depth: u32) {
self.apply_deprecations(html, node_id);
let action = self.node_action(html, node_id, depth);
if action != NodeAction::Remove {
@ -111,6 +130,42 @@ impl SanitizerConfig {
}
}
/// Rewrites deprecated names on the element at `node_id` to their replacements.
///
/// If the node is an element, the names of its attributes that are listed for its tag in
/// `deprecated_attrs` are renamed first, then the tag name itself is renamed if it is listed
/// in `deprecated_tags`. Attribute values are left untouched.
///
/// Nodes that are not elements are ignored, and a map that is `None` disables the
/// corresponding replacement.
fn apply_deprecations(&self, html: &mut Html, node_id: usize) {
    if let NodeData::Element(ElementData { name, attrs, .. }) = &mut html.nodes[node_id].data {
        let tag: &str = &name.local;

        // The attribute lookup is keyed on the original (possibly deprecated) tag name, so it
        // has to happen before the tag is renamed below.
        if let Some(deprecated_attrs) =
            self.deprecated_attrs.and_then(|deprecated_attrs| deprecated_attrs.get(tag))
        {
            // Move the attributes out instead of cloning them: the rebuilt collection is
            // assigned back immediately, so the original content is never needed again.
            *attrs = std::mem::take(attrs)
                .into_iter()
                .map(|mut attr| {
                    let attr_name: &str = &attr.name.local;
                    let attr_replacement =
                        deprecated_attrs.get(attr_name).map(|s| LocalName::from(*s));

                    if let Some(attr_replacement) = attr_replacement {
                        attr.name.local = attr_replacement;
                    }

                    attr
                })
                .collect();
        }

        // Build the replacement name first so the borrow of `name` through `tag` has ended
        // before `name.local` is overwritten.
        let tag_replacement = self
            .deprecated_tags
            .and_then(|deprecated_tags| deprecated_tags.get(tag))
            .map(|s| LocalName::from(*s));

        if let Some(tag_replacement) = tag_replacement {
            name.local = tag_replacement;
        }
    }
}
fn node_action(&self, html: &Html, node_id: usize, depth: u32) -> NodeAction {
match &html.nodes[node_id].data {
NodeData::Element(ElementData { name, attrs, .. }) => {
@ -247,8 +302,8 @@ enum AttributeAction {
/// List of HTML tags allowed in the Matrix specification, without the rich reply fallback tag.
///
/// `SanitizerConfig::strict()` uses this set as its `allowed_tags`.
static ALLOWED_TAGS_WITHOUT_REPLY_STRICT: Set<&str> = phf_set! {
"font", "del", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "p", "a",
"ul", "ol", "sup", "sub", "li", "b", "i", "u", "strong", "em", "strike",
"del", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "p", "a",
"ul", "ol", "sup", "sub", "li", "b", "i", "u", "strong", "em", "s",
"code", "hr", "br", "div", "table", "thead", "tbody", "tr", "th", "td",
"caption", "pre", "span", "img", "details", "summary",
};
@ -256,17 +311,20 @@ static ALLOWED_TAGS_WITHOUT_REPLY_STRICT: Set<&str> = phf_set! {
/// The HTML tag name for a rich reply fallback (`<mx-reply>`).
const RICH_REPLY_TAG: &str = "mx-reply";
/// HTML tags that were allowed in the Matrix specification, with their replacement.
///
/// Keys are the deprecated tag names and values are the tag names that replace them.
/// `SanitizerConfig::strict()` uses this map as its `deprecated_tags`.
static DEPRECATED_TAGS: Map<&str, &str> = phf_map! {
"font" => "span",
"strike" => "s",
};
/// Allowed attributes per HTML tag according to the Matrix specification.
///
/// Keys are tag names and values are the sets of attribute names allowed on that tag.
/// `SanitizerConfig::strict()` uses this map as its `allowed_attrs`.
static ALLOWED_ATTRIBUTES_STRICT: Map<&str, &Set<&str>> = phf_map! {
"font" => &ALLOWED_ATTRIBUTES_FONT_STRICT,
"span" => &ALLOWED_ATTRIBUTES_SPAN_STRICT,
"a" => &ALLOWED_ATTRIBUTES_A_STRICT,
"img" => &ALLOWED_ATTRIBUTES_IMG_STRICT,
"ol" => &ALLOWED_ATTRIBUTES_OL_STRICT,
"code" => &ALLOWED_ATTRIBUTES_CODE_STRICT,
};
/// Attributes allowed on `font` elements.
static ALLOWED_ATTRIBUTES_FONT_STRICT: Set<&str> =
phf_set! { "data-mx-bg-color", "data-mx-color", "color" };
/// Attributes allowed on `span` elements.
static ALLOWED_ATTRIBUTES_SPAN_STRICT: Set<&str> =
phf_set! { "data-mx-bg-color", "data-mx-color", "data-mx-spoiler" };
/// Attributes allowed on `a` elements.
static ALLOWED_ATTRIBUTES_A_STRICT: Set<&str> = phf_set! { "name", "target", "href" };
@ -275,6 +333,13 @@ static ALLOWED_ATTRIBUTES_IMG_STRICT: Set<&str> =
/// Attributes allowed on `ol` elements.
static ALLOWED_ATTRIBUTES_OL_STRICT: Set<&str> = phf_set! { "start" };
/// Attributes allowed on `code` elements.
static ALLOWED_ATTRIBUTES_CODE_STRICT: Set<&str> = phf_set! { "class" };
/// Attributes that were allowed on HTML tags according to the Matrix specification, with their
/// replacement.
///
/// Keys are tag names and values map each deprecated attribute name of that tag to the
/// attribute name that replaces it. `SanitizerConfig::strict()` uses this map as its
/// `deprecated_attrs`.
static DEPRECATED_ATTRS: Map<&str, &Map<&str, &str>> = phf_map! {
"font" => &DEPRECATED_ATTRIBUTES_FONT,
};
/// Deprecated attributes of the `font` tag, mapped to the attribute that replaces them.
static DEPRECATED_ATTRIBUTES_FONT: Map<&str, &str> = phf_map! { "color" => "data-mx-color" };
/// Allowed schemes of URIs per HTML tag and attribute tuple according to the Matrix specification.
static ALLOWED_SCHEMES_STRICT: Map<&str, &Set<&str>> = phf_map! {
"a:href" => &ALLOWED_SCHEMES_A_HREF_STRICT,

View File

@ -122,7 +122,7 @@ fn attrs_remove() {
let mut html = Html::parse(
"\
<h1 id=\"anchor1\">Title for important stuff</h1>\
<p class=\"important\">Look at <font color=\"blue\" size=20>me!</font></p>\
<p class=\"important\">Look at <span data-mx-color=\"#0000ff\" size=20>me!</span></p>\
",
);
html.sanitize_with(config);
@ -131,7 +131,7 @@ fn attrs_remove() {
html.to_string(),
"\
<h1>Title for important stuff</h1>\
<p>Look at <font color=\"blue\">me!</font></p>\
<p>Look at <span data-mx-color=\"#0000ff\">me!</span></p>\
"
);
}
@ -246,3 +246,21 @@ fn depth_remove() {
assert!(res.contains("I should be fine."));
assert!(!res.contains("I am in too deep!"));
}
/// Checks that the strict configuration renames deprecated tags (`strike`, `font`) and the
/// deprecated `color` attribute of `font` to their replacements.
#[test]
fn replace_deprecated() {
    let config = SanitizerConfig::strict();
    // The `font` element is closed with a matching `</font>` so that the test exercises the
    // replacement logic rather than the parser's recovery from a mismatched end tag.
    let mut html = Html::parse(
        "\
        <p>Look at <strike>you </strike><font data-mx-bg-color=\"#ff0000\" color=\"#0000ff\">me!</font></p>\
        ",
    );
    html.sanitize_with(config);

    assert_eq!(
        html.to_string(),
        "\
        <p>Look at <s>you </s><span data-mx-bg-color=\"#ff0000\" data-mx-color=\"#0000ff\">me!</span></p>\
        "
    );
}