html: Add HTML data types for elements and attributes suggested by Matrix Spec

This commit is contained in:
Kévin Commaille 2024-04-28 15:24:17 +02:00 committed by Kévin Commaille
parent 18244143ca
commit e161a57eda
10 changed files with 1084 additions and 4 deletions

View File

@ -11,6 +11,9 @@ Improvements:
- Add support for deprecated HTML tags, according to Matrix 1.10
- Allow navigating through the HTML tree with `Html::first_child()`,
`Html::last_child()` or `Html::children()`
- Add `ElementData::to_matrix` to convert it to a type using enums for HTML
elements and attributes suggested by the Matrix Specification, behind the
`matrix` cargo feature.
# 0.1.0

View File

@ -14,9 +14,16 @@ rust-version = { workspace = true }
all-features = true
rustdoc-args = ["--cfg", "docsrs"]
[features]
matrix = ["dep:ruma-common"]
[dependencies]
as_variant = { workspace = true }
html5ever = "0.27.0"
phf = { version = "0.11.1", features = ["macros"] }
ruma-common = { workspace = true, optional = true }
tracing = { workspace = true, features = ["attributes"] }
wildmatch = "2.0.0"
[dev-dependencies]
assert_matches2 = { workspace = true }

View File

@ -10,6 +10,9 @@ use html5ever::{
};
use tracing::debug;
#[cfg(feature = "matrix")]
pub mod matrix;
use crate::SanitizerConfig;
/// An HTML fragment.
@ -431,6 +434,16 @@ pub struct ElementData {
pub attrs: BTreeSet<Attribute>,
}
impl ElementData {
/// Convert this element data to typed data as [suggested by the Matrix Specification][spec].
///
/// The qualified name and the attributes of this element are handed to
/// `MatrixElementData::parse`, and the typed data it builds is returned. This element data is
/// only borrowed.
///
/// This method is only compiled when the `matrix` cargo feature is enabled.
///
/// [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[cfg(feature = "matrix")]
pub fn to_matrix(&self) -> matrix::MatrixElementData {
matrix::MatrixElementData::parse(&self.name, &self.attrs)
}
}
/// A reference to an HTML node.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]

View File

@ -0,0 +1,721 @@
//! Types to work with HTML elements and attributes [suggested by the Matrix Specification][spec].
//!
//! [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
use std::collections::BTreeSet;
use html5ever::{namespace_url, ns, tendril::StrTendril, Attribute, QualName};
use ruma_common::{
IdParseError, MatrixToError, MatrixToUri, MatrixUri, MatrixUriError, MxcUri, OwnedMxcUri,
};
use crate::sanitizer_config::ALLOWED_SCHEMES_A_HREF_COMPAT;
const CLASS_LANGUAGE_PREFIX: &str = "language-";
/// The typed data of an HTML element, as seen through the Matrix Specification.
///
/// This helper type makes it easier to work with the elements
/// [suggested by the Matrix Specification][spec].
///
/// It is built from an [`ElementData`]. An element that is not in the suggested list is
/// represented by [`MatrixElement::Other`], and every attribute that was not converted to typed
/// data is kept in the `attrs` field.
///
/// [`ElementData`]: crate::ElementData
/// [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_structs)]
pub struct MatrixElementData {
    /// The HTML element, along with the data of its supported attributes.
    pub element: MatrixElement,

    /// The attributes of the element that are not supported.
    pub attrs: BTreeSet<Attribute>,
}

impl MatrixElementData {
    /// Build a `MatrixElementData` from the qualified name and the attributes of an element.
    #[allow(clippy::mutable_key_type)]
    pub(super) fn parse(name: &QualName, attrs: &BTreeSet<Attribute>) -> Self {
        let parsed = MatrixElement::parse(name, attrs);

        Self { element: parsed.0, attrs: parsed.1 }
    }
}
/// A Matrix HTML element.
///
/// All the elements [suggested by the Matrix Specification][spec] have a variant. The others are
/// handled by the fallback `Other` variant, which keeps the qualified name of the element.
///
/// Suggested attributes are represented as optional fields on the data structs held by the
/// variants of the elements that support attributes.
///
/// [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum MatrixElement {
/// [`<del>`], a deleted text element.
///
/// [`<del>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/del
Del,
/// [`<h1>-<h6>`], a section heading element.
///
/// The level of the heading is available in the associated [`HeadingData`].
///
/// [`<h1>-<h6>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/Heading_Elements
H(HeadingData),
/// [`<blockquote>`], a block quotation element.
///
/// [`<blockquote>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/blockquote
Blockquote,
/// [`<p>`], a paragraph element.
///
/// [`<p>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/p
P,
/// [`<a>`], an anchor element.
///
/// [`<a>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a
A(AnchorData),
/// [`<ul>`], an unordered list element.
///
/// [`<ul>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ul
Ul,
/// [`<ol>`], an ordered list element.
///
/// [`<ol>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ol
Ol(OrderedListData),
/// [`<sup>`], a superscript element.
///
/// [`<sup>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/sup
Sup,
/// [`<sub>`], a subscript element.
///
/// [`<sub>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/sub
Sub,
/// [`<li>`], a list item element.
///
/// [`<li>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/li
Li,
/// [`<b>`], a bring attention to element.
///
/// [`<b>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/b
B,
/// [`<i>`], an idiomatic text element.
///
/// [`<i>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/i
I,
/// [`<u>`], an unarticulated annotation element.
///
/// [`<u>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/u
U,
/// [`<strong>`], a strong importance element.
///
/// [`<strong>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/strong
Strong,
/// [`<em>`], an emphasis element.
///
/// [`<em>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/em
Em,
/// [`<s>`], a strikethrough element.
///
/// [`<s>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/s
S,
/// [`<code>`], an inline code element.
///
/// [`<code>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/code
Code(CodeData),
/// [`<hr>`], a thematic break element.
///
/// [`<hr>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/hr
Hr,
/// [`<br>`], a line break element.
///
/// [`<br>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/br
Br,
/// [`<div>`], a content division element.
///
/// [`<div>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/div
Div,
/// [`<table>`], a table element.
///
/// [`<table>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/table
Table,
/// [`<thead>`], a table head element.
///
/// [`<thead>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/thead
Thead,
/// [`<tbody>`], a table body element.
///
/// [`<tbody>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/tbody
Tbody,
/// [`<tr>`], a table row element.
///
/// [`<tr>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/tr
Tr,
/// [`<th>`], a table header element.
///
/// [`<th>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th
Th,
/// [`<td>`], a table data cell element.
///
/// [`<td>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td
Td,
/// [`<caption>`], a table caption element.
///
/// [`<caption>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/caption
Caption,
/// [`<pre>`], a preformatted text element.
///
/// [`<pre>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/pre
Pre,
/// [`<span>`], a content span element.
///
/// [`<span>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/span
Span(SpanData),
/// [`<img>`], an image embed element.
///
/// [`<img>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img
Img(ImageData),
/// [`<details>`], a details disclosure element.
///
/// [`<details>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/details
Details,
/// [`<summary>`], a disclosure summary element.
///
/// [`<summary>`]: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/summary
Summary,
/// [`<mx-reply>`], a Matrix rich reply fallback element.
///
/// [`<mx-reply>`]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
MatrixReply,
/// An HTML element that is not in the suggested list.
///
/// The qualified name of the element is kept as the data of this variant.
Other(QualName),
}
impl MatrixElement {
    /// Build a `MatrixElement` from the qualified name and the attributes of an HTML element.
    ///
    /// Elements outside of the HTML namespace and elements that are not in the suggested list
    /// become [`MatrixElement::Other`] and keep all their attributes.
    ///
    /// Returns a tuple containing the constructed `MatrixElement` and the attributes that were
    /// not converted to typed data.
    #[allow(clippy::mutable_key_type)]
    fn parse(name: &QualName, attrs: &BTreeSet<Attribute>) -> (Self, BTreeSet<Attribute>) {
        if name.ns != ns!(html) {
            return (Self::Other(name.clone()), attrs.clone());
        }

        let element = match name.local.as_bytes() {
            // Elements with supported attributes delegate to their data type, which also
            // decides which attributes are left over, so they return right away.
            b"a" => {
                let (anchor, rest) = AnchorData::parse(attrs);
                return (Self::A(anchor), rest);
            }
            b"ol" => {
                let (list, rest) = OrderedListData::parse(attrs);
                return (Self::Ol(list), rest);
            }
            b"code" => {
                let (code, rest) = CodeData::parse(attrs);
                return (Self::Code(code), rest);
            }
            b"span" => {
                let (span, rest) = SpanData::parse(attrs);
                return (Self::Span(span), rest);
            }
            b"img" => {
                let (image, rest) = ImageData::parse(attrs);
                return (Self::Img(image), rest);
            }

            // Elements without supported attributes keep all their attributes.
            b"del" => Self::Del,
            b"h1" => Self::H(HeadingData::new(1)),
            b"h2" => Self::H(HeadingData::new(2)),
            b"h3" => Self::H(HeadingData::new(3)),
            b"h4" => Self::H(HeadingData::new(4)),
            b"h5" => Self::H(HeadingData::new(5)),
            b"h6" => Self::H(HeadingData::new(6)),
            b"blockquote" => Self::Blockquote,
            b"p" => Self::P,
            b"ul" => Self::Ul,
            b"sup" => Self::Sup,
            b"sub" => Self::Sub,
            b"li" => Self::Li,
            b"b" => Self::B,
            b"i" => Self::I,
            b"u" => Self::U,
            b"strong" => Self::Strong,
            b"em" => Self::Em,
            b"s" => Self::S,
            b"hr" => Self::Hr,
            b"br" => Self::Br,
            b"div" => Self::Div,
            b"table" => Self::Table,
            b"thead" => Self::Thead,
            b"tbody" => Self::Tbody,
            b"tr" => Self::Tr,
            b"th" => Self::Th,
            b"td" => Self::Td,
            b"caption" => Self::Caption,
            b"pre" => Self::Pre,
            b"details" => Self::Details,
            b"summary" => Self::Summary,
            b"mx-reply" => Self::MatrixReply,
            _ => Self::Other(name.clone()),
        };

        (element, attrs.clone())
    }
}
/// The supported data of a `<h1>-<h6>` HTML element.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct HeadingData {
    /// The level of the heading.
    pub level: HeadingLevel,
}

impl HeadingData {
    /// Construct a `HeadingData` for a heading of the given level.
    fn new(level: u8) -> Self {
        let level = HeadingLevel(level);

        Self { level }
    }
}
/// The level of a heading element.
///
/// Only the levels from 1 (highest) to 6 (lowest) are supported. Other levels cannot construct
/// this and do not use the [`MatrixElement::H`] variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HeadingLevel(u8);

impl HeadingLevel {
    /// The numeric value of the level.
    ///
    /// Can only be a value between 1 and 6 included.
    pub fn value(&self) -> u8 {
        let Self(level) = *self;
        level
    }
}

impl PartialEq<u8> for HeadingLevel {
    /// Compare the numeric value of the level with the given integer.
    fn eq(&self, other: &u8) -> bool {
        self.value() == *other
    }
}
/// The supported data of a `<a>` HTML element.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct AnchorData {
    /// The name of the anchor.
    pub name: Option<StrTendril>,

    /// Where to display the linked URL.
    pub target: Option<StrTendril>,

    /// The URL that the hyperlink points to.
    ///
    /// If the value of the `href` attribute is not accepted as an [`AnchorUri`], this is `None`
    /// and the attribute is in the `attrs` list of [`MatrixElementData`].
    pub href: Option<AnchorUri>,
}

impl AnchorData {
    /// Construct an `AnchorData` with no attribute set.
    fn new() -> Self {
        Self { name: None, target: None, href: None }
    }

    /// Build an `AnchorData` from the given attributes.
    ///
    /// Returns a tuple containing the constructed data and the attributes that were not
    /// converted, i.e. namespaced attributes, unknown attributes and a rejected `href`.
    #[allow(clippy::mutable_key_type)]
    fn parse(attrs: &BTreeSet<Attribute>) -> (Self, BTreeSet<Attribute>) {
        let mut anchor = Self::new();
        let mut unsupported = BTreeSet::new();

        for attr in attrs {
            // Only attributes without a namespace can be supported.
            let consumed = attr.name.ns == ns!()
                && match attr.name.local.as_bytes() {
                    b"name" => {
                        anchor.name = Some(attr.value.clone());
                        true
                    }
                    b"target" => {
                        anchor.target = Some(attr.value.clone());
                        true
                    }
                    b"href" => match AnchorUri::parse(&attr.value) {
                        Some(uri) => {
                            anchor.href = Some(uri);
                            true
                        }
                        None => false,
                    },
                    _ => false,
                };

            if !consumed {
                unsupported.insert(attr.clone());
            }
        }

        (anchor, unsupported)
    }
}
/// A URI as a value for the `href` attribute of a `<a>` HTML element.
///
/// This is a helper type that recognizes `matrix:` and `https://matrix.to` URIs to detect mentions.
///
/// If the URI is an invalid Matrix URI or does not use one of the suggested schemes, the `href`
/// attribute will be in the `attrs` list of [`MatrixElementData`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum AnchorUri {
/// A `matrix:` URI.
Matrix(MatrixUri),
/// A `https://matrix.to` URI.
MatrixTo(MatrixToUri),
/// Another URL using one of the suggested schemes.
///
/// The value is kept as it appears in the `href` attribute.
///
/// Those schemes are:
///
/// * `https`
/// * `http`
/// * `ftp`
/// * `mailto`
/// * `magnet`
Other(StrTendril),
}
impl AnchorUri {
    /// Parse the given string to construct a new `AnchorUri`.
    ///
    /// Returns `None` if the string does not start with one of the allowed schemes followed by
    /// `:`, or if it is recognized as a `matrix:` or `https://matrix.to` URI but fails to parse
    /// as such.
    fn parse(value: &StrTendril) -> Option<Self> {
        let s: &str = value.as_ref();

        // Check if it starts with a supported scheme. The scheme is everything before the first
        // `:`, so it can be looked up directly in the set instead of formatting a new `String`
        // for every allowed scheme.
        let (scheme, _) = s.split_once(':')?;
        if !ALLOWED_SCHEMES_A_HREF_COMPAT.contains(scheme) {
            return None;
        }

        match MatrixUri::parse(s) {
            Ok(uri) => return Some(Self::Matrix(uri)),
            // It's not a `matrix:` URI, continue.
            Err(IdParseError::InvalidMatrixUri(MatrixUriError::WrongScheme)) => {}
            // The URI is invalid.
            _ => return None,
        }

        match MatrixToUri::parse(s) {
            Ok(uri) => return Some(Self::MatrixTo(uri)),
            // It's not a `https://matrix.to` URI, continue.
            Err(IdParseError::InvalidMatrixToUri(MatrixToError::WrongBaseUrl)) => {}
            // The URI is invalid.
            _ => return None,
        }

        Some(Self::Other(value.clone()))
    }
}
/// The supported data of a `<ol>` HTML element.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct OrderedListData {
    /// An integer to start counting from for the list items.
    ///
    /// If the value of the `start` attribute cannot be parsed as an integer, this is `None` and
    /// the attribute is in the `attrs` list of [`MatrixElementData`].
    pub start: Option<i64>,
}

impl OrderedListData {
    /// Construct an `OrderedListData` with no attribute set.
    fn new() -> Self {
        Self { start: None }
    }

    /// Build an `OrderedListData` from the given attributes.
    ///
    /// Returns a tuple containing the constructed data and the attributes that were not
    /// converted.
    #[allow(clippy::mutable_key_type)]
    fn parse(attrs: &BTreeSet<Attribute>) -> (Self, BTreeSet<Attribute>) {
        let mut list = Self::new();
        let mut unsupported = BTreeSet::new();

        for attr in attrs {
            // Only attributes without a namespace can be supported.
            let consumed = attr.name.ns == ns!()
                && match attr.name.local.as_bytes() {
                    b"start" => match attr.value.parse() {
                        Ok(start) => {
                            list.start = Some(start);
                            true
                        }
                        Err(_) => false,
                    },
                    _ => false,
                };

            if !consumed {
                unsupported.insert(attr.clone());
            }
        }

        (list, unsupported)
    }
}
/// The supported data of a `<code>` HTML element.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CodeData {
    /// The language of the code, for syntax highlighting.
    ///
    /// It is taken from the first class of the `class` attribute that starts with the
    /// `language-` prefix and is longer than the prefix. The prefix is stripped from the value.
    ///
    /// If the `class` attribute contains anything other than that class, the whole attribute is
    /// also kept in the `attrs` list of [`MatrixElementData`].
    pub language: Option<StrTendril>,
}

impl CodeData {
    /// Construct a `CodeData` with no attribute set.
    fn new() -> Self {
        Self { language: None }
    }

    /// Build a `CodeData` from the given attributes.
    ///
    /// Returns a tuple containing the constructed data and the attributes that were not
    /// converted.
    #[allow(clippy::mutable_key_type)]
    fn parse(attrs: &BTreeSet<Attribute>) -> (Self, BTreeSet<Attribute>) {
        let mut code = Self::new();
        let mut unsupported = BTreeSet::new();

        for attr in attrs {
            // Only the `class` attribute without a namespace is of interest.
            let is_class = attr.name.ns == ns!() && attr.name.local.as_bytes() == b"class";
            if !is_class {
                unsupported.insert(attr.clone());
                continue;
            }

            let classes: &str = attr.value.as_ref();

            // The attribute can hold several classes separated by whitespace: look for the
            // first usable occurrence of the `language-` prefix.
            let found = classes.match_indices(CLASS_LANGUAGE_PREFIX).find_map(|(prefix_start, _)| {
                // The prefix must begin a class name: it is either at the start of the
                // string or right after a whitespace.
                let begins_class = prefix_start == 0
                    || classes.as_bytes()[prefix_start - 1].is_ascii_whitespace();
                if !begins_class {
                    return None;
                }

                let language_start = prefix_start + CLASS_LANGUAGE_PREFIX.len();
                let language_end = classes[language_start..]
                    .find(|c: char| c.is_ascii_whitespace())
                    .map_or(classes.len(), |offset| language_start + offset);

                // A class that is only the prefix does not name a language.
                (language_end != language_start)
                    .then_some((prefix_start, language_start, language_end))
            });

            if let Some((prefix_start, language_start, language_end)) = found {
                let language_len = (language_end - language_start) as u32;
                code.language = Some(attr.value.subtendril(language_start as u32, language_len));

                if prefix_start != 0 || language_end != classes.len() {
                    // There is more than the language class in the attribute, keep the whole
                    // attribute so nothing is lost.
                    unsupported.insert(attr.clone());
                }
            }

            if code.language.is_none() {
                // No language was found, keep the whole attribute.
                unsupported.insert(attr.clone());
            }
        }

        (code, unsupported)
    }
}
/// The supported data of a `<span>` HTML element.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct SpanData {
    /// The background color of the text, from the `data-mx-bg-color` attribute.
    pub bg_color: Option<StrTendril>,

    /// The foreground color of the text, from the `data-mx-color` attribute.
    pub color: Option<StrTendril>,

    /// A Matrix [spoiler message], from the `data-mx-spoiler` attribute.
    ///
    /// The value is the reason of the spoiler. If the string is empty, this is a spoiler
    /// without a reason.
    ///
    /// [spoiler message]: https://spec.matrix.org/latest/client-server-api/#spoiler-messages
    pub spoiler: Option<StrTendril>,
}

impl SpanData {
    /// Construct a `SpanData` with no attribute set.
    fn new() -> Self {
        Self { bg_color: None, color: None, spoiler: None }
    }

    /// Build a `SpanData` from the given attributes.
    ///
    /// Returns a tuple containing the constructed data and the attributes that were not
    /// converted.
    #[allow(clippy::mutable_key_type)]
    fn parse(attrs: &BTreeSet<Attribute>) -> (Self, BTreeSet<Attribute>) {
        let mut span = Self::new();
        let mut unsupported = BTreeSet::new();

        for attr in attrs {
            // Only attributes without a namespace can be supported.
            let consumed = attr.name.ns == ns!()
                && match attr.name.local.as_bytes() {
                    b"data-mx-bg-color" => {
                        span.bg_color = Some(attr.value.clone());
                        true
                    }
                    b"data-mx-color" => {
                        span.color = Some(attr.value.clone());
                        true
                    }
                    b"data-mx-spoiler" => {
                        span.spoiler = Some(attr.value.clone());
                        true
                    }
                    _ => false,
                };

            if !consumed {
                unsupported.insert(attr.clone());
            }
        }

        (span, unsupported)
    }
}
/// The supported data of a `<img>` HTML element.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ImageData {
    /// The intrinsic width of the image, in pixels.
    ///
    /// If the value of the `width` attribute cannot be parsed as an integer, this is `None` and
    /// the attribute is in the `attrs` list of [`MatrixElementData`].
    pub width: Option<i64>,

    /// The intrinsic height of the image, in pixels.
    ///
    /// If the value of the `height` attribute cannot be parsed as an integer, this is `None`
    /// and the attribute is in the `attrs` list of [`MatrixElementData`].
    pub height: Option<i64>,

    /// Text that can replace the image.
    pub alt: Option<StrTendril>,

    /// Text representing advisory information about the image.
    pub title: Option<StrTendril>,

    /// The image URL.
    ///
    /// If the value of the `src` attribute is not a valid `mxc:` URI, this is `None` and the
    /// attribute is in the `attrs` list of [`MatrixElementData`].
    pub src: Option<OwnedMxcUri>,
}

impl ImageData {
    /// Construct an `ImageData` with no attribute set.
    fn new() -> Self {
        Self { width: None, height: None, alt: None, title: None, src: None }
    }

    /// Build an `ImageData` from the given attributes.
    ///
    /// Returns a tuple containing the constructed data and the attributes that were not
    /// converted.
    #[allow(clippy::mutable_key_type)]
    fn parse(attrs: &BTreeSet<Attribute>) -> (Self, BTreeSet<Attribute>) {
        let mut image = Self::new();
        let mut unsupported = BTreeSet::new();

        for attr in attrs {
            // Only attributes without a namespace can be supported.
            let consumed = attr.name.ns == ns!()
                && match attr.name.local.as_bytes() {
                    b"width" => match attr.value.parse() {
                        Ok(width) => {
                            image.width = Some(width);
                            true
                        }
                        Err(_) => false,
                    },
                    b"height" => match attr.value.parse() {
                        Ok(height) => {
                            image.height = Some(height);
                            true
                        }
                        Err(_) => false,
                    },
                    b"alt" => {
                        image.alt = Some(attr.value.clone());
                        true
                    }
                    b"title" => {
                        image.title = Some(attr.value.clone());
                        true
                    }
                    b"src" => {
                        let uri = <&MxcUri>::from(attr.value.as_ref());
                        let is_valid = uri.validate().is_ok();
                        if is_valid {
                            image.src = Some(uri.to_owned());
                        }
                        is_valid
                    }
                    _ => false,
                };

            if !consumed {
                unsupported.insert(attr.clone());
            }
        }

        (image, unsupported)
    }
}

View File

@ -6,6 +6,13 @@
//! the Matrix protocol. It should be able to be used to interact with any HTML
//! document but will offer APIs focused on specificities of HTML in the Matrix
//! specification.
//!
//! # Features
//!
//! * `matrix` - Allow converting HTML element data into enums with variants for elements and
//! attributes [suggested by the Matrix Specification][spec].
//!
//! [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
#![warn(missing_docs)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]

View File

@ -363,7 +363,7 @@ static ALLOWED_SCHEMES_COMPAT: Map<&str, &Set<&str>> = phf_map! {
"a:href" => &ALLOWED_SCHEMES_A_HREF_COMPAT,
"img:src" => &ALLOWED_SCHEMES_IMG_SRC_STRICT,
};
static ALLOWED_SCHEMES_A_HREF_COMPAT: Set<&str> =
pub(crate) static ALLOWED_SCHEMES_A_HREF_COMPAT: Set<&str> =
phf_set! { "http", "https", "ftp", "mailto", "magnet", "matrix" };
/// Allowed classes per HTML tag according to the Matrix specification.

View File

@ -1,2 +1,4 @@
#[cfg(feature = "matrix")]
mod matrix;
mod navigate;
mod sanitize;

View File

@ -0,0 +1,323 @@
use assert_matches2::assert_matches;
use ruma_html::{
matrix::{AnchorUri, MatrixElement},
Html,
};
/// Checks that elements are converted to the expected `MatrixElement` variant, and that
/// attributes without typed data end up in the unsupported attributes.
#[test]
fn elements() {
let raw_html = "\
<h1>Title</h1>\
<div class=\"text\">\
<p>This is some <em>text</em></p>\
</div>\
<marquee id=\"scrolling_text\">This is scrolling</marquee>
";
let html = Html::parse(raw_html);
let mut html_children = html.children();
// `<h1>` element.
let h1_node = html_children.next().unwrap();
let h1_element = h1_node.as_element().unwrap().to_matrix();
assert_matches!(h1_element.element, MatrixElement::H(heading));
assert_eq!(heading.level, 1);
assert!(h1_element.attrs.is_empty());
// `<div>` element.
let div_node = html_children.next().unwrap();
let div_element = div_node.as_element().unwrap().to_matrix();
assert_matches!(div_element.element, MatrixElement::Div);
// The `class` attribute is not supported.
assert_eq!(div_element.attrs.len(), 1);
// `<p>` element, the first child of the `<div>` element.
let p_node = div_node.first_child().unwrap();
let p_element = p_node.as_element().unwrap().to_matrix();
assert_matches!(p_element.element, MatrixElement::P);
assert!(p_element.attrs.is_empty());
// Text of `<p>` element.
let p_text_node = p_node.first_child().unwrap();
// `<em>` element, right after the text of the `<p>` element.
let em_node = p_text_node.next_sibling().unwrap();
let em_element = em_node.as_element().unwrap().to_matrix();
assert_matches!(em_element.element, MatrixElement::Em);
assert!(em_element.attrs.is_empty());
// `<marquee>` element, which is not in the suggested list.
let marquee_node = html_children.next().unwrap();
let marquee_element = marquee_node.as_element().unwrap().to_matrix();
assert_matches!(marquee_element.element, MatrixElement::Other(_));
// The `id` attribute is in the unsupported attributes.
assert_eq!(marquee_element.attrs.len(), 1);
}
/// Checks that the supported attributes of a `<span>` element are converted to typed data.
#[test]
fn span_attributes() {
let raw_html = "\
<span \
data-mx-color=\"#00ff00\" \
data-mx-bg-color=\"#ff0000\" \
data-mx-spoiler \
data-mx-spoiler=\"This is a spoiler\"\
>\
Hidden and colored\
</span>\
";
let html = Html::parse(raw_html);
let mut html_children = html.children();
let span_node = html_children.next().unwrap();
let span_element = span_node.as_element().unwrap().to_matrix();
assert_matches!(span_element.element, MatrixElement::Span(span));
assert_eq!(span.color.unwrap().as_ref(), "#00ff00");
assert_eq!(span.bg_color.unwrap().as_ref(), "#ff0000");
// Uses the first spoiler attribute, the second is dropped.
assert!(span.spoiler.unwrap().is_empty());
// All the attributes are supported.
assert!(span_element.attrs.is_empty());
}
/// Checks that the supported attributes of `<a>` elements are converted to typed data, and that
/// a `href` attribute with an invalid or unsupported URI stays in the unsupported attributes.
#[test]
fn a_attributes() {
let raw_html = "\
<a \
name=\"my_anchor\" \
target=\"_blank\" \
href=\"https://localhost/\"\
>\
Link with all supported attributes\
</a>\
<a href=\"matrix:r/somewhere:localhost\">Link with valid matrix scheme URI</a>\
<a href=\"matrix:somewhere:localhost\">Link with invalid matrix scheme URI</a>\
<a href=\"https://matrix.to/#/%23somewhere:example.org\">Link with valid matrix.to URI</a>\
<a href=\"https://matrix.to/#/somewhere:example.org\">Link with invalid matrix.to URI</a>\
<a href=\"ruma:html\">Link with unsupported scheme</a>\
";
let html = Html::parse(raw_html);
let mut html_children = html.children();
// First `<a>` element, with all supported attributes.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::A(anchor));
assert_eq!(anchor.name.unwrap().as_ref(), "my_anchor");
assert_eq!(anchor.target.unwrap().as_ref(), "_blank");
assert_matches!(anchor.href.unwrap(), AnchorUri::Other(uri));
assert_eq!(uri.as_ref(), "https://localhost/");
assert!(element.attrs.is_empty());
// Second `<a>` element, with valid matrix scheme URI.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::A(anchor));
assert!(anchor.name.is_none());
assert!(anchor.target.is_none());
assert_matches!(anchor.href.unwrap(), AnchorUri::Matrix(uri));
assert_eq!(uri.to_string(), "matrix:r/somewhere:localhost");
assert!(element.attrs.is_empty());
// Third `<a>` element, with invalid matrix scheme URI.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::A(anchor));
assert!(anchor.name.is_none());
assert!(anchor.target.is_none());
assert!(anchor.href.is_none());
// The `href` attribute is in the unsupported attributes.
assert_eq!(element.attrs.len(), 1);
// Fourth `<a>` element, with valid matrix.to URI.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::A(anchor));
assert!(anchor.name.is_none());
assert!(anchor.target.is_none());
assert_matches!(anchor.href.unwrap(), AnchorUri::MatrixTo(uri));
assert_eq!(uri.to_string(), "https://matrix.to/#/%23somewhere:example.org");
assert!(element.attrs.is_empty());
// Fifth `<a>` element, with invalid matrix.to URI.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::A(anchor));
assert!(anchor.name.is_none());
assert!(anchor.target.is_none());
assert!(anchor.href.is_none());
// The `href` attribute is in the unsupported attributes.
assert_eq!(element.attrs.len(), 1);
// Sixth `<a>` element, with unsupported scheme.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::A(anchor));
assert!(anchor.name.is_none());
assert!(anchor.target.is_none());
assert!(anchor.href.is_none());
// The `href` attribute is in the unsupported attributes.
assert_eq!(element.attrs.len(), 1);
}
/// Checks that the supported attributes of `<img>` elements are converted to typed data, and
/// that attributes with invalid values stay in the unsupported attributes.
#[test]
fn img_attributes() {
let raw_html = "\
<img \
width=200 \
height=200 \
alt=\"Image with valid attributes\" \
title=\"Image with valid attributes\" \
src=\"mxc://localhost/abc123\" \
/>\
<img \
width=\"\" \
height=\"\" \
alt=\"Image with invalid attributes\" \
title=\"Image with invalid attributes\" \
src=\"https://localhost/abc123.png\" \
/>\
";
let html = Html::parse(raw_html);
let mut html_children = html.children();
// First `<img>` element, with valid attributes.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Img(image));
assert_eq!(image.width.unwrap(), 200);
assert_eq!(image.height.unwrap(), 200);
assert_eq!(image.alt.unwrap().as_ref(), "Image with valid attributes");
assert_eq!(image.title.unwrap().as_ref(), "Image with valid attributes");
assert_eq!(image.src.unwrap(), "mxc://localhost/abc123");
assert!(element.attrs.is_empty());
// Second `<img>` element, with invalid attributes.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Img(image));
assert!(image.width.is_none());
assert!(image.height.is_none());
assert_eq!(image.alt.unwrap().as_ref(), "Image with invalid attributes");
assert_eq!(image.title.unwrap().as_ref(), "Image with invalid attributes");
assert!(image.src.is_none());
// The invalid `width`, `height` and `src` attributes are in the unsupported attributes.
assert_eq!(element.attrs.len(), 3);
}
/// Checks that the `start` attribute of `<ol>` elements is converted to an integer, and that it
/// stays in the unsupported attributes when it is not an integer.
#[test]
fn ol_attributes() {
let raw_html = "\
<ol start=2>\
<li>Item in list with valid start attribute</li>\
</ol>\
<ol start=\"beginning\">\
<li>Item in list with invalid start attribute</li>\
</ol>\
";
let html = Html::parse(raw_html);
let mut html_children = html.children();
// First `<ol>` element, with valid `start` attribute.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Ol(ol));
assert_eq!(ol.start.unwrap(), 2);
assert!(element.attrs.is_empty());
// Second `<ol>` element, with invalid `start` attribute.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Ol(ol));
assert!(ol.start.is_none());
// The `start` attribute is in the unsupported attributes.
assert_eq!(element.attrs.len(), 1);
}
/// Checks that the language of `<code>` elements is extracted from the `class` attribute, and
/// that the attribute stays in the unsupported attributes when it holds anything else.
#[test]
fn code_attributes() {
let raw_html = "\
<code class=\"language-rust\">\
let s = \"Code with only `language-` class\";\
</code>\
<code class=\"rust-code\">\
let s = \"Code with other class\";\
</code>\
<code class=\"language-rust rust-code\">\
let s = \"Code with several classes beginning with `language-` class\";\
</code>\
<code class=\"rust-code language-rust\">\
let s = \"Code with several classes not beginning with `language-` class\";\
</code>\
<code class=\"language-\">\
let s = \"Code with invalid `language-` class\";\
</code>\
<code class=\"code-language-rust\">\
let s = \"Code with other class containing `language-`\";\
</code>\
";
let html = Html::parse(raw_html);
let mut html_children = html.children();
// First `<code>` element, with only `language-` class.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Code(code));
assert_eq!(code.language.unwrap().as_ref(), "rust");
assert!(element.attrs.is_empty());
// Second `<code>` element, with other class.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Code(code));
assert!(code.language.is_none());
// `class` is in unsupported attributes.
assert_eq!(element.attrs.len(), 1);
// Third `<code>` element, with several classes beginning with `language-` class.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Code(code));
assert_eq!(code.language.unwrap().as_ref(), "rust");
// Because it contains other classes, `class` is also in unsupported attributes.
assert_eq!(element.attrs.len(), 1);
// Fourth `<code>` element, with several classes not beginning with `language-` class.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Code(code));
assert_eq!(code.language.unwrap().as_ref(), "rust");
// Because it contains other classes, `class` is also in unsupported attributes.
assert_eq!(element.attrs.len(), 1);
// Fifth `<code>` element, with invalid `language-` class: the language name is empty.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Code(code));
assert!(code.language.is_none());
// `class` is in unsupported attributes.
assert_eq!(element.attrs.len(), 1);
// Sixth `<code>` element, with other class containing `language-`.
let node = html_children.next().unwrap();
let element = node.as_element().unwrap().to_matrix();
assert_matches!(element.element, MatrixElement::Code(code));
assert!(code.language.is_none());
// `class` is in unsupported attributes.
assert_eq!(element.attrs.len(), 1);
}

View File

@ -70,6 +70,7 @@ js = ["ruma-common/js"]
rand = ["ruma-common/rand"]
markdown = ["ruma-events?/markdown"]
html = ["dep:ruma-html", "ruma-events?/html"]
html-matrix = ["html", "ruma-html/matrix"]
# Everything except compat, js and unstable features
full = [
@ -87,6 +88,7 @@ full = [
"rand",
"markdown",
"html",
"html-matrix",
]
# Enable all compatibility hacks. Deprecated.

View File

@ -38,9 +38,11 @@
//!
//! These features are only useful if you want to use a method that requires it:
//!
//! * `rand`
//! * `markdown`
//! * `html`
//! * `rand` -- Generate random identifiers.
//! * `markdown` -- Parse markdown to construct messages.
//! * `html` -- Parse HTML to sanitize it or navigate its tree.
//! * `html-matrix` -- Enables the `matrix` feature of `ruma-html` to parse HTML elements data to
//! typed data as suggested by the Matrix Specification.
//!
//! # Unstable features
//!