events: Move sanitize HTML features to new ruma-html crate
This commit is contained in:
parent
acfeb38e90
commit
24ce9d5e09
@ -20,6 +20,7 @@ ruma-common = { version = "0.11.3", path = "crates/ruma-common" }
|
||||
ruma-client = { version = "0.11.0", path = "crates/ruma-client" }
|
||||
ruma-client-api = { version = "0.16.2", path = "crates/ruma-client-api" }
|
||||
ruma-federation-api = { version = "0.7.1", path = "crates/ruma-federation-api" }
|
||||
ruma-html = { version = "0.1.0", path = "crates/ruma-html" }
|
||||
ruma-identifiers-validation = { version = "0.9.1", path = "crates/ruma-identifiers-validation" }
|
||||
ruma-identity-service-api = { version = "0.7.1", path = "crates/ruma-identity-service-api" }
|
||||
ruma-macros = { version = "=0.11.3", path = "crates/ruma-macros" }
|
||||
|
@ -41,6 +41,8 @@ Breaking changes:
|
||||
- `RoomMessageEventContent::make_reply_to()` and `make_for_thread()` have an extra parameter to
|
||||
support the recommended behavior for intentional mentions in replies according to Matrix 1.7
|
||||
- In Markdown, soft line breaks are transformed into hard line breaks when compiled into HTML.
|
||||
- Move the HTML functions in `events::room::message::sanitize` to the ruma-html crate
|
||||
- The `unstable-sanitize` cargo feature was renamed to `html`
|
||||
|
||||
Improvements:
|
||||
|
||||
@ -62,7 +64,7 @@ Improvements:
|
||||
- `user_can_send_message`
|
||||
- `user_can_send_state`
|
||||
- `user_can_trigger_room_notification`
|
||||
- Add `MessageType::sanitize` behind the `unstable-sanitize` feature
|
||||
- Add `MessageType::sanitize` behind the `html` feature
|
||||
- Add `MatrixVersion::V1_7` and `MatrixVersion::V1_8`
|
||||
- Stabilize support for annotations and reactions (MSC2677 / Matrix 1.7)
|
||||
- Add support for intentional mentions push rules (MSC3952 / Matrix 1.7)
|
||||
@ -275,7 +277,7 @@ Improvements:
|
||||
* Deserialize stringified integers for power levels without the `compat` feature
|
||||
* Add `JoinRule::KnockRestricted` (MSC3787)
|
||||
* Add `MatrixVersionId::V10` (MSC3604)
|
||||
* Add methods to sanitize messages according to the spec behind the `unstable-sanitize` feature
|
||||
* Add methods to sanitize messages according to the spec behind the `html` feature
|
||||
* Can also remove rich reply fallbacks
|
||||
* Implement `From<Owned*Id>` for `identifiers::matrix_uri::MatrixId`
|
||||
* Add unstable default push rule to ignore room server ACLs events (MSC3786)
|
||||
|
@ -24,6 +24,7 @@ server = []
|
||||
api = ["dep:http", "dep:konst"]
|
||||
canonical-json = []
|
||||
events = []
|
||||
html = ["dep:ruma-html"]
|
||||
js = ["dep:js-sys", "getrandom?/js", "uuid?/js"]
|
||||
markdown = ["pulldown-cmark"]
|
||||
rand = ["dep:rand", "dep:uuid"]
|
||||
@ -47,7 +48,6 @@ unstable-msc3954 = ["unstable-msc1767"]
|
||||
unstable-msc3955 = ["unstable-msc1767"]
|
||||
unstable-msc3956 = ["unstable-msc1767"]
|
||||
unstable-pdu = []
|
||||
unstable-sanitize = ["dep:html5ever", "dep:phf"]
|
||||
unstable-unspecified = []
|
||||
|
||||
# Don't validate the version part in `KeyId`.
|
||||
@ -77,17 +77,16 @@ base64 = { workspace = true }
|
||||
bytes = "1.0.1"
|
||||
form_urlencoded = "1.0.0"
|
||||
getrandom = { version = "0.2.6", optional = true }
|
||||
html5ever = { version = "0.26.0", optional = true }
|
||||
http = { workspace = true, optional = true }
|
||||
indexmap = { version = "2.0.0", features = ["serde"] }
|
||||
js_int = { workspace = true, features = ["serde"] }
|
||||
js_option = "0.1.0"
|
||||
konst = { version = "0.3.5", default-features = false, features = ["cmp", "iter", "parsing"], optional = true }
|
||||
percent-encoding = "2.1.0"
|
||||
phf = { version = "0.11.1", features = ["macros"], optional = true }
|
||||
pulldown-cmark = { version = "0.9.1", default-features = false, optional = true }
|
||||
rand = { version = "0.8.3", optional = true }
|
||||
regex = { version = "1.5.6", default-features = false, features = ["std", "perf"] }
|
||||
ruma-html = { workspace = true, optional = true }
|
||||
ruma-identifiers-validation = { workspace = true }
|
||||
ruma-macros = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
|
@ -7,7 +7,7 @@ the previous message, for which the room ID is required. If you want to reply to
|
||||
If the message was edited, the previous message should be the original message that was edited,
|
||||
with the content of its replacement, to allow the fallback to be accurate at the time it is added.
|
||||
|
||||
It is recommended to enable the `unstable-sanitize` feature when using this method as this will
|
||||
It is recommended to enable the `html` feature when using this method as this will
|
||||
clean up nested [rich reply fallbacks] in chains of replies. This uses [`sanitize_html()`]
|
||||
internally, with [`RemoveReplyFallback::Yes`].
|
||||
|
||||
|
@ -4,6 +4,8 @@
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
#[cfg(feature = "html")]
|
||||
use ruma_html::{sanitize_html, HtmlSanitizerMode, RemoveReplyFallback};
|
||||
use ruma_macros::EventContent;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use serde_json::Value as JsonValue;
|
||||
@ -40,10 +42,8 @@ pub use key_verification_request::KeyVerificationRequestEventContent;
|
||||
pub use location::{LocationInfo, LocationMessageEventContent};
|
||||
pub use notice::NoticeMessageEventContent;
|
||||
pub use relation_serde::deserialize_relation;
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
use sanitize::{
|
||||
remove_plain_reply_fallback, sanitize_html, HtmlSanitizerMode, RemoveReplyFallback,
|
||||
};
|
||||
#[cfg(feature = "html")]
|
||||
use sanitize::remove_plain_reply_fallback;
|
||||
pub use server_notice::{LimitType, ServerNoticeMessageEventContent, ServerNoticeType};
|
||||
pub use text::TextMessageEventContent;
|
||||
pub use video::{VideoInfo, VideoMessageEventContent};
|
||||
@ -432,7 +432,7 @@ impl RoomMessageEventContent {
|
||||
///
|
||||
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
pub fn sanitize(
|
||||
&mut self,
|
||||
mode: HtmlSanitizerMode,
|
||||
@ -758,7 +758,7 @@ impl MessageType {
|
||||
///
|
||||
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
pub fn sanitize(
|
||||
&mut self,
|
||||
mode: HtmlSanitizerMode,
|
||||
@ -907,7 +907,7 @@ impl FormattedBody {
|
||||
///
|
||||
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
pub fn sanitize_html(
|
||||
&mut self,
|
||||
mode: HtmlSanitizerMode,
|
||||
|
@ -1,11 +1,12 @@
|
||||
use std::fmt::{self, Write};
|
||||
|
||||
#[cfg(feature = "html")]
|
||||
use ruma_html::{HtmlSanitizer, HtmlSanitizerMode, RemoveReplyFallback};
|
||||
|
||||
use super::{
|
||||
sanitize::remove_plain_reply_fallback, FormattedBody, MessageType, OriginalRoomMessageEvent,
|
||||
Relation,
|
||||
};
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
use super::{sanitize::HtmlSanitizer, HtmlSanitizerMode, RemoveReplyFallback};
|
||||
|
||||
fn get_message_quote_fallbacks(original_message: &OriginalRoomMessageEvent) -> (String, String) {
|
||||
let get_quotes = |body: &str, formatted: Option<&FormattedBody>, is_emote: bool| {
|
||||
@ -13,9 +14,9 @@ fn get_message_quote_fallbacks(original_message: &OriginalRoomMessageEvent) -> (
|
||||
let is_reply = matches!(content.relates_to, Some(Relation::Reply { .. }));
|
||||
let emote_sign = is_emote.then_some("* ").unwrap_or_default();
|
||||
let body = is_reply.then(|| remove_plain_reply_fallback(body)).unwrap_or(body);
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
let html_body = FormattedOrPlainBody { formatted, body, is_reply };
|
||||
#[cfg(not(feature = "unstable-sanitize"))]
|
||||
#[cfg(not(feature = "html"))]
|
||||
let html_body = FormattedOrPlainBody { formatted, body };
|
||||
|
||||
(
|
||||
@ -72,14 +73,14 @@ impl fmt::Display for EscapeHtmlEntities<'_> {
|
||||
struct FormattedOrPlainBody<'a> {
|
||||
formatted: Option<&'a FormattedBody>,
|
||||
body: &'a str,
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
is_reply: bool,
|
||||
}
|
||||
|
||||
impl fmt::Display for FormattedOrPlainBody<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(formatted_body) = self.formatted {
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
if self.is_reply {
|
||||
let sanitizer =
|
||||
HtmlSanitizer::new(HtmlSanitizerMode::Strict, RemoveReplyFallback::Yes);
|
||||
@ -88,7 +89,7 @@ impl fmt::Display for FormattedOrPlainBody<'_> {
|
||||
f.write_str(&formatted_body.body)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "unstable-sanitize"))]
|
||||
#[cfg(not(feature = "html"))]
|
||||
f.write_str(&formatted_body.body)
|
||||
} else {
|
||||
write!(f, "{}", EscapeHtmlEntities(self.body))
|
||||
|
@ -1,52 +1,9 @@
|
||||
//! Convenience methods and types to sanitize text messages.
|
||||
|
||||
#![allow(unreachable_pub)] // https://github.com/rust-lang/rust/issues/112615
|
||||
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
mod html_fragment;
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
mod html_sanitizer;
|
||||
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
pub(super) use html_sanitizer::HtmlSanitizer;
|
||||
|
||||
/// Sanitize the given HTML string.
|
||||
///
|
||||
/// This removes the [tags and attributes] that are not listed in the Matrix specification.
|
||||
///
|
||||
/// It can also optionally remove the [rich reply fallback].
|
||||
///
|
||||
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
pub fn sanitize_html(
|
||||
s: &str,
|
||||
mode: HtmlSanitizerMode,
|
||||
remove_reply_fallback: RemoveReplyFallback,
|
||||
) -> String {
|
||||
let sanitizer = HtmlSanitizer::new(mode, remove_reply_fallback);
|
||||
sanitizer.clean(s).to_string()
|
||||
}
|
||||
|
||||
/// What HTML [tags and attributes] should be kept by the sanitizer.
|
||||
///
|
||||
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
#[allow(clippy::exhaustive_enums)]
|
||||
pub enum HtmlSanitizerMode {
|
||||
/// Keep only the tags and attributes listed in the Matrix specification.
|
||||
Strict,
|
||||
|
||||
/// Like `Strict` mode, with additional tags and attributes that are not yet included in
|
||||
/// the spec, but are reasonable to keep.
|
||||
Compat,
|
||||
}
|
||||
|
||||
/// Whether to remove the [rich reply fallback] while sanitizing.
|
||||
///
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
#[allow(clippy::exhaustive_enums)]
|
||||
pub enum RemoveReplyFallback {
|
||||
@ -57,18 +14,6 @@ pub enum RemoveReplyFallback {
|
||||
No,
|
||||
}
|
||||
|
||||
/// Remove the [rich reply fallback] of the given HTML string.
|
||||
///
|
||||
/// Due to the fact that the HTML is parsed, note that malformed HTML and comments will be stripped
|
||||
/// from the output.
|
||||
///
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
pub fn remove_html_reply_fallback(s: &str) -> String {
|
||||
let sanitizer = HtmlSanitizer::reply_fallback_remover();
|
||||
sanitizer.clean(s).to_string()
|
||||
}
|
||||
|
||||
/// Remove the [rich reply fallback] of the given plain text string.
|
||||
///
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
@ -96,103 +41,6 @@ pub fn remove_plain_reply_fallback(mut s: &str) -> &str {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::remove_plain_reply_fallback;
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
use super::{
|
||||
remove_html_reply_fallback, sanitize_html, HtmlSanitizerMode, RemoveReplyFallback,
|
||||
};
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
fn sanitize() {
|
||||
let sanitized = sanitize_html(
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
<removed>This has no tag</removed>\
|
||||
<p>But this is inside a tag</p>\
|
||||
",
|
||||
HtmlSanitizerMode::Strict,
|
||||
RemoveReplyFallback::No,
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
sanitized,
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
This has no tag\
|
||||
<p>But this is inside a tag</p>\
|
||||
"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
fn sanitize_without_reply() {
|
||||
let sanitized = sanitize_html(
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
<removed>This has no tag</removed>\
|
||||
<p>But this is inside a tag</p>\
|
||||
",
|
||||
HtmlSanitizerMode::Strict,
|
||||
RemoveReplyFallback::Yes,
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
sanitized,
|
||||
"\
|
||||
This has no tag\
|
||||
<p>But this is inside a tag</p>\
|
||||
"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
fn remove_html_reply() {
|
||||
let without_reply = remove_html_reply_fallback(
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
<keep-me>This keeps its tag</keep-me>\
|
||||
<p>But this is inside a tag</p>\
|
||||
",
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
without_reply,
|
||||
"\
|
||||
<keep-me>This keeps its tag</keep-me>\
|
||||
<p>But this is inside a tag</p>\
|
||||
"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn remove_plain_reply() {
|
||||
|
@ -349,7 +349,7 @@ fn escape_tags_in_plain_reply_body() {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "unstable-sanitize")]
|
||||
#[cfg(feature = "html")]
|
||||
fn reply_sanitize() {
|
||||
use ruma_common::events::room::message::ForwardThread;
|
||||
|
||||
|
3
crates/ruma-html/CHANGELOG.md
Normal file
3
crates/ruma-html/CHANGELOG.md
Normal file
@ -0,0 +1,3 @@
|
||||
# 0.1.0 (unreleased)
|
||||
|
||||
Initial release
|
21
crates/ruma-html/Cargo.toml
Normal file
21
crates/ruma-html/Cargo.toml
Normal file
@ -0,0 +1,21 @@
|
||||
[package]
|
||||
name = "ruma-html"
|
||||
version = "0.1.0"
|
||||
description = "Opinionated HTML parsing and manipulating."
|
||||
homepage = "https://www.ruma.io/"
|
||||
keywords = ["matrix", "chat", "messaging", "ruma", "html", "parser"]
|
||||
license = "MIT"
|
||||
readme = "README.md"
|
||||
repository = "https://github.com/ruma/ruma"
|
||||
edition = "2021"
|
||||
rust-version = { workspace = true }
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
all-features = true
|
||||
rustdoc-args = ["--cfg", "docsrs"]
|
||||
|
||||
[dependencies]
|
||||
html5ever = "0.26.0"
|
||||
phf = { version = "0.11.1", features = ["macros"] }
|
||||
tracing = { workspace = true, features = ["attributes"] }
|
||||
wildmatch = "2.0.0"
|
12
crates/ruma-html/README.md
Normal file
12
crates/ruma-html/README.md
Normal file
@ -0,0 +1,12 @@
|
||||
# ruma-html
|
||||
|
||||
[crates.io page](https://crates.io/crates/ruma-html)
|
||||
[docs.rs page](https://docs.rs/ruma-html/)
|
||||

|
||||
|
||||
Opinionated HTML parsing and manipulating library.
|
||||
|
||||
Like the rest of the Ruma crates, this crate is primarily meant to be used for
|
||||
the Matrix protocol. It should be able to be used to interact with any HTML
|
||||
content but will offer APIs focused on specificities of HTML in the Matrix
|
||||
specification.
|
@ -13,8 +13,8 @@ use tracing::debug;
|
||||
///
|
||||
/// To get the serialized HTML, use its `Display` implementation.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Fragment {
|
||||
pub nodes: Vec<Node>,
|
||||
pub struct Fragment {
|
||||
pub(crate) nodes: Vec<Node>,
|
||||
}
|
||||
|
||||
impl Fragment {
|
||||
@ -265,13 +265,14 @@ impl fmt::Display for Fragment {
|
||||
|
||||
/// An HTML node.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Node {
|
||||
pub parent: Option<usize>,
|
||||
pub prev_sibling: Option<usize>,
|
||||
pub next_sibling: Option<usize>,
|
||||
pub first_child: Option<usize>,
|
||||
pub last_child: Option<usize>,
|
||||
pub data: NodeData,
|
||||
#[non_exhaustive]
|
||||
pub struct Node {
|
||||
pub(crate) parent: Option<usize>,
|
||||
pub(crate) prev_sibling: Option<usize>,
|
||||
pub(crate) next_sibling: Option<usize>,
|
||||
pub(crate) first_child: Option<usize>,
|
||||
pub(crate) last_child: Option<usize>,
|
||||
pub(crate) data: NodeData,
|
||||
}
|
||||
|
||||
impl Node {
|
||||
@ -313,7 +314,7 @@ impl Node {
|
||||
}
|
||||
|
||||
impl Node {
|
||||
pub fn serialize<S>(&self, fragment: &Fragment, serializer: &mut S) -> io::Result<()>
|
||||
pub(crate) fn serialize<S>(&self, fragment: &Fragment, serializer: &mut S) -> io::Result<()>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
@ -353,7 +354,8 @@ impl Node {
|
||||
|
||||
/// The data of a `Node`.
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum NodeData {
|
||||
#[allow(clippy::exhaustive_enums)]
|
||||
pub enum NodeData {
|
||||
/// The root node of the `Fragment`.
|
||||
Document,
|
||||
|
||||
@ -369,7 +371,8 @@ pub(crate) enum NodeData {
|
||||
|
||||
/// The data of an HTML element.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct ElementData {
|
||||
#[allow(clippy::exhaustive_structs)]
|
||||
pub struct ElementData {
|
||||
/// The qualified name of the element.
|
||||
pub name: QualName,
|
||||
|
19
crates/ruma-html/src/lib.rs
Normal file
19
crates/ruma-html/src/lib.rs
Normal file
@ -0,0 +1,19 @@
|
||||
#![doc(html_favicon_url = "https://www.ruma.io/favicon.ico")]
|
||||
#![doc(html_logo_url = "https://www.ruma.io/images/logo.png")]
|
||||
//! Opinionated HTML parsing and manipulating library.
|
||||
//!
|
||||
//! Like the rest of the Ruma crates, this crate is primarily meant to be used for
|
||||
//! the Matrix protocol. It should be able to be used to interact with any HTML
|
||||
//! document but will offer APIs focused on specificities of HTML in the Matrix
|
||||
//! specification.
|
||||
|
||||
#![warn(missing_docs)]
|
||||
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
|
||||
|
||||
mod html_fragment;
|
||||
mod sanitize;
|
||||
|
||||
pub use self::{
|
||||
html_fragment::{ElementData, Fragment, Node, NodeData},
|
||||
sanitize::*,
|
||||
};
|
157
crates/ruma-html/src/sanitize.rs
Normal file
157
crates/ruma-html/src/sanitize.rs
Normal file
@ -0,0 +1,157 @@
|
||||
//! Convenience methods and types to sanitize HTML messages.
|
||||
|
||||
mod html_sanitizer;
|
||||
|
||||
pub use self::html_sanitizer::HtmlSanitizer;
|
||||
|
||||
/// Sanitize the given HTML string.
|
||||
///
|
||||
/// This removes the [tags and attributes] that are not listed in the Matrix specification.
|
||||
///
|
||||
/// It can also optionally remove the [rich reply fallback].
|
||||
///
|
||||
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
pub fn sanitize_html(
|
||||
s: &str,
|
||||
mode: HtmlSanitizerMode,
|
||||
remove_reply_fallback: RemoveReplyFallback,
|
||||
) -> String {
|
||||
let sanitizer = HtmlSanitizer::new(mode, remove_reply_fallback);
|
||||
sanitizer.clean(s).to_string()
|
||||
}
|
||||
|
||||
/// What HTML [tags and attributes] should be kept by the sanitizer.
|
||||
///
|
||||
/// [tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
#[allow(clippy::exhaustive_enums)]
|
||||
pub enum HtmlSanitizerMode {
|
||||
/// Keep only the tags and attributes listed in the Matrix specification.
|
||||
Strict,
|
||||
|
||||
/// Like `Strict` mode, with additional tags and attributes that are not yet included in
|
||||
/// the spec, but are reasonable to keep.
|
||||
Compat,
|
||||
}
|
||||
|
||||
/// Whether to remove the [rich reply fallback] while sanitizing.
|
||||
///
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
#[allow(clippy::exhaustive_enums)]
|
||||
pub enum RemoveReplyFallback {
|
||||
/// Remove the rich reply fallback.
|
||||
Yes,
|
||||
|
||||
/// Don't remove the rich reply fallback.
|
||||
No,
|
||||
}
|
||||
|
||||
/// Remove the [rich reply fallback] of the given HTML string.
|
||||
///
|
||||
/// Due to the fact that the HTML is parsed, note that malformed HTML and comments will be stripped
|
||||
/// from the output.
|
||||
///
|
||||
/// [rich reply fallback]: https://spec.matrix.org/latest/client-server-api/#fallbacks-for-rich-replies
|
||||
pub fn remove_html_reply_fallback(s: &str) -> String {
|
||||
let sanitizer = HtmlSanitizer::reply_fallback_remover();
|
||||
sanitizer.clean(s).to_string()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{
|
||||
remove_html_reply_fallback, sanitize_html, HtmlSanitizerMode, RemoveReplyFallback,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn sanitize() {
|
||||
let sanitized = sanitize_html(
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
<removed>This has no tag</removed>\
|
||||
<p>But this is inside a tag</p>\
|
||||
",
|
||||
HtmlSanitizerMode::Strict,
|
||||
RemoveReplyFallback::No,
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
sanitized,
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
This has no tag\
|
||||
<p>But this is inside a tag</p>\
|
||||
"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sanitize_without_reply() {
|
||||
let sanitized = sanitize_html(
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
<removed>This has no tag</removed>\
|
||||
<p>But this is inside a tag</p>\
|
||||
",
|
||||
HtmlSanitizerMode::Strict,
|
||||
RemoveReplyFallback::Yes,
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
sanitized,
|
||||
"\
|
||||
This has no tag\
|
||||
<p>But this is inside a tag</p>\
|
||||
"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn remove_html_reply() {
|
||||
let without_reply = remove_html_reply_fallback(
|
||||
"\
|
||||
<mx-reply>\
|
||||
<blockquote>\
|
||||
<a href=\"https://matrix.to/#/!n8f893n9:example.com/$1598361704261elfgc:localhost\">In reply to</a> \
|
||||
<a href=\"https://matrix.to/#/@alice:example.com\">@alice:example.com</a>\
|
||||
<br>\
|
||||
Previous message\
|
||||
</blockquote>\
|
||||
</mx-reply>\
|
||||
<keep-me>This keeps its tag</keep-me>\
|
||||
<p>But this is inside a tag</p>\
|
||||
",
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
without_reply,
|
||||
"\
|
||||
<keep-me>This keeps its tag</keep-me>\
|
||||
<p>But this is inside a tag</p>\
|
||||
"
|
||||
);
|
||||
}
|
||||
}
|
@ -2,16 +2,14 @@ use html5ever::{tendril::StrTendril, Attribute};
|
||||
use phf::{phf_map, phf_set, Map, Set};
|
||||
use wildmatch::WildMatch;
|
||||
|
||||
use super::{
|
||||
html_fragment::{ElementData, Fragment, NodeData},
|
||||
HtmlSanitizerMode, RemoveReplyFallback,
|
||||
};
|
||||
use super::{HtmlSanitizerMode, RemoveReplyFallback};
|
||||
use crate::{ElementData, Fragment, NodeData};
|
||||
|
||||
/// A sanitizer to filter [HTML tags and attributes] according to the Matrix specification.
|
||||
///
|
||||
/// [HTML tags and attributes]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct HtmlSanitizer {
|
||||
pub struct HtmlSanitizer {
|
||||
/// The mode of the HTML sanitizer.
|
||||
mode: HtmlSanitizerMode,
|
||||
|
@ -70,6 +70,7 @@ js = ["ruma-common/js"]
|
||||
# Convenience features
|
||||
rand = ["ruma-common/rand"]
|
||||
markdown = ["ruma-common/markdown"]
|
||||
html = ["dep:ruma-html", "ruma-common/html"]
|
||||
|
||||
# Everything except compat, js and unstable features
|
||||
full = [
|
||||
@ -86,6 +87,7 @@ full = [
|
||||
"push-gateway-api",
|
||||
"rand",
|
||||
"markdown",
|
||||
"html",
|
||||
]
|
||||
|
||||
# Enable all compatibility hacks. Deprecated.
|
||||
@ -189,7 +191,6 @@ unstable-msc3954 = ["ruma-common/unstable-msc3954"]
|
||||
unstable-msc3955 = ["ruma-common/unstable-msc3955"]
|
||||
unstable-msc3956 = ["ruma-common/unstable-msc3956"]
|
||||
unstable-pdu = ["ruma-common/unstable-pdu"]
|
||||
unstable-sanitize = ["ruma-common/unstable-sanitize"]
|
||||
unstable-unspecified = [
|
||||
"ruma-common/unstable-unspecified",
|
||||
"ruma-federation-api?/unstable-unspecified",
|
||||
@ -201,7 +202,6 @@ __ci = [
|
||||
"full",
|
||||
"compat-upload-signatures",
|
||||
"unstable-unspecified",
|
||||
"unstable-sanitize",
|
||||
"unstable-msc1767",
|
||||
"unstable-msc2409",
|
||||
"unstable-msc2448",
|
||||
@ -239,6 +239,7 @@ js_option = "0.1.1"
|
||||
ruma-common = { workspace = true }
|
||||
|
||||
ruma-client = { workspace = true, optional = true }
|
||||
ruma-html = { workspace = true, optional = true }
|
||||
ruma-server-util = { workspace = true, optional = true }
|
||||
ruma-signatures = { workspace = true, optional = true }
|
||||
ruma-state-res = { workspace = true, optional = true }
|
||||
|
@ -40,6 +40,7 @@
|
||||
//!
|
||||
//! * `rand`
|
||||
//! * `markdown`
|
||||
//! * `html`
|
||||
//!
|
||||
//! # Unstable features
|
||||
//!
|
||||
@ -52,8 +53,6 @@
|
||||
//! subject to change or removal.
|
||||
//! * `unstable-unspecified` -- Undocumented Matrix features that may be subject to change or
|
||||
//! removal.
|
||||
//! * `unstable-sanitize` -- Convenience methods for spec-compliant HTML sanitization that have not
|
||||
//! been thoroughly tested.
|
||||
//!
|
||||
//! # Common features
|
||||
//!
|
||||
@ -82,6 +81,9 @@ pub use ruma_client as client;
|
||||
#[cfg(feature = "events")]
|
||||
#[doc(inline)]
|
||||
pub use ruma_common::events;
|
||||
#[cfg(feature = "html")]
|
||||
#[doc(inline)]
|
||||
pub use ruma_html as html;
|
||||
#[cfg(feature = "server-util")]
|
||||
#[doc(inline)]
|
||||
pub use ruma_server_util as server_util;
|
||||
|
Loading…
x
Reference in New Issue
Block a user