html: Allow to navigate through the HTML tree

This commit is contained in:
Kévin Commaille 2024-04-22 17:28:29 +02:00 committed by Kévin Commaille
parent d36f485b19
commit 6e763ee5e7
5 changed files with 294 additions and 9 deletions

View File

@ -8,6 +8,8 @@ Breaking Changes:
Improvements:
- Add support for deprecated HTML tags, according to Matrix 1.10
- Allow navigating through the HTML tree with `Html::first_child()`,
`Html::last_child()` or `Html::children()`
# 0.1.0

View File

@ -1,4 +1,4 @@
use std::{collections::BTreeSet, fmt, io};
use std::{collections::BTreeSet, fmt, io, iter::FusedIterator};
use as_variant::as_variant;
use html5ever::{
@ -122,6 +122,30 @@ impl Html {
pub(crate) fn root(&self) -> &Node {
    // Look up the node stored under the root ID in the node list.
    let root_id = self.root_id();
    &self.nodes[root_id]
}
/// Whether the root node of the HTML has at least one child node.
pub fn has_children(&self) -> bool {
    let root = self.root();
    root.first_child.is_some()
}
/// Get the first child node of the root node of the HTML.
///
/// Returns `None` if the root node has no children.
pub fn first_child(&self) -> Option<NodeRef<'_>> {
    let first_id = self.root().first_child?;
    Some(NodeRef::new(self, first_id))
}
/// Get the last child node of the root node of the HTML.
///
/// Returns `None` if the root node has no children.
pub fn last_child(&self) -> Option<NodeRef<'_>> {
    let last_id = self.root().last_child?;
    Some(NodeRef::new(self, last_id))
}
/// Get an iterator over the children of the root node of the HTML.
///
/// The iterator starts at the first child of the root node and is empty if the root node has
/// no children.
pub fn children(&self) -> Children<'_> {
    let start_node = self.first_child();
    Children::new(start_node)
}
}
impl Default for Html {
@ -329,6 +353,11 @@ impl Node {
as_variant!(&mut self.data, NodeData::Element)
}
/// Returns the text content of this `Node`, if it is a `NodeData::Text`.
fn as_text(&self) -> Option<&StrTendril> {
as_variant!(&self.data, NodeData::Text)
}
/// Returns the mutable text content of this `Node`, if it is a `NodeData::Text`.
fn as_text_mut(&mut self) -> Option<&mut StrTendril> {
as_variant!(&mut self.data, NodeData::Text)
@ -375,9 +404,9 @@ impl Node {
}
/// The data of a `Node`.
#[derive(Debug)]
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_enums)]
pub(crate) enum NodeData {
pub enum NodeData {
/// The root node of the `Html`.
Document,
@ -392,7 +421,7 @@ pub(crate) enum NodeData {
}
/// The data of an HTML element.
#[derive(Debug)]
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_structs)]
pub struct ElementData {
/// The qualified name of the element.
@ -402,6 +431,123 @@ pub struct ElementData {
pub attrs: BTreeSet<Attribute>,
}
/// A reference to an HTML node.
///
/// Pairs a borrowed node with the `Html` that contains it, which is what allows resolving the
/// IDs of the node's relatives (parent, siblings, children) into other `NodeRef`s.
///
/// Both fields are shared references, so this type is `Copy`.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct NodeRef<'a> {
/// The `Html` struct containing the nodes.
pub(crate) html: &'a Html,
/// The referenced node.
pub(crate) node: &'a Node,
}
impl<'a> NodeRef<'a> {
    /// Create a `NodeRef` to the node stored under `id` in the given `html`.
    fn new(html: &'a Html, id: usize) -> Self {
        let node = &html.nodes[id];
        Self { html, node }
    }

    /// Create a `NodeRef` to the node stored under `id` in the same HTML as this node.
    fn with_id(&self, id: usize) -> Self {
        Self::new(self.html, id)
    }

    /// Get the data of this node.
    pub fn data(&self) -> &'a NodeData {
        &self.node.data
    }

    /// Get the element data of this node.
    ///
    /// Returns `None` if this node is not a `NodeData::Element`.
    pub fn as_element(&self) -> Option<&'a ElementData> {
        self.node.as_element()
    }

    /// Get the text content of this node.
    ///
    /// Returns `None` if this node is not a `NodeData::Text`.
    pub fn as_text(&self) -> Option<&'a StrTendril> {
        self.node.as_text()
    }

    /// Get the parent node of this node.
    ///
    /// Returns `None` if the parent is the root node.
    pub fn parent(&self) -> Option<NodeRef<'a>> {
        self.node
            .parent
            // The root node is never handed out to users, so a parent that is the root is
            // reported as no parent at all.
            .filter(|&parent_id| parent_id != self.html.root_id())
            .map(|parent_id| self.with_id(parent_id))
    }

    /// Get the sibling node that comes right after this node.
    ///
    /// Returns `None` if this is the last of its siblings.
    pub fn next_sibling(&self) -> Option<NodeRef<'a>> {
        self.node.next_sibling.map(|id| self.with_id(id))
    }

    /// Get the sibling node that comes right before this node.
    ///
    /// Returns `None` if this is the first of its siblings.
    pub fn prev_sibling(&self) -> Option<NodeRef<'a>> {
        self.node.prev_sibling.map(|id| self.with_id(id))
    }

    /// Whether this node has at least one child node.
    pub fn has_children(&self) -> bool {
        let node = self.node;
        node.first_child.is_some()
    }

    /// Get the first child node of this node.
    ///
    /// Returns `None` if this node has no children.
    pub fn first_child(&self) -> Option<NodeRef<'a>> {
        self.node.first_child.map(|id| self.with_id(id))
    }

    /// Get the last child node of this node.
    ///
    /// Returns `None` if this node has no children.
    pub fn last_child(&self) -> Option<NodeRef<'a>> {
        self.node.last_child.map(|id| self.with_id(id))
    }

    /// Get an iterator over the children of this node, starting at its first child.
    pub fn children(&self) -> Children<'a> {
        let start_node = self.first_child();
        Children::new(start_node)
    }
}
/// An iterator through the children of a node.
///
/// Yields the node it was started from, then each following sibling in order.
///
/// Can be constructed with [`Html::children()`] or [`NodeRef::children()`].
#[derive(Debug, Clone, Copy)]
pub struct Children<'a> {
/// The node yielded by the next call to `next()`, or `None` when the iteration is over.
next: Option<NodeRef<'a>>,
}
impl<'a> Children<'a> {
    /// Create a `Children` iterator whose first item is `start_node`.
    ///
    /// Passing `None` creates an iterator that yields nothing.
    fn new(start_node: Option<NodeRef<'a>>) -> Self {
        let next = start_node;
        Self { next }
    }
}
impl<'a> Iterator for Children<'a> {
    type Item = NodeRef<'a>;

    /// Yield the pending node and queue its next sibling for the following call.
    fn next(&mut self) -> Option<Self::Item> {
        // Taking the pending node leaves `None` behind, which is already the right state when
        // the iteration is over.
        let current = self.next.take()?;
        self.next = current.next_sibling();
        Some(current)
    }
}
// Once the pending node is `None`, `next()` returns `None` without setting a new one, so the
// iterator keeps returning `None` after it is exhausted.
impl<'a> FusedIterator for Children<'a> {}
#[cfg(test)]
mod tests {
use super::Html;

View File

@ -14,8 +14,4 @@ mod helpers;
mod html;
mod sanitizer_config;
pub use self::{
helpers::*,
html::{ElementData, Html},
sanitizer_config::SanitizerConfig,
};
pub use self::{helpers::*, html::*, sanitizer_config::SanitizerConfig};

View File

@ -1 +1,2 @@
mod navigate;
mod sanitize;

View File

@ -0,0 +1,140 @@
use ruma_html::Html;
// Parses a small HTML fragment and walks the whole resulting tree, checking every navigation
// accessor (`parent`, `next_sibling`, `prev_sibling`, `has_children`, `first_child`,
// `last_child` and `children`) on each node.
//
// Expected tree:
//
//   h1
//     "Title"
//   div class="text"
//     p
//       "This is some "
//       em
//         "text"
#[test]
fn navigate_tree() {
let raw_html = "\
<h1>Title</h1>\
<div class=\"text\">\
<p>This is some <em>text</em></p>\
</div>\
";
let html = Html::parse(raw_html);
// Top level: the children of the root node.
assert!(html.has_children());
assert!(html.first_child().is_some());
assert!(html.last_child().is_some());
let mut html_children = html.children();
// `<h1>` element.
let h1_node = html_children.next().unwrap();
let h1_element = h1_node.as_element().unwrap();
assert_eq!(&h1_element.name.local, "h1");
assert!(h1_element.attrs.is_empty());
// Top-level nodes report no parent: `parent()` does not expose the root node.
assert!(h1_node.parent().is_none());
assert!(h1_node.next_sibling().is_some());
assert!(h1_node.prev_sibling().is_none());
assert!(h1_node.has_children());
assert!(h1_node.first_child().is_some());
assert!(h1_node.last_child().is_some());
let mut h1_children = h1_node.children();
// Text of `<h1>` element.
let h1_text_node = h1_children.next().unwrap();
let h1_text = h1_text_node.as_text().unwrap();
assert_eq!(h1_text.as_ref(), "Title");
assert!(h1_text_node.parent().is_some());
assert!(h1_text_node.next_sibling().is_none());
assert!(h1_text_node.prev_sibling().is_none());
assert!(!h1_text_node.has_children());
assert!(h1_text_node.first_child().is_none());
assert!(h1_text_node.last_child().is_none());
// A node without children must give an empty iterator.
let mut h1_text_children = h1_text_node.children();
assert!(h1_text_children.next().is_none());
// The text was the only child of `<h1>`.
assert!(h1_children.next().is_none());
// `<div>` element.
let div_node = html_children.next().unwrap();
let div_element = div_node.as_element().unwrap();
assert_eq!(&div_element.name.local, "div");
assert_eq!(div_element.attrs.len(), 1);
let class_attr = div_element.attrs.first().unwrap();
assert_eq!(&class_attr.name.local, "class");
assert_eq!(class_attr.value.as_ref(), "text");
// Also a top-level node, so no parent is reported.
assert!(div_node.parent().is_none());
assert!(div_node.next_sibling().is_none());
assert!(div_node.prev_sibling().is_some());
assert!(div_node.has_children());
assert!(div_node.first_child().is_some());
assert!(div_node.last_child().is_some());
let mut div_children = div_node.children();
// `<p>` element.
let p_node = div_children.next().unwrap();
let p_element = p_node.as_element().unwrap();
assert_eq!(&p_element.name.local, "p");
assert!(p_element.attrs.is_empty());
assert!(p_node.parent().is_some());
assert!(p_node.next_sibling().is_none());
assert!(p_node.prev_sibling().is_none());
assert!(p_node.has_children());
assert!(p_node.first_child().is_some());
assert!(p_node.last_child().is_some());
let mut p_children = p_node.children();
// Text of `<p>` element.
let p_text_node = p_children.next().unwrap();
let p_text = p_text_node.as_text().unwrap();
assert_eq!(p_text.as_ref(), "This is some ");
assert!(p_text_node.parent().is_some());
assert!(p_text_node.next_sibling().is_some());
assert!(p_text_node.prev_sibling().is_none());
assert!(!p_text_node.has_children());
assert!(p_text_node.first_child().is_none());
assert!(p_text_node.last_child().is_none());
let mut p_text_children = p_text_node.children();
assert!(p_text_children.next().is_none());
// `<em>` element.
let em_node = p_children.next().unwrap();
let em_element = em_node.as_element().unwrap();
assert_eq!(&em_element.name.local, "em");
assert!(em_element.attrs.is_empty());
assert!(em_node.parent().is_some());
assert!(em_node.next_sibling().is_none());
assert!(em_node.prev_sibling().is_some());
assert!(em_node.has_children());
assert!(em_node.first_child().is_some());
assert!(em_node.last_child().is_some());
let mut em_children = em_node.children();
// Text of `<em>` element.
let em_text_node = em_children.next().unwrap();
let em_text = em_text_node.as_text().unwrap();
assert_eq!(em_text.as_ref(), "text");
assert!(em_text_node.parent().is_some());
assert!(em_text_node.next_sibling().is_none());
assert!(em_text_node.prev_sibling().is_none());
assert!(!em_text_node.has_children());
assert!(em_text_node.first_child().is_none());
assert!(em_text_node.last_child().is_none());
let mut em_text_children = em_text_node.children();
assert!(em_text_children.next().is_none());
// Every remaining iterator must now be exhausted, from the innermost level outwards.
assert!(em_children.next().is_none());
assert!(p_children.next().is_none());
assert!(div_children.next().is_none());
assert!(html_children.next().is_none());
}