//! HTML 消毒器。 //! //! 基于 lol_html 清理不受信任的 HTML,限制允许的 tag/attribute/URL scheme, //! 分别提供文章正文(`clean_html`)与评论(`clean_comment_html`)两套白名单策略。 //! 仅在 `feature = "server"` 时执行。 #![allow(clippy::unused_unit, deprecated)] #[cfg(feature = "server")] use std::collections::HashSet; #[cfg(feature = "server")] fn default_allowed_tags() -> HashSet<&'static str> { let mut set = HashSet::new(); for tag in [ "a", "abbr", "acronym", "area", "article", "aside", "b", "bdi", "bdo", "blockquote", "br", "caption", "center", "cite", "code", "col", "colgroup", "data", "dd", "del", "details", "dfn", "div", "dl", "dt", "em", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "i", "img", "ins", "kbd", "li", "map", "mark", "nav", "ol", "p", "pre", "q", "rp", "rt", "rtc", "ruby", "s", "samp", "small", "span", "strike", "strong", "sub", "summary", "sup", "table", "tbody", "td", "th", "thead", "time", "tr", "tt", "u", "ul", "var", "wbr", ] { set.insert(tag); } set } #[cfg(feature = "server")] fn clean_content_tags() -> HashSet<&'static str> { let mut set = HashSet::new(); set.insert("script"); set.insert("style"); set } #[cfg(feature = "server")] fn default_allowed_schemes() -> HashSet<&'static str> { let mut set = HashSet::new(); for scheme in [ "bitcoin", "ftp", "ftps", "geo", "http", "https", "im", "irc", "ircs", "magnet", "mailto", "mms", "mx", "news", "nntp", "openpgp4fpr", "sip", "sms", "smsto", "ssh", "tel", "url", "webcal", "wtai", "xmpp", ] { set.insert(scheme); } set } #[cfg(feature = "server")] fn is_safe_url(url: &str, allowed_schemes: &HashSet<&str>, allow_data_uri: bool) -> bool { let trimmed = url.trim(); if trimmed.is_empty() { return true; } // 解析 scheme 并与白名单对比;data URI 与 javascript/vbscript 单独处理。 if let Some(colon_pos) = trimmed.find(':') { let scheme = &trimmed[..colon_pos]; let scheme_lower = scheme.to_lowercase(); if allowed_schemes.contains(scheme_lower.as_str()) { return true; } if scheme_lower == "data" { return allow_data_uri; } if scheme_lower == "javascript" || scheme_lower == "vbscript" { return false; } if scheme.contains(|c: char| c.is_ascii_whitespace()) { return false; } } if trimmed.starts_with('#') || trimmed.starts_with('/') { return true; } true } #[cfg(feature = "server")] /// HTML 消毒配置:白名单 tag/attribute、允许 URL scheme 与链接 rel。 struct SanitizerConfig { allowed_tags: HashSet<&'static str>, extra_generic_attrs: Vec<&'static str>, extra_tag_attrs: Vec<(&'static str, Vec<&'static str>)>, allowed_schemes: HashSet<&'static str>, allow_data_uri: bool, link_rel: Option<&'static str>, remove_tags: HashSet<&'static str>, } #[cfg(feature = "server")] fn sanitize(input: &str, config: &SanitizerConfig) -> String { let allowed_tags = config.allowed_tags.clone(); let remove_tags = config.remove_tags.clone(); let generic_attrs: HashSet<&str> = config .extra_generic_attrs .iter() .copied() .chain(["lang", "title"]) .collect(); let tag_attrs_map: std::collections::HashMap<&str, HashSet<&str>> = { let mut m = std::collections::HashMap::new(); let base = [ ("a", vec!["href", "hreflang"]), ("bdo", vec!["dir"]), ("blockquote", vec!["cite"]), ("col", vec!["align", "char", "charoff", "span"]), ("colgroup", vec!["align", "char", "charoff", "span"]), ("del", vec!["cite", "datetime"]), ("hr", vec!["align", "size", "width"]), ("img", vec!["align", "alt", "height", "src", "width"]), ("ins", vec!["cite", "datetime"]), ("ol", vec!["start"]), ("q", vec!["cite"]), ("table", vec!["align", "char", "charoff", "summary"]), ("tbody", vec!["align", "char", "charoff"]), ( "td", vec!["align", "char", "charoff", "colspan", "headers", "rowspan"], ), ("tfoot", vec!["align", "char", "charoff"]), ( "th", vec![ "align", "char", "charoff", "colspan", "headers", "rowspan", "scope", ], ), ("thead", vec!["align", "char", "charoff"]), ("tr", vec!["align", "char", "charoff"]), ]; for (tag, attrs) in &base { m.insert(*tag, attrs.iter().copied().collect()); } for (tag, attrs) in &config.extra_tag_attrs { m.entry(tag) .or_insert_with(HashSet::new) .extend(attrs.iter().copied()); } m }; let allowed_schemes = config.allowed_schemes.clone(); let allow_data_uri = config.allow_data_uri; let link_rel = config.link_rel; let element_handler = move |el: &mut lol_html::html_content::Element| { let tag = el.tag_name().to_lowercase(); if remove_tags.contains(tag.as_str()) { el.remove(); return Ok(()); } if !allowed_tags.contains(tag.as_str()) { el.remove_and_keep_content(); return Ok(()); } let allowed_for_tag: HashSet<&str> = { let mut s = generic_attrs.clone(); if let Some(tag_specific) = tag_attrs_map.get(tag.as_str()) { s.extend(tag_specific.iter().copied()); } s }; let attrs_to_remove: Vec = el .attributes() .iter() .filter_map(|attr| { let name = attr.name(); let name_lower = name.to_lowercase(); // 仅保留白名单属性;对 href/src/cite 额外校验 URL 安全性。 if allowed_for_tag.contains(name_lower.as_str()) { if name_lower == "href" || name_lower == "src" || name_lower == "cite" { let val = attr.value(); if !is_safe_url(&val, &allowed_schemes, allow_data_uri) { return Some(name); } } None } else { Some(name) } }) .collect(); for attr_name in attrs_to_remove { el.remove_attribute(&attr_name); } if link_rel.is_some() && tag == "a" { if let Some(rel) = link_rel { let existing = el.get_attribute("rel").unwrap_or_default(); if existing != rel { el.set_attribute("rel", rel).ok(); } } } Ok(()) }; lol_html::rewrite_str( input, lol_html::RewriteStrSettings { element_content_handlers: vec![lol_html::element!("*", element_handler)], document_content_handlers: vec![lol_html::doc_comments!(|c| { c.remove(); Ok(()) })], ..lol_html::RewriteStrSettings::new() }, ) .unwrap_or_default() } #[cfg(feature = "server")] /// 文章正文 HTML 清理:允许较完整的标签与 data URI,外链添加 `noopener noreferrer`。 pub fn clean_html(input: &str) -> String { let config = SanitizerConfig { allowed_tags: default_allowed_tags(), extra_generic_attrs: vec![ "class", "aria-hidden", "aria-label", "id", "role", "accesskey", "title", ], extra_tag_attrs: vec![ ("a", vec!["class", "aria-hidden", "aria-label"]), ("span", vec!["class"]), ("h1", vec!["id", "class"]), ("h2", vec!["id", "class"]), ("h3", vec!["id", "class"]), ("h4", vec!["id", "class"]), ("h5", vec!["id", "class"]), ("h6", vec!["id", "class"]), ], allowed_schemes: default_allowed_schemes(), allow_data_uri: true, link_rel: Some("noopener noreferrer"), remove_tags: clean_content_tags(), }; sanitize(input, &config) } #[cfg(feature = "server")] /// 评论 HTML 清理:移除图片与折叠块,禁用 data URI,外链添加 `nofollow noopener`。 pub fn clean_comment_html(input: &str) -> String { let mut tags = default_allowed_tags(); tags.remove("img"); tags.remove("details"); tags.remove("summary"); let config = SanitizerConfig { allowed_tags: tags, extra_generic_attrs: vec![ "class", "title", "aria-hidden", "aria-label", "role", "accesskey", ], extra_tag_attrs: vec![ ("a", vec!["class", "aria-hidden", "aria-label"]), ("span", vec!["class"]), ], allowed_schemes: default_allowed_schemes(), allow_data_uri: false, link_rel: Some("nofollow noopener"), remove_tags: clean_content_tags(), }; sanitize(input, &config) } #[cfg(all(test, feature = "server"))] mod tests { use super::*; #[test] fn safe_tags_preserved() { assert_eq!(clean_html("

safe

"), "

safe

"); assert_eq!( clean_html("

bold

"), "

bold

" ); } #[test] fn script_and_style_removed() { assert_eq!( clean_html("

ok

"), "

ok

" ); } #[test] fn id_and_class_preserved() { assert_eq!( clean_html("

x

"), "

x

" ); assert_eq!( clean_html("

x

"), "

x

" ); } #[test] fn javascript_url_stripped() { assert_eq!( clean_html("x"), "x" ); } #[test] fn vbscript_url_stripped() { assert_eq!( clean_html("x"), "x" ); } #[test] fn unknown_tags_removed_content_kept() { assert_eq!(clean_html("keep me"), "keep me"); } #[test] fn comment_removes_img_details_summary() { assert_eq!( clean_comment_html("
sumbody
"), "sumbody" ); } #[test] fn comment_removes_data_uris() { assert_eq!( clean_comment_html("x"), "x" ); } // ---- is_safe_url 直接分支测试 ---- // is_safe_url 是安全敏感的内部函数,以下测试锁定其各分支的行为契约。 #[test] fn is_safe_url_allows_https() { let schemes = default_allowed_schemes(); assert!(is_safe_url("https://example.com", &schemes, false)); assert!(is_safe_url("http://example.com", &schemes, false)); } #[test] fn is_safe_url_rejects_javascript() { let schemes = default_allowed_schemes(); assert!(!is_safe_url("javascript:alert(1)", &schemes, false)); } #[test] fn is_safe_url_rejects_vbscript() { let schemes = default_allowed_schemes(); assert!(!is_safe_url("vbscript:msgbox", &schemes, false)); } #[test] fn is_safe_url_data_uri_respects_flag() { let schemes = default_allowed_schemes(); // 文章正文允许 data URI assert!(is_safe_url("data:image/png;base64,iVBOR", &schemes, true)); // 评论禁用 data URI assert!(!is_safe_url("data:image/png;base64,iVBOR", &schemes, false)); } #[test] fn is_safe_url_allows_relative_and_fragment() { let schemes = default_allowed_schemes(); // 绝对路径 assert!(is_safe_url("/path/to/page", &schemes, false)); // 锚点 assert!(is_safe_url("#section", &schemes, false)); } #[test] fn is_safe_url_empty_is_safe() { let schemes = default_allowed_schemes(); // 空 URL(如 img 无 src)视为安全。 assert!(is_safe_url("", &schemes, false)); assert!(is_safe_url(" ", &schemes, false)); } #[test] fn is_safe_url_allows_other_whitelisted_schemes() { let schemes = default_allowed_schemes(); // mailto / tel / ftp 等均在默认白名单中。 assert!(is_safe_url("mailto:user@example.com", &schemes, false)); assert!(is_safe_url("tel:+8613800138000", &schemes, false)); assert!(is_safe_url("ftp://example.com/file", &schemes, false)); } #[test] fn is_safe_url_rejects_scheme_with_whitespace() { let schemes = default_allowed_schemes(); // 含空格的 scheme 名是已知的混淆手法,应被拒绝。 assert!(!is_safe_url("java\tscript:alert(1)", &schemes, false)); } #[test] fn is_safe_url_scheme_matching_is_case_insensitive() { let schemes = default_allowed_schemes(); // scheme 大小写不敏感:HTTPS 与 https 等价。 assert!(is_safe_url("HTTPS://example.com", &schemes, false)); assert!(!is_safe_url("JAVASCRIPT:alert(1)", &schemes, false)); } }