// monolith/src/html.rs

use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::StrTendril;
use html5ever::tendril::TendrilSink;
use http::{is_valid_url, resolve_url, retrieve_asset};
use regex::Regex;
use std::default::Default;
use std::io;
use utils::data_to_dataurl;

lazy_static! {
    // Shared empty string, cloned wherever an empty fallback `String` is needed.
    static ref EMPTY_STRING: String = String::new();

    // Matches text that begins with one or more lowercase ASCII letters/digits
    // followed by a colon. Callers lowercase their input before matching.
    static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();

    // Matches a complete, lowercased `rel` value that designates an icon.
    // The alternation is grouped so that `^` and `$` apply to every alternative;
    // without the group only `icon` was anchored at the start and only
    // `fluid-icon` at the end, so values merely containing one of the other
    // keywords (or merely starting with "icon") were accepted.
    static ref ICON_VALUES: Regex = Regex::new(
        r"^(?:icon|shortcut icon|mask-icon|apple-touch-icon|fluid-icon)$"
    ).unwrap();
}
/// Data URL of a PNG image (per its name, a single transparent pixel).
/// It is written into image-bearing attributes in place of the real asset
/// when images are disabled.
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
/// Names of the DOM event-handler attributes whose values are blanked on every
/// element when JavaScript is disabled.
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
    // Input
    "onfocus",
    "onblur",
    "onselect",
    "onchange",
    "onsubmit",
    "onreset",
    "onkeydown",
    "onkeypress",
    "onkeyup",
    // Mouse
    "onmouseover",
    "onmouseout",
    "onmousedown",
    "onmouseup",
    "onmousemove",
    // Click
    "onclick",
    "ondblclick",
    // Load
    "onload",
    "onunload",
    "onabort",
    "onerror",
    "onresize",
];
fn get_parent_node_name(node: &Handle) -> String {
let parent = node.parent.take().clone();
let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
match &parent_node.data {
2019-08-25 05:06:40 +02:00
NodeData::Document => { EMPTY_STRING.clone() }
NodeData::Doctype { .. } => { EMPTY_STRING.clone() }
NodeData::Text { .. } => { EMPTY_STRING.clone() }
NodeData::Comment { .. } => { EMPTY_STRING.clone() }
NodeData::Element { ref name, attrs: _, .. } => {
2019-08-24 17:21:29 +02:00
name.local.as_ref().to_string()
}
NodeData::ProcessingInstruction { .. } => unreachable!()
}
}
pub fn walk_and_embed_assets(
url: &str,
node: &Handle,
opt_no_js: bool,
opt_no_images: bool,
opt_user_agent: &str,
2019-08-25 17:41:30 +02:00
opt_silent: bool,
opt_insecure: bool,
2019-08-23 20:33:18 +02:00
) {
2019-08-23 05:17:15 +02:00
match node.data {
NodeData::Document => {
// Dig deeper
for child in node.children.borrow().iter() {
2019-08-23 20:33:18 +02:00
walk_and_embed_assets(
&url, child,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
2019-08-23 05:17:15 +02:00
}
2019-08-23 20:24:45 +02:00
}
NodeData::Doctype { .. } => {}
NodeData::Text { .. } => {}
NodeData::Comment { .. } => {
2019-08-23 05:17:15 +02:00
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9]
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeData::Element {
ref name,
ref attrs,
..
} => {
2019-08-23 20:24:45 +02:00
let attrs_mut = &mut attrs.borrow_mut();
2019-08-23 05:17:15 +02:00
2019-08-24 02:16:16 +02:00
match name.local.as_ref() {
"link" => {
2019-08-25 05:06:40 +02:00
let mut link_type = "";
2019-08-24 02:16:16 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) {
2019-08-25 05:06:40 +02:00
link_type = "icon";
2019-08-24 02:16:16 +02:00
break;
} else if attr.value.to_string() == "stylesheet" {
2019-08-25 05:06:40 +02:00
link_type = "stylesheet";
2019-08-24 02:16:16 +02:00
break;
}
2019-08-23 05:17:15 +02:00
}
}
2019-08-25 05:06:40 +02:00
if link_type == "icon" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string()
)
.unwrap_or(EMPTY_STRING.clone());
let favicon_datauri = retrieve_asset(
&href_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-25 05:06:40 +02:00
attr.value.clear();
attr.value.push_slice(favicon_datauri.as_str());
}
}
}
} else if link_type == "stylesheet" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let css_datauri = retrieve_asset(
&href_full_url,
true,
"text/css",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-24 20:22:34 +02:00
attr.value.clear();
2019-08-25 05:06:40 +02:00
attr.value.push_slice(css_datauri.as_str());
}
}
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
2019-08-24 20:22:34 +02:00
}
2019-08-23 05:17:15 +02:00
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"img" => {
2019-08-23 05:17:15 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let value = attr.value.to_string();
// Ignore images with empty source (they're hopelessly broken)
if value == EMPTY_STRING.clone() {
continue;
}
2019-08-23 10:33:30 +02:00
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
2019-08-23 10:33:30 +02:00
} else {
2019-08-25 05:06:40 +02:00
let src_full_url: String = resolve_url(
&url,
&value,
2019-08-25 05:06:40 +02:00
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-23 20:33:18 +02:00
let img_datauri = retrieve_asset(
&src_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-23 10:33:30 +02:00
attr.value.clear();
2019-08-25 05:06:40 +02:00
attr.value.push_slice(img_datauri.as_str());
2019-08-23 10:33:30 +02:00
}
2019-08-23 05:17:15 +02:00
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"source" => {
2019-08-24 17:21:29 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "srcset" {
if get_parent_node_name(&node) == "picture" {
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
2019-08-25 05:06:40 +02:00
let srcset_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-24 17:21:29 +02:00
let source_datauri = retrieve_asset(
&srcset_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-24 17:21:29 +02:00
attr.value.clear();
2019-08-25 05:06:40 +02:00
attr.value.push_slice(source_datauri.as_str());
2019-08-24 17:21:29 +02:00
}
}
}
}
}
2019-08-25 05:06:40 +02:00
"a" => {
2019-08-23 05:17:15 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
// Don't touch email links or hrefs which begin with a hash sign
2019-08-24 20:48:10 +02:00
if attr.value.starts_with('#') || has_protocol(&attr.value) {
2019-08-23 05:17:15 +02:00
continue;
}
2019-08-25 05:06:40 +02:00
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
2019-08-23 05:17:15 +02:00
attr.value.clear();
2019-08-25 05:06:40 +02:00
attr.value.push_slice(href_full_url.as_str());
2019-08-23 05:17:15 +02:00
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"script" => {
2019-08-23 05:17:15 +02:00
if opt_no_js {
// Get rid of src and inner content of SCRIPT tags
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
attr.value.clear();
}
}
node.children.borrow_mut().clear();
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
2019-08-25 05:06:40 +02:00
let src_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-23 20:24:45 +02:00
let js_datauri = retrieve_asset(
&src_full_url,
true,
"application/javascript",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-23 05:17:15 +02:00
attr.value.clear();
2019-08-25 05:06:40 +02:00
attr.value.push_slice(js_datauri.as_str());
2019-08-23 05:17:15 +02:00
}
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"form" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" {
// Don't modify action that's already a full URL
2019-08-23 20:24:45 +02:00
if is_valid_url(&attr.value) {
continue;
}
2019-08-25 05:06:40 +02:00
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear();
2019-08-25 05:06:40 +02:00
attr.value.push_slice(href_full_url.as_str());
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"iframe" => {
2019-08-24 02:16:16 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
2019-08-27 04:57:10 +02:00
let value = attr.value.to_string();
2019-08-27 04:57:10 +02:00
// Ignore iframes with empty source (they cause infinite loops)
if value == EMPTY_STRING.clone() {
continue;
}
let src_full_url: String = resolve_url(&url, &value)
2019-08-25 05:06:40 +02:00
.unwrap_or(EMPTY_STRING.clone());
2019-08-24 02:16:16 +02:00
let iframe_data = retrieve_asset(
&src_full_url,
false,
"text/html",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
2019-08-25 05:06:40 +02:00
let dom = html_to_dom(&iframe_data);
walk_and_embed_assets(
&src_full_url,
&dom.document,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
2019-08-24 02:16:16 +02:00
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
let iframe_datauri = data_to_dataurl("text/html", &buf);
attr.value.clear();
attr.value.push_slice(iframe_datauri.as_str());
}
}
}
2019-08-25 05:06:40 +02:00
_ => {}
2019-08-23 05:17:15 +02:00
}
if opt_no_js {
// Get rid of JS event attributes
for attr in attrs_mut.iter_mut() {
if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) {
attr.value.clear();
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
// Dig deeper
for child in node.children.borrow().iter() {
2019-08-23 20:33:18 +02:00
walk_and_embed_assets(
&url,
child,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
2019-08-23 05:17:15 +02:00
}
2019-08-23 20:24:45 +02:00
}
2019-08-24 17:21:29 +02:00
NodeData::ProcessingInstruction { .. } => unreachable!()
2019-08-23 05:17:15 +02:00
}
}
/// Reports whether `url`, once lowercased, matches the `HAS_PROTOCOL` pattern,
/// i.e. whether it starts with something like `mailto:` or `https:`.
fn has_protocol(url: &str) -> bool {
    let lowered = url.to_lowercase();
    HAS_PROTOCOL.is_match(lowered.as_str())
}
/// Parses `data` as an HTML document and returns the resulting `RcDom`.
///
/// The text is fed to the parser as UTF-8 bytes using default parse options.
///
/// # Panics
/// Panics if reading from the in-memory input reports an error.
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
    let sink = RcDom::default();
    let parser = parse_document(sink, Default::default()).from_utf8();
    let mut input = data.as_bytes();
    parser.read_from(&mut input).unwrap()
}
pub fn print_dom(handle: &Handle) {
2019-08-23 05:17:15 +02:00
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
}
fn is_icon(attr_value: &str) -> bool {
2019-08-25 05:06:40 +02:00
ICON_VALUES.is_match(&attr_value.to_lowercase())
2019-08-23 05:17:15 +02:00
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html`, runs `walk_and_embed_assets` over it with JavaScript and
    /// images disabled (base URL `http://localhost`), and returns the
    /// serialized document.
    fn embed_without_js_and_images(html: &str) -> String {
        let dom = html_to_dom(html);
        walk_and_embed_assets("http://localhost", &dom.document, true, true, "", true, true);
        let mut buf: Vec<u8> = Vec::new();
        serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
        // Decode as UTF-8 rather than casting bytes to chars one by one,
        // which would corrupt any non-ASCII output.
        String::from_utf8(buf).unwrap()
    }

    #[test]
    fn test_is_icon() {
        assert!(is_icon("icon"));
        assert!(is_icon("Shortcut Icon"));
        assert!(is_icon("ICON"));
        assert!(is_icon("mask-icon"));
        assert!(is_icon("fluid-icon"));
        assert!(!is_icon("stylesheet"));
        assert!(!is_icon(""));
    }

    #[test]
    fn test_has_protocol() {
        assert!(has_protocol("mailto:somebody@somewhere.com?subject=hello"));
        assert!(has_protocol("tel:5551234567"));
        assert!(has_protocol("ftp:user:password@some-ftp-server.com"));
        assert!(has_protocol("javascript:void(0)"));
        assert!(has_protocol("http://news.ycombinator.com"));
        assert!(has_protocol("https://github.com"));
        assert!(!has_protocol("//some-hostname.com/some-file.html"));
        assert!(!has_protocol("some-hostname.com/some-file.html"));
        assert!(!has_protocol("/some-file.html"));
        assert!(!has_protocol(""));
        assert!(has_protocol("MAILTO:somebody@somewhere.com?subject=hello"));
    }

    #[test]
    fn test_get_parent_node_name() {
        let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
        let dom = html_to_dom(html);
        let mut count = 0;

        /// Visits every node, counting it and checking each element's parent name.
        fn test_walk(node: &Handle, visited: &mut usize) {
            *visited += 1;
            match &node.data {
                NodeData::Document => {
                    for child in node.children.borrow().iter() {
                        test_walk(child, visited);
                    }
                }
                NodeData::Doctype { .. } | NodeData::Text { .. } | NodeData::Comment { .. } => (),
                NodeData::Element { ref name, .. } => {
                    let node_name = name.local.as_ref().to_string();
                    let parent_node_name = get_parent_node_name(node);
                    match node_name.as_str() {
                        "head" | "body" => assert_eq!(parent_node_name, "html"),
                        "div" => assert_eq!(parent_node_name, "body"),
                        "p" => assert_eq!(parent_node_name, "div"),
                        _ => (),
                    }
                    for child in node.children.borrow().iter() {
                        test_walk(child, visited);
                    }
                }
                NodeData::ProcessingInstruction { .. } => unreachable!(),
            }
        }

        test_walk(&dom.document, &mut count);
        // document, doctype, html, head, body, div, p
        assert_eq!(count, 7);
    }

    #[test]
    fn test_walk_and_embed_assets() {
        assert_eq!(
            embed_without_js_and_images("<div><P></P></div>"),
            "<html><head></head><body><div><p></p></div></body></html>"
        );
    }

    #[test]
    fn test_walk_and_embed_assets_iframe() {
        assert_eq!(
            embed_without_js_and_images("<div><P></P><iframe src=\"\"></iframe></div>"),
            "<html><head></head><body><div><p></p><iframe src=\"\"></iframe></div></body></html>"
        );
    }

    #[test]
    fn test_walk_and_embed_assets_img() {
        assert_eq!(
            embed_without_js_and_images(
                "<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>"
            ),
            "<html><head></head><body><div>\
             <img src=\"data:image/png;base64,\
             iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
             lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
             </div></body></html>"
        );
    }

    #[test]
    fn test_walk_and_embed_assets_js() {
        assert_eq!(
            embed_without_js_and_images(
                "<div><script src=\"http://localhost/assets/some.js\"></script>\
                 <script>alert(1)</script></div>"
            ),
            "<html><head></head><body><div><script src=\"\"></script>\
             <script></script></div></body></html>"
        );
    }
}