monolith/src/html.rs

270 lines
9.2 KiB
Rust
Raw Normal View History

2019-08-23 20:24:45 +02:00
use http::{is_valid_url, resolve_url, retrieve_asset};
2019-08-23 05:17:15 +02:00
use std::default::Default;
use std::io;
2019-08-23 20:24:45 +02:00
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::TendrilSink;
2019-08-23 05:17:15 +02:00
/// Classification of an element, used by `walk_and_embed_assets` to decide which of its
/// attributes (if any) carries a URL that has to be rewritten.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NodeMatch {
    /// `<link>` whose `rel` attribute is recognised by `is_icon`.
    Icon,
    /// `<img>` element.
    Image,
    /// `<link>` whose `rel` attribute is `stylesheet`.
    StyleSheet,
    /// `<a>` element.
    Anchor,
    /// `<script>` element.
    Script,
    /// `<form>` element.
    Form,
    /// Any other element; none of its URL attributes are rewritten.
    Other,
}
2019-08-23 20:24:45 +02:00
/// Data URI of a 1x1 transparent PNG, written into `<img src>` in place of the real image when
/// images are disabled. An empty `src` would instead render as a broken image.
const PNG_PIXEL: &str = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
2019-08-23 10:33:30 +02:00
2019-08-23 20:24:45 +02:00
/// Names of the inline DOM event-handler attributes whose values `walk_and_embed_assets`
/// blanks when it is called with `opt_no_js` set.
///
/// All entries are lowercase; the lookup lowercases the attribute name before comparing.
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
    // Input
    "onfocus",
    "onblur",
    "onselect",
    "onchange",
    "onsubmit",
    "onreset",
    "onkeydown",
    "onkeypress",
    "onkeyup",
    // Mouse
    "onmouseover",
    "onmouseout",
    "onmousedown",
    "onmouseup",
    "onmousemove",
    // Click
    "onclick",
    "ondblclick",
    // Load
    "onload",
    "onunload",
    "onabort",
    "onerror",
    "onresize",
];
2019-08-23 20:33:18 +02:00
pub fn walk_and_embed_assets(
url: &str,
node: &Handle,
opt_no_js: bool,
opt_no_images: bool,
opt_user_agent: &str,
) {
2019-08-23 05:17:15 +02:00
match node.data {
NodeData::Document => {
// Dig deeper
for child in node.children.borrow().iter() {
2019-08-23 20:33:18 +02:00
walk_and_embed_assets(
&url, child,
opt_no_js,
opt_no_images,
opt_user_agent,
);
2019-08-23 05:17:15 +02:00
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
2019-08-23 20:24:45 +02:00
NodeData::Doctype { .. } => {}
2019-08-23 05:17:15 +02:00
2019-08-23 20:24:45 +02:00
NodeData::Text { .. } => {}
2019-08-23 05:17:15 +02:00
2019-08-23 20:24:45 +02:00
NodeData::Comment { .. } => {
2019-08-23 05:17:15 +02:00
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and gets ignored by browsers other than IE [5, 9]
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeData::Element {
ref name,
ref attrs,
..
} => {
2019-08-23 20:24:45 +02:00
let attrs_mut = &mut attrs.borrow_mut();
2019-08-23 05:17:15 +02:00
let mut found = NodeMatch::Other;
if &name.local == "link" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) {
found = NodeMatch::Icon;
break;
} else if attr.value.to_string() == "stylesheet" {
found = NodeMatch::StyleSheet;
break;
}
}
}
} else if &name.local == "img" {
found = NodeMatch::Image;
} else if &name.local == "a" {
found = NodeMatch::Anchor;
} else if &name.local == "script" {
found = NodeMatch::Script;
} else if &name.local == "form" {
found = NodeMatch::Form;
2019-08-23 05:17:15 +02:00
}
match found {
NodeMatch::Icon => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url = resolve_url(&url, &attr.value.to_string());
2019-08-23 20:33:18 +02:00
let favicon_datauri = retrieve_asset(
&href_full_url.unwrap(),
true,
"",
opt_user_agent,
);
2019-08-23 05:17:15 +02:00
attr.value.clear();
attr.value.push_slice(favicon_datauri.unwrap().as_str());
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeMatch::Image => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
2019-08-23 10:33:30 +02:00
if opt_no_images {
attr.value.clear();
attr.value.push_slice(PNG_PIXEL);
} else {
let src_full_url = resolve_url(&url, &attr.value.to_string());
2019-08-23 20:33:18 +02:00
let img_datauri = retrieve_asset(
&src_full_url.unwrap(),
true,
"",
opt_user_agent,
);
2019-08-23 10:33:30 +02:00
attr.value.clear();
attr.value.push_slice(img_datauri.unwrap().as_str());
}
2019-08-23 05:17:15 +02:00
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeMatch::Anchor => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
// Don't touch email links or hrefs which begin with a hash sign
if attr.value.starts_with('#') || attr.value.starts_with("mailto:") {
2019-08-23 05:17:15 +02:00
continue;
}
let href_full_url = resolve_url(&url, &attr.value.to_string());
attr.value.clear();
attr.value.push_slice(href_full_url.unwrap().as_str());
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeMatch::StyleSheet => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url = resolve_url(&url, &attr.value.to_string());
2019-08-23 20:33:18 +02:00
let css_datauri = retrieve_asset(
&href_full_url.unwrap(),
true,
"text/css",
opt_user_agent,
);
2019-08-23 05:17:15 +02:00
attr.value.clear();
2019-08-23 10:33:30 +02:00
attr.value.push_slice(css_datauri.unwrap().as_str());
2019-08-23 05:17:15 +02:00
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeMatch::Script => {
if opt_no_js {
// Get rid of src and inner content of SCRIPT tags
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
attr.value.clear();
}
}
node.children.borrow_mut().clear();
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
2019-08-23 10:33:30 +02:00
let src_full_url = resolve_url(&url, &attr.value.to_string());
2019-08-23 20:24:45 +02:00
let js_datauri = retrieve_asset(
&src_full_url.unwrap(),
true,
"application/javascript",
2019-08-23 20:33:18 +02:00
opt_user_agent,
2019-08-23 20:24:45 +02:00
);
2019-08-23 05:17:15 +02:00
attr.value.clear();
2019-08-23 10:33:30 +02:00
attr.value.push_slice(js_datauri.unwrap().as_str());
2019-08-23 05:17:15 +02:00
}
}
}
2019-08-23 20:24:45 +02:00
}
NodeMatch::Form => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" {
// Do not touch action props which are set to a URL
2019-08-23 20:24:45 +02:00
if is_valid_url(&attr.value) {
continue;
}
let href_full_url = resolve_url(&url, &attr.value.to_string());
attr.value.clear();
attr.value.push_slice(href_full_url.unwrap().as_str());
}
}
2019-08-23 20:24:45 +02:00
}
NodeMatch::Other => {}
2019-08-23 05:17:15 +02:00
}
if opt_no_js {
// Get rid of JS event attributes
for attr in attrs_mut.iter_mut() {
if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) {
attr.value.clear();
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
// Dig deeper
for child in node.children.borrow().iter() {
2019-08-23 20:33:18 +02:00
walk_and_embed_assets(
&url,
child,
opt_no_js,
opt_no_images,
opt_user_agent,
);
2019-08-23 05:17:15 +02:00
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeData::ProcessingInstruction { .. } => unreachable!(),
}
}
/// Parses `data` as a UTF-8 HTML document and returns the resulting DOM.
///
/// Panics if reading from the in-memory byte slice reports an error.
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
    let mut input = data.as_bytes();
    let sink = RcDom::default();
    let parser = parse_document(sink, Default::default()).from_utf8();
    parser.read_from(&mut input).unwrap()
}
2019-08-23 10:33:30 +02:00
pub fn print_dom(handle: &Handle) {
2019-08-23 05:17:15 +02:00
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
}
/// Returns `true` when `attr_value` (the value of a `<link rel="...">` attribute) names one of
/// the icon link types handled by this module.
///
/// The comparison ignores ASCII case and surrounding whitespace, because HTML treats `rel`
/// keywords case-insensitively: `rel="Shortcut Icon"` is as valid as `rel="shortcut icon"`.
fn is_icon(attr_value: &str) -> bool {
    const ICON_RELS: [&str; 4] = ["icon", "shortcut icon", "mask-icon", "apple-touch-icon"];

    let rel = attr_value.trim();
    ICON_RELS.iter().any(|known| rel.eq_ignore_ascii_case(known))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `is_icon` accepts an icon `rel` value and rejects a non-icon one.
    #[test]
    fn test_is_icon() {
        assert!(is_icon("icon"));
        assert!(!is_icon("stylesheet"));
    }
}