2019-08-23 05:17:15 +02:00
|
|
|
extern crate html5ever;
|
|
|
|
|
|
|
|
use std::default::Default;
|
|
|
|
use std::io;
|
2019-08-23 09:26:05 +02:00
|
|
|
use http::{is_url, retrieve_asset, resolve_url};
|
2019-08-23 05:17:15 +02:00
|
|
|
|
|
|
|
use self::html5ever::parse_document;
|
|
|
|
use self::html5ever::rcdom::{Handle, NodeData, RcDom};
|
|
|
|
use self::html5ever::tendril::TendrilSink;
|
|
|
|
use self::html5ever::serialize::{SerializeOpts, serialize};
|
|
|
|
|
|
|
|
/// Classification of an element, computed by `walk_and_embed_assets` to select how the
/// element's attributes are rewritten.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NodeMatch {
    /// `<link>` whose `rel` attribute satisfies `is_icon`.
    Icon,
    /// `<img>` element.
    Image,
    /// `<link>` whose `rel` attribute is `stylesheet`.
    StyleSheet,
    /// `<a>` element.
    Anchor,
    /// `<script>` element.
    Script,
    /// `<form>` element.
    Form,
    /// Any other element; no URL-bearing attribute is rewritten.
    Other,
}
|
|
|
|
|
2019-08-23 10:33:30 +02:00
|
|
|
/// Value written into the `src` attribute of `<img>` elements when `opt_no_images` is set
/// (see `walk_and_embed_assets`).
// NOTE(review): the name suggests a single-pixel PNG data URI, but the string is empty in this
// revision -- confirm whether the value was lost along the way.
static PNG_PIXEL: &str = "";
|
|
|
|
|
2019-08-23 05:17:15 +02:00
|
|
|
/// Lower-case names of the inline event-handler attributes whose values are emptied by
/// `walk_and_embed_assets` when `opt_no_js` is set.
static JS_DOM_EVENT_ATTRS: [&str; 21] = [
    // Input events
    "onfocus",
    "onblur",
    "onselect",
    "onchange",
    "onsubmit",
    "onreset",
    "onkeydown",
    "onkeypress",
    "onkeyup",
    // Mouse movement and button events
    "onmouseover",
    "onmouseout",
    "onmousedown",
    "onmouseup",
    "onmousemove",
    // Click events
    "onclick",
    "ondblclick",
    // Load and lifecycle events
    "onload",
    "onunload",
    "onabort",
    "onerror",
    "onresize",
];
|
|
|
|
|
2019-08-23 10:33:30 +02:00
|
|
|
pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_images: bool) {
|
2019-08-23 05:17:15 +02:00
|
|
|
match node.data {
|
|
|
|
NodeData::Document => {
|
|
|
|
// Dig deeper
|
|
|
|
for child in node.children.borrow().iter() {
|
2019-08-23 10:33:30 +02:00
|
|
|
walk_and_embed_assets(&url, child, opt_no_js, opt_no_images);
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
|
|
|
|
NodeData::Doctype {
|
|
|
|
name: _,
|
|
|
|
public_id: _,
|
|
|
|
system_id: _,
|
|
|
|
} => {},
|
|
|
|
|
|
|
|
NodeData::Text { contents: _, } => {},
|
|
|
|
|
|
|
|
NodeData::Comment { contents: _, } => {
|
|
|
|
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
|
|
|
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
|
|
|
// since that's not part of W3C standard and gets ignored by browsers other than IE [5, 9]
|
|
|
|
},
|
|
|
|
|
|
|
|
NodeData::Element {
|
|
|
|
ref name,
|
|
|
|
ref attrs,
|
|
|
|
..
|
|
|
|
} => {
|
|
|
|
let ref mut attrs_mut = attrs.borrow_mut();
|
|
|
|
let mut found = NodeMatch::Other;
|
|
|
|
|
|
|
|
if &name.local == "link" {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "rel" {
|
|
|
|
if is_icon(&attr.value.to_string()) {
|
|
|
|
found = NodeMatch::Icon;
|
|
|
|
break;
|
|
|
|
} else if attr.value.to_string() == "stylesheet" {
|
|
|
|
found = NodeMatch::StyleSheet;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if &name.local == "img" {
|
|
|
|
found = NodeMatch::Image;
|
|
|
|
} else if &name.local == "a" {
|
|
|
|
found = NodeMatch::Anchor;
|
|
|
|
} else if &name.local == "script" {
|
|
|
|
found = NodeMatch::Script;
|
2019-08-23 09:26:05 +02:00
|
|
|
} else if &name.local == "form" {
|
|
|
|
found = NodeMatch::Form;
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
match found {
|
|
|
|
NodeMatch::Icon => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
|
|
|
let favicon_datauri = retrieve_asset(&href_full_url.unwrap(), true, "");
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(favicon_datauri.unwrap().as_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
|
|
|
NodeMatch::Image => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
2019-08-23 10:33:30 +02:00
|
|
|
if opt_no_images {
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(PNG_PIXEL);
|
|
|
|
} else {
|
|
|
|
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
|
|
|
let img_datauri = retrieve_asset(&src_full_url.unwrap(), true, "");
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(img_datauri.unwrap().as_str());
|
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
|
|
|
NodeMatch::Anchor => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
// Do not touch hrefs which begin with a hash sign
|
|
|
|
if attr.value.to_string().chars().nth(0) == Some('#') {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(href_full_url.unwrap().as_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
|
|
|
NodeMatch::StyleSheet => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
2019-08-23 10:33:30 +02:00
|
|
|
let css_datauri = retrieve_asset(&href_full_url.unwrap(), true, "text/css");
|
2019-08-23 05:17:15 +02:00
|
|
|
attr.value.clear();
|
2019-08-23 10:33:30 +02:00
|
|
|
attr.value.push_slice(css_datauri.unwrap().as_str());
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
|
|
|
NodeMatch::Script => {
|
|
|
|
if opt_no_js {
|
|
|
|
// Get rid of src and inner content of SCRIPT tags
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
|
|
|
attr.value.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
node.children.borrow_mut().clear();
|
|
|
|
} else {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
2019-08-23 10:33:30 +02:00
|
|
|
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
|
|
|
let js_datauri = retrieve_asset(&src_full_url.unwrap(), true, "application/javascript");
|
2019-08-23 05:17:15 +02:00
|
|
|
attr.value.clear();
|
2019-08-23 10:33:30 +02:00
|
|
|
attr.value.push_slice(js_datauri.unwrap().as_str());
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
2019-08-23 09:26:05 +02:00
|
|
|
NodeMatch::Form => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "action" {
|
|
|
|
// Do not touch action props which are set to a URL
|
|
|
|
if is_url(&attr.value) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(href_full_url.unwrap().as_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
2019-08-23 05:17:15 +02:00
|
|
|
NodeMatch::Other => {},
|
|
|
|
}
|
|
|
|
|
|
|
|
if opt_no_js {
|
|
|
|
// Get rid of JS event attributes
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) {
|
|
|
|
attr.value.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Dig deeper
|
|
|
|
for child in node.children.borrow().iter() {
|
2019-08-23 10:33:30 +02:00
|
|
|
walk_and_embed_assets(&url, child, opt_no_js, opt_no_images);
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
|
|
|
|
NodeData::ProcessingInstruction { .. } => unreachable!(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
|
|
|
parse_document(RcDom::default(), Default::default())
|
|
|
|
.from_utf8()
|
|
|
|
.read_from(&mut data.as_bytes())
|
|
|
|
.unwrap()
|
|
|
|
}
|
|
|
|
|
2019-08-23 10:33:30 +02:00
|
|
|
pub fn print_dom(handle: &Handle) {
|
2019-08-23 09:26:05 +02:00
|
|
|
// TODO: append <meta http-equiv="Access-Control-Allow-Origin" content="'self'"/> to the <head> if opt_isolate
|
2019-08-23 05:17:15 +02:00
|
|
|
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns `true` when the value of a `<link rel="...">` attribute designates an icon.
///
/// `rel` holds a whitespace-separated set of link-type keywords that compare ASCII
/// case-insensitively, so the value is split into tokens and each token is checked against
/// the icon keywords `icon`, `mask-icon` and `apple-touch-icon`. The legacy `shortcut icon`
/// spelling matches through its `icon` token.
///
/// # Arguments
/// * `attr_value` - raw value of the `rel` attribute.
fn is_icon(attr_value: &str) -> bool {
    const ICON_KEYWORDS: [&str; 3] = ["icon", "mask-icon", "apple-touch-icon"];

    attr_value.split_whitespace().any(|token| {
        ICON_KEYWORDS
            .iter()
            .any(|keyword| token.eq_ignore_ascii_case(keyword))
    })
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// `is_icon` must accept each icon `rel` spelling and reject everything else.
    #[test]
    fn test_is_icon() {
        // Accepted spellings
        assert!(is_icon("icon"));
        assert!(is_icon("shortcut icon"));
        assert!(is_icon("mask-icon"));
        assert!(is_icon("apple-touch-icon"));

        // Rejected values
        assert!(!is_icon("stylesheet"));
        assert!(!is_icon(""));
    }
}
|