2019-08-24 20:48:10 +02:00
|
|
|
use html5ever::parse_document;
|
|
|
|
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
|
|
|
use html5ever::serialize::{serialize, SerializeOpts};
|
|
|
|
use html5ever::tendril::TendrilSink;
|
2019-08-23 20:24:45 +02:00
|
|
|
use http::{is_valid_url, resolve_url, retrieve_asset};
|
2019-08-24 20:48:10 +02:00
|
|
|
use regex::Regex;
|
2019-08-23 05:17:15 +02:00
|
|
|
use std::default::Default;
|
|
|
|
use std::io;
|
2019-08-24 02:16:16 +02:00
|
|
|
use utils::data_to_dataurl;
|
2019-08-23 05:17:15 +02:00
|
|
|
|
2019-08-24 20:48:10 +02:00
|
|
|
lazy_static! {
    // Shared empty string, cloned wherever an empty fallback value is needed.
    static ref EMPTY_STRING: String = String::new();
    // Matches a leading `scheme:` prefix made of lowercase letters and digits.
    // Callers lowercase the input before matching.
    static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
    // Matches the `rel` values that mark a <link> element as an icon.
    // The alternatives are grouped so that `^` and `$` anchor every one of them; without
    // the group the anchors only applied to the first and the last alternative, letting
    // values such as "iconic" or "xmask-iconx" pass as icons.
    static ref ICON_VALUES: Regex = Regex::new(
        r"^(?:icon|shortcut icon|mask-icon|apple-touch-icon)$"
    ).unwrap();
}
|
|
|
|
|
2019-08-24 05:06:06 +02:00
|
|
|
/// Placeholder image (per its name, a single transparent pixel) as a PNG `data:` URL.
///
/// Written into image-bearing attributes in place of the real asset when images are
/// disabled. The literal is split over two lines with a `\` continuation, which joins the
/// halves without inserting any character between them.
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
    iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
|
2019-08-23 10:33:30 +02:00
|
|
|
|
2019-08-23 20:24:45 +02:00
|
|
|
/// Inline DOM event-handler attribute names whose values are blanked when JavaScript is
/// being stripped from the document. Names are lowercase; lookups lowercase the attribute
/// name before searching this list.
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
    // Form fields and keyboard
    "onfocus", "onblur", "onselect", "onchange", "onsubmit", "onreset",
    "onkeydown", "onkeypress", "onkeyup",
    // Pointer movement and buttons
    "onmouseover", "onmouseout", "onmousedown", "onmouseup", "onmousemove",
    // Clicks
    "onclick", "ondblclick",
    // Loading and window lifecycle
    "onload", "onunload", "onabort", "onerror", "onresize",
];
|
|
|
|
|
2019-08-24 17:21:29 +02:00
|
|
|
fn get_parent_node_name(node: &Handle) -> String {
|
|
|
|
let parent = node.parent.take().clone();
|
|
|
|
let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
|
|
|
|
|
|
|
|
match &parent_node.data {
|
2019-08-25 05:06:40 +02:00
|
|
|
NodeData::Document => { EMPTY_STRING.clone() }
|
|
|
|
NodeData::Doctype { .. } => { EMPTY_STRING.clone() }
|
|
|
|
NodeData::Text { .. } => { EMPTY_STRING.clone() }
|
|
|
|
NodeData::Comment { .. } => { EMPTY_STRING.clone() }
|
|
|
|
NodeData::Element { ref name, attrs: _, .. } => {
|
2019-08-24 17:21:29 +02:00
|
|
|
name.local.as_ref().to_string()
|
|
|
|
}
|
|
|
|
NodeData::ProcessingInstruction { .. } => unreachable!()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-23 20:33:18 +02:00
|
|
|
pub fn walk_and_embed_assets(
|
|
|
|
url: &str,
|
|
|
|
node: &Handle,
|
|
|
|
opt_no_js: bool,
|
|
|
|
opt_no_images: bool,
|
|
|
|
opt_user_agent: &str,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent: bool,
|
2019-08-23 20:33:18 +02:00
|
|
|
) {
|
2019-08-23 05:17:15 +02:00
|
|
|
match node.data {
|
|
|
|
NodeData::Document => {
|
|
|
|
// Dig deeper
|
|
|
|
for child in node.children.borrow().iter() {
|
2019-08-23 20:33:18 +02:00
|
|
|
walk_and_embed_assets(
|
|
|
|
&url, child,
|
|
|
|
opt_no_js,
|
|
|
|
opt_no_images,
|
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-23 20:33:18 +02:00
|
|
|
);
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
|
|
|
NodeData::Doctype { .. } => {}
|
|
|
|
NodeData::Text { .. } => {}
|
|
|
|
NodeData::Comment { .. } => {
|
2019-08-23 05:17:15 +02:00
|
|
|
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
|
|
|
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
2019-08-24 05:06:06 +02:00
|
|
|
// since that's not part of W3C standard and therefore gets ignored
|
|
|
|
// by browsers other than IE [5, 9]
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
NodeData::Element {
|
|
|
|
ref name,
|
|
|
|
ref attrs,
|
|
|
|
..
|
|
|
|
} => {
|
2019-08-23 20:24:45 +02:00
|
|
|
let attrs_mut = &mut attrs.borrow_mut();
|
2019-08-23 05:17:15 +02:00
|
|
|
|
2019-08-24 02:16:16 +02:00
|
|
|
match name.local.as_ref() {
|
|
|
|
"link" => {
|
2019-08-25 05:06:40 +02:00
|
|
|
let mut link_type = "";
|
|
|
|
|
2019-08-24 02:16:16 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "rel" {
|
|
|
|
if is_icon(&attr.value.to_string()) {
|
2019-08-25 05:06:40 +02:00
|
|
|
link_type = "icon";
|
2019-08-24 02:16:16 +02:00
|
|
|
break;
|
|
|
|
} else if attr.value.to_string() == "stylesheet" {
|
2019-08-25 05:06:40 +02:00
|
|
|
link_type = "stylesheet";
|
2019-08-24 02:16:16 +02:00
|
|
|
break;
|
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-25 05:06:40 +02:00
|
|
|
if link_type == "icon" {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
if opt_no_images {
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(TRANSPARENT_PIXEL);
|
|
|
|
} else {
|
|
|
|
let href_full_url: String = resolve_url(
|
|
|
|
&url,
|
|
|
|
&attr.value.to_string()
|
|
|
|
)
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
|
|
|
let favicon_datauri = retrieve_asset(
|
|
|
|
&href_full_url,
|
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-25 05:06:40 +02:00
|
|
|
).unwrap_or(EMPTY_STRING.clone());
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(favicon_datauri.as_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if link_type == "stylesheet" {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
let href_full_url: String = resolve_url(
|
|
|
|
&url,
|
|
|
|
&attr.value.to_string(),
|
|
|
|
)
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
|
|
|
let css_datauri = retrieve_asset(
|
|
|
|
&href_full_url,
|
2019-08-24 20:22:34 +02:00
|
|
|
true,
|
2019-08-25 05:06:40 +02:00
|
|
|
"text/css",
|
2019-08-24 20:22:34 +02:00
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-25 05:06:40 +02:00
|
|
|
).unwrap_or(EMPTY_STRING.clone());
|
2019-08-24 20:22:34 +02:00
|
|
|
attr.value.clear();
|
2019-08-25 05:06:40 +02:00
|
|
|
attr.value.push_slice(css_datauri.as_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
let href_full_url: String = resolve_url(
|
|
|
|
&url,
|
|
|
|
&attr.value.to_string(),
|
|
|
|
)
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(&href_full_url.as_str());
|
2019-08-24 20:22:34 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"img" => {
|
2019-08-23 05:17:15 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
2019-08-23 10:33:30 +02:00
|
|
|
if opt_no_images {
|
|
|
|
attr.value.clear();
|
2019-08-24 05:06:06 +02:00
|
|
|
attr.value.push_slice(TRANSPARENT_PIXEL);
|
2019-08-23 10:33:30 +02:00
|
|
|
} else {
|
2019-08-25 05:06:40 +02:00
|
|
|
let src_full_url: String = resolve_url(
|
|
|
|
&url,
|
|
|
|
&attr.value.to_string(),
|
|
|
|
)
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
2019-08-23 20:33:18 +02:00
|
|
|
let img_datauri = retrieve_asset(
|
2019-08-25 05:06:40 +02:00
|
|
|
&src_full_url,
|
2019-08-23 20:33:18 +02:00
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-25 05:06:40 +02:00
|
|
|
).unwrap_or(EMPTY_STRING.clone());
|
2019-08-23 10:33:30 +02:00
|
|
|
attr.value.clear();
|
2019-08-25 05:06:40 +02:00
|
|
|
attr.value.push_slice(img_datauri.as_str());
|
2019-08-23 10:33:30 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"source" => {
|
2019-08-24 17:21:29 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "srcset" {
|
|
|
|
if get_parent_node_name(&node) == "picture" {
|
|
|
|
if opt_no_images {
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(TRANSPARENT_PIXEL);
|
|
|
|
} else {
|
2019-08-25 05:06:40 +02:00
|
|
|
let srcset_full_url: String = resolve_url(
|
|
|
|
&url,
|
|
|
|
&attr.value.to_string(),
|
|
|
|
)
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
2019-08-24 17:21:29 +02:00
|
|
|
let source_datauri = retrieve_asset(
|
2019-08-25 05:06:40 +02:00
|
|
|
&srcset_full_url,
|
2019-08-24 17:21:29 +02:00
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-25 05:06:40 +02:00
|
|
|
).unwrap_or(EMPTY_STRING.clone());
|
2019-08-24 17:21:29 +02:00
|
|
|
attr.value.clear();
|
2019-08-25 05:06:40 +02:00
|
|
|
attr.value.push_slice(source_datauri.as_str());
|
2019-08-24 17:21:29 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"a" => {
|
2019-08-23 05:17:15 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
2019-08-23 22:00:05 +02:00
|
|
|
// Don't touch email links or hrefs which begin with a hash sign
|
2019-08-24 20:48:10 +02:00
|
|
|
if attr.value.starts_with('#') || has_protocol(&attr.value) {
|
2019-08-23 05:17:15 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2019-08-25 05:06:40 +02:00
|
|
|
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
2019-08-23 05:17:15 +02:00
|
|
|
attr.value.clear();
|
2019-08-25 05:06:40 +02:00
|
|
|
attr.value.push_slice(href_full_url.as_str());
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"script" => {
|
2019-08-23 05:17:15 +02:00
|
|
|
if opt_no_js {
|
|
|
|
// Get rid of src and inner content of SCRIPT tags
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
|
|
|
attr.value.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
node.children.borrow_mut().clear();
|
|
|
|
} else {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
2019-08-25 05:06:40 +02:00
|
|
|
let src_full_url: String = resolve_url(
|
|
|
|
&url,
|
|
|
|
&attr.value.to_string(),
|
|
|
|
)
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
2019-08-23 20:24:45 +02:00
|
|
|
let js_datauri = retrieve_asset(
|
2019-08-25 05:06:40 +02:00
|
|
|
&src_full_url,
|
2019-08-23 20:24:45 +02:00
|
|
|
true,
|
|
|
|
"application/javascript",
|
2019-08-23 20:33:18 +02:00
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-25 05:06:40 +02:00
|
|
|
).unwrap_or(EMPTY_STRING.clone());
|
2019-08-23 05:17:15 +02:00
|
|
|
attr.value.clear();
|
2019-08-25 05:06:40 +02:00
|
|
|
attr.value.push_slice(js_datauri.as_str());
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"form" => {
|
2019-08-23 09:26:05 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "action" {
|
|
|
|
// Do not touch action props which are set to a URL
|
2019-08-23 20:24:45 +02:00
|
|
|
if is_valid_url(&attr.value) {
|
2019-08-23 09:26:05 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2019-08-25 05:06:40 +02:00
|
|
|
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
2019-08-23 09:26:05 +02:00
|
|
|
attr.value.clear();
|
2019-08-25 05:06:40 +02:00
|
|
|
attr.value.push_slice(href_full_url.as_str());
|
2019-08-23 09:26:05 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"iframe" => {
|
2019-08-24 02:16:16 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
2019-08-25 05:06:40 +02:00
|
|
|
let src_full_url: String = resolve_url(&url, &attr.value.to_string())
|
|
|
|
.unwrap_or(EMPTY_STRING.clone());
|
2019-08-24 02:16:16 +02:00
|
|
|
let iframe_data = retrieve_asset(
|
|
|
|
&src_full_url,
|
|
|
|
false,
|
|
|
|
"text/html",
|
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-25 05:06:40 +02:00
|
|
|
).unwrap_or(EMPTY_STRING.clone());
|
|
|
|
let dom = html_to_dom(&iframe_data);
|
2019-08-24 05:06:06 +02:00
|
|
|
walk_and_embed_assets(
|
|
|
|
&src_full_url,
|
|
|
|
&dom.document,
|
|
|
|
opt_no_js,
|
|
|
|
opt_no_images,
|
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-24 05:06:06 +02:00
|
|
|
);
|
2019-08-24 02:16:16 +02:00
|
|
|
let mut buf: Vec<u8> = Vec::new();
|
|
|
|
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
|
|
|
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(iframe_datauri.as_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
_ => {}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if opt_no_js {
|
|
|
|
// Get rid of JS event attributes
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) {
|
|
|
|
attr.value.clear();
|
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
|
|
|
|
// Dig deeper
|
|
|
|
for child in node.children.borrow().iter() {
|
2019-08-23 20:33:18 +02:00
|
|
|
walk_and_embed_assets(
|
|
|
|
&url,
|
|
|
|
child,
|
|
|
|
opt_no_js,
|
|
|
|
opt_no_images,
|
|
|
|
opt_user_agent,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent,
|
2019-08-23 20:33:18 +02:00
|
|
|
);
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-24 17:21:29 +02:00
|
|
|
NodeData::ProcessingInstruction { .. } => unreachable!()
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-24 20:48:10 +02:00
|
|
|
fn has_protocol(url: &str) -> bool {
|
|
|
|
HAS_PROTOCOL.is_match(&url.to_lowercase())
|
|
|
|
}
|
|
|
|
|
2019-08-23 05:17:15 +02:00
|
|
|
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
|
|
|
parse_document(RcDom::default(), Default::default())
|
|
|
|
.from_utf8()
|
|
|
|
.read_from(&mut data.as_bytes())
|
|
|
|
.unwrap()
|
|
|
|
}
|
|
|
|
|
2019-08-23 10:33:30 +02:00
|
|
|
pub fn print_dom(handle: &Handle) {
|
2019-08-23 05:17:15 +02:00
|
|
|
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
|
|
|
|
}
|
|
|
|
|
|
|
|
fn is_icon(attr_value: &str) -> bool {
|
2019-08-25 05:06:40 +02:00
|
|
|
ICON_VALUES.is_match(&attr_value.to_lowercase())
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// `rel="icon"` is an icon, `rel="stylesheet"` is not.
    #[test]
    fn test_is_icon() {
        assert!(is_icon("icon"));
        assert!(!is_icon("stylesheet"));
    }

    /// Anything starting with `scheme:` has a protocol, regardless of letter case;
    /// scheme-relative, host-relative, path-only and empty references do not.
    #[test]
    fn test_has_protocol() {
        let with_protocol = [
            "mailto:somebody@somewhere.com?subject=hello",
            "tel:5551234567",
            "ftp:user:password@some-ftp-server.com",
            "javascript:void(0)",
            "http://news.ycombinator.com",
            "https://github.com",
            "MAILTO:somebody@somewhere.com?subject=hello",
        ];
        for url in with_protocol.iter() {
            assert!(has_protocol(url), "expected a protocol in {:?}", url);
        }

        let without_protocol = [
            "//some-hostname.com/some-file.html",
            "some-hostname.com/some-file.html",
            "/some-file.html",
            "",
        ];
        for url in without_protocol.iter() {
            assert!(!has_protocol(url), "expected no protocol in {:?}", url);
        }
    }
}
|