From 3948ea3aa0fcec0afa11715cfa05d62a85f63ee8 Mon Sep 17 00:00:00 2001 From: Vincent Flyson Date: Sun, 29 Sep 2019 17:15:49 -0400 Subject: [PATCH] Improve code structure --- Cargo.toml | 2 +- src/html.rs | 678 ++++++++------------------------------------- src/http.rs | 120 +------- src/js.rs | 32 +++ src/lib.rs | 4 + src/main.rs | 5 +- src/tests/html.rs | 467 +++++++++++++++++++++++++++++++ src/tests/js.rs | 13 + src/tests/mod.rs | 3 + src/tests/utils.rs | 160 +++++++++++ src/utils.rs | 68 ++--- 11 files changed, 831 insertions(+), 721 deletions(-) create mode 100644 src/js.rs create mode 100644 src/tests/html.rs create mode 100644 src/tests/js.rs create mode 100644 src/tests/mod.rs create mode 100644 src/tests/utils.rs diff --git a/Cargo.toml b/Cargo.toml index 1689cc2..6068a86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "monolith" -version = "2.0.19" +version = "2.0.20" authors = [ "Sunshine ", "Mahdi Robatipoor ", diff --git a/src/html.rs b/src/html.rs index 33592fd..f13b542 100644 --- a/src/html.rs +++ b/src/html.rs @@ -5,62 +5,44 @@ use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::tendril::{format_tendril, TendrilSink}; use html5ever::tree_builder::{Attribute, TreeSink}; use html5ever::{local_name, namespace_url, ns}; -use http::{is_valid_url, resolve_url, retrieve_asset}; +use http::retrieve_asset; +use js::attr_is_event_handler; use regex::Regex; use std::default::Default; -use utils::data_to_dataurl; +use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol}; lazy_static! { static ref EMPTY_STRING: String = String::new(); - static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); - static ref ICON_VALUES: Regex = - Regex::new(r"^icon|shortcut icon|mask-icon|apple-touch-icon|fluid-icon$").unwrap(); } +const ICON_VALUES: [&str; 5] = [ + "icon", + "shortcut icon", + "mask-icon", + "apple-touch-icon", + "fluid-icon", +]; + const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\ iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="; -const JS_DOM_EVENT_ATTRS: [&str; 21] = [ - // Input - "onfocus", - "onblur", - "onselect", - "onchange", - "onsubmit", - "onreset", - "onkeydown", - "onkeypress", - "onkeyup", - // Mouse - "onmouseover", - "onmouseout", - "onmousedown", - "onmouseup", - "onmousemove", - // Click - "onclick", - "ondblclick", - // Load - "onload", - "onunload", - "onabort", - "onerror", - "onresize", -]; - -fn get_parent_node(node: &Handle) -> Handle { +pub fn get_parent_node(node: &Handle) -> Handle { let parent = node.parent.take().clone(); parent.and_then(|node| node.upgrade()).unwrap() } -fn get_node_name(node: &Handle) -> String { +pub fn get_node_name(node: &Handle) -> String { match &node.data { NodeData::Element { ref name, .. } => name.local.as_ref().to_string(), _ => EMPTY_STRING.clone(), } } +pub fn is_icon(attr_value: &str) -> bool { + ICON_VALUES.contains(&&*attr_value.to_lowercase()) +} + pub fn walk_and_embed_assets( url: &str, node: &Handle, @@ -239,7 +221,7 @@ pub fn walk_and_embed_assets( for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { // Don't touch email links or hrefs which begin with a hash sign - if attr.value.starts_with('#') || has_protocol(&attr.value) { + if attr.value.starts_with('#') || url_has_protocol(&attr.value) { continue; } @@ -282,73 +264,68 @@ pub fn walk_and_embed_assets( } "style" => { if opt_no_css { - // Empty inner content of STYLE tags + // Empty inner content of STYLE tags node.children.borrow_mut().clear(); } } "form" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "action" { - // Don't modify action that's already a full URL - if is_valid_url(&attr.value) { - continue; + // Modify action to be a full URL + if !is_valid_url(&attr.value) { + let href_full_url: String = + resolve_url(&url, &attr.value.to_string()) + .unwrap_or(EMPTY_STRING.clone()); + attr.value.clear(); + attr.value.push_slice(href_full_url.as_str()); } - - let href_full_url: String = resolve_url(&url, &attr.value.to_string()) - .unwrap_or(EMPTY_STRING.clone()); - attr.value.clear(); - attr.value.push_slice(href_full_url.as_str()); } } } "iframe" => { - if opt_no_frames { - // Empty the src attribute - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "src" { + for attr in attrs_mut.iter_mut() { + if &attr.name.local == "src" { + if opt_no_frames { + // Empty the src attribute attr.value.clear(); + continue; } - } - } else { - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "src" { - let iframe_src = attr.value.to_string(); - // Ignore iframes with empty source (they cause infinite loops) - if iframe_src == EMPTY_STRING.clone() { - continue; - } + let iframe_src: String = attr.value.to_string(); - let src_full_url: String = - resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); - let iframe_data = retrieve_asset( - &src_full_url, - false, - "text/html", - opt_user_agent, - opt_silent, - opt_insecure, - ) - .unwrap_or(EMPTY_STRING.clone()); - let dom = html_to_dom(&iframe_data); - walk_and_embed_assets( - &src_full_url, - &dom.document, - opt_no_css, - opt_no_js, - opt_no_images, - opt_user_agent, - opt_silent, - opt_insecure, - opt_no_frames, - ); - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()) - .unwrap(); - let iframe_datauri = data_to_dataurl("text/html", &buf); - attr.value.clear(); - attr.value.push_slice(iframe_datauri.as_str()); + // Ignore iframes with empty source (they cause infinite loops) + if iframe_src == EMPTY_STRING.clone() { + continue; } + + let src_full_url: String = + resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); + let iframe_data = retrieve_asset( + &src_full_url, + false, + "text/html", + opt_user_agent, + opt_silent, + opt_insecure, + ) + .unwrap_or(EMPTY_STRING.clone()); + let dom = html_to_dom(&iframe_data); + walk_and_embed_assets( + &src_full_url, + &dom.document, + opt_no_css, + opt_no_js, + opt_no_images, + opt_user_agent, + opt_silent, + opt_insecure, + opt_no_frames, + ); + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + let iframe_datauri = data_to_dataurl("text/html", &buf); + attr.value.clear(); + attr.value.push_slice(iframe_datauri.as_str()); } } } @@ -357,7 +334,7 @@ pub fn walk_and_embed_assets( if &attr.name.local == "poster" { let video_poster = attr.value.to_string(); - // Ignore posters with empty source + // Skip posters with empty source if video_poster == EMPTY_STRING.clone() { continue; } @@ -387,20 +364,30 @@ pub fn walk_and_embed_assets( if opt_no_css { // Get rid of style attributes - for attr in attrs_mut.iter_mut() { + let mut style_attr_indexes = Vec::new(); + for (i, attr) in attrs_mut.iter_mut().enumerate() { if attr.name.local.to_lowercase() == "style" { - attr.value.clear(); + style_attr_indexes.push(i); } } + style_attr_indexes.reverse(); + for attr_index in style_attr_indexes { + attrs_mut.remove(attr_index); + } } if opt_no_js { // Get rid of JS event attributes - for attr in attrs_mut.iter_mut() { - if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) { - attr.value.clear(); + let mut js_attr_indexes = Vec::new(); + for (i, attr) in attrs_mut.iter_mut().enumerate() { + if attr_is_event_handler(&attr.name.local) { + js_attr_indexes.push(i); } } + js_attr_indexes.reverse(); + for attr_index in js_attr_indexes { + attrs_mut.remove(attr_index); + } } // Dig deeper @@ -427,10 +414,6 @@ pub fn walk_and_embed_assets( } } -fn has_protocol(url: &str) -> bool { - HAS_PROTOCOL.is_match(&url.to_lowercase()) -} - pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom { parse_document(RcDom::default(), Default::default()) .from_utf8() @@ -472,44 +455,45 @@ pub fn stringify_document( let doc = dom.get_document(); let html = get_child_node_by_name(&doc, "html"); let head = get_child_node_by_name(&html, "head"); - { - let mut content_attr = EMPTY_STRING.clone(); - if opt_isolate { - content_attr += "default-src 'unsafe-inline' data:;" - } - if opt_no_css { - content_attr += "style-src 'none';" - } - if opt_no_frames { - content_attr += "frame-src 'none';child-src 'none';" - } - if opt_no_js { - content_attr += "script-src 'none';" - } - if opt_no_images { - content_attr += "img-src data:;" - } - let meta = dom.create_element( - QualName::new(None, ns!(), local_name!("meta")), - vec![ - Attribute { - name: QualName::new(None, ns!(), local_name!("http-equiv")), - value: format_tendril!("Content-Security-Policy"), - }, - Attribute { - name: QualName::new(None, ns!(), local_name!("content")), - value: format_tendril!("{}", content_attr), - }, - ], - Default::default(), - ); - head.children.borrow_mut().reverse(); - head.children.borrow_mut().push(meta.clone()); - head.children.borrow_mut().reverse(); - // Note: the CSP meta-tag has to be prepended, never appended, - // since there already may be one defined in the document, - // and browsers don't allow re-defining them (for obvious reasons) + let mut content_attr = EMPTY_STRING.clone(); + if opt_isolate { + content_attr += " default-src 'unsafe-inline' data:;"; } + if opt_no_css { + content_attr += " style-src 'none';"; + } + if opt_no_frames { + content_attr += " frame-src 'none';child-src 'none';"; + } + if opt_no_js { + content_attr += " script-src 'none';"; + } + if opt_no_images { + content_attr += " img-src data:;"; + } + content_attr = content_attr.trim().to_string(); + + let meta = dom.create_element( + QualName::new(None, ns!(), local_name!("meta")), + vec![ + Attribute { + name: QualName::new(None, ns!(), local_name!("http-equiv")), + value: format_tendril!("Content-Security-Policy"), + }, + Attribute { + name: QualName::new(None, ns!(), local_name!("content")), + value: format_tendril!("{}", content_attr), + }, + ], + Default::default(), + ); + head.children.borrow_mut().reverse(); + head.children.borrow_mut().push(meta.clone()); + head.children.borrow_mut().reverse(); + // Note: the CSP meta-tag has to be prepended, never appended, + // since there already may be one defined in the document, + // and browsers don't allow re-defining them (for obvious reasons) + serialize(&mut buf, &doc, SerializeOpts::default()) .expect("unable to serialize DOM into buffer"); result = String::from_utf8(buf).unwrap(); @@ -519,429 +503,3 @@ pub fn stringify_document( result } - -fn is_icon(attr_value: &str) -> bool { - ICON_VALUES.is_match(&attr_value.to_lowercase()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_is_icon() { - assert_eq!(is_icon("icon"), true); - assert_eq!(is_icon("Shortcut Icon"), true); - assert_eq!(is_icon("ICON"), true); - assert_eq!(is_icon("mask-icon"), true); - assert_eq!(is_icon("fluid-icon"), true); - assert_eq!(is_icon("stylesheet"), false); - assert_eq!(is_icon(""), false); - } - - #[test] - fn test_has_protocol() { - assert_eq!( - has_protocol("mailto:somebody@somewhere.com?subject=hello"), - true - ); - assert_eq!(has_protocol("tel:5551234567"), true); - assert_eq!(has_protocol("ftp:user:password@some-ftp-server.com"), true); - assert_eq!(has_protocol("javascript:void(0)"), true); - assert_eq!(has_protocol("http://news.ycombinator.com"), true); - assert_eq!(has_protocol("https://github.com"), true); - assert_eq!(has_protocol("//some-hostname.com/some-file.html"), false); - assert_eq!(has_protocol("some-hostname.com/some-file.html"), false); - assert_eq!(has_protocol("/some-file.html"), false); - assert_eq!(has_protocol(""), false); - assert_eq!( - has_protocol("MAILTO:somebody@somewhere.com?subject=hello"), - true - ); - } - - #[test] - fn test_get_parent_node_name() { - let html = "

"; - let dom = html_to_dom(&html); - let mut count = 0; - - fn test_walk(node: &Handle, i: &mut i8) { - *i += 1; - - match &node.data { - NodeData::Document => { - for child in node.children.borrow().iter() { - test_walk(child, &mut *i); - } - } - NodeData::Doctype { .. } => (), - NodeData::Text { .. } => (), - NodeData::Comment { .. } => (), - NodeData::Element { ref name, .. } => { - let node_name = name.local.as_ref().to_string(); - let parent_node_name = get_node_name(&get_parent_node(node)); - if node_name == "head" || node_name == "body" { - assert_eq!(parent_node_name, "html"); - } else if node_name == "div" { - assert_eq!(parent_node_name, "body"); - } else if node_name == "p" { - assert_eq!(parent_node_name, "div"); - } - - println!("{}", node_name); - - for child in node.children.borrow().iter() { - test_walk(child, &mut *i); - } - } - NodeData::ProcessingInstruction { .. } => unreachable!(), - }; - } - - test_walk(&dom.document, &mut count); - - assert_eq!(count, 7); - } - - #[test] - fn test_walk_and_embed_assets() { - let html = "

"; - let dom = html_to_dom(&html); - let url = "http://localhost"; - - let opt_no_css: bool = false; - let opt_no_frames: bool = false; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_silent = true; - let opt_insecure = false; - - walk_and_embed_assets( - &url, - &dom.document, - opt_no_css, - opt_no_js, - opt_no_images, - "", - opt_silent, - opt_insecure, - opt_no_frames, - ); - - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - assert_eq!( - buf.iter().map(|&c| c as char).collect::(), - "

" - ); - } - - #[test] - fn test_walk_and_embed_assets_no_recursive_iframe() { - let html = "

"; - let dom = html_to_dom(&html); - let url = "http://localhost"; - - let opt_no_css: bool = false; - let opt_no_frames: bool = false; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_silent = true; - let opt_insecure = false; - - walk_and_embed_assets( - &url, - &dom.document, - opt_no_css, - opt_no_js, - opt_no_images, - "", - opt_silent, - opt_insecure, - opt_no_frames, - ); - - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - assert_eq!( - buf.iter().map(|&c| c as char).collect::(), - "

" - ); - } - - #[test] - fn test_walk_and_embed_assets_no_css() { - let html = "\ - \ -
"; - let dom = html_to_dom(&html); - let url = "http://localhost"; - - let opt_no_css: bool = true; - let opt_no_frames: bool = false; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_silent = true; - let opt_insecure = false; - - walk_and_embed_assets( - &url, - &dom.document, - opt_no_css, - opt_no_js, - opt_no_images, - "", - opt_silent, - opt_insecure, - opt_no_frames, - ); - - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - assert_eq!( - buf.iter().map(|&c| c as char).collect::(), - "\ -
" - ); - } - - #[test] - fn test_walk_and_embed_assets_no_images() { - let html = "\ -
"; - let dom = html_to_dom(&html); - let url = "http://localhost"; - - let opt_no_css: bool = false; - let opt_no_frames: bool = false; - let opt_no_js: bool = false; - let opt_no_images: bool = true; - let opt_silent = true; - let opt_insecure = false; - - walk_and_embed_assets( - &url, - &dom.document, - opt_no_css, - opt_no_js, - opt_no_images, - "", - opt_silent, - opt_insecure, - opt_no_frames, - ); - - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - assert_eq!( - buf.iter().map(|&c| c as char).collect::(), - "
\ - \ -
" - ); - } - - #[test] - fn test_walk_and_embed_assets_no_frames() { - let html = ""; - let dom = html_to_dom(&html); - let url = "http://localhost"; - - let opt_no_css: bool = false; - let opt_no_frames: bool = true; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_silent = true; - let opt_insecure = false; - - walk_and_embed_assets( - &url, - &dom.document, - opt_no_css, - opt_no_js, - opt_no_images, - "", - opt_silent, - opt_insecure, - opt_no_frames, - ); - - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - assert_eq!( - buf.iter().map(|&c| c as char).collect::(), - "" - ); - } - - #[test] - fn test_walk_and_embed_assets_no_js() { - let html = - "
\ -
"; - let dom = html_to_dom(&html); - let url = "http://localhost"; - - let opt_no_css: bool = false; - let opt_no_frames: bool = false; - let opt_no_js: bool = true; - let opt_no_images: bool = false; - let opt_silent = true; - let opt_insecure = false; - - walk_and_embed_assets( - &url, - &dom.document, - opt_no_css, - opt_no_js, - opt_no_images, - "", - opt_silent, - opt_insecure, - opt_no_frames, - ); - - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - assert_eq!( - buf.iter().map(|&c| c as char).collect::(), - "
\ -
" - ); - } - - #[test] - fn test_stringify_document() { - let html = "
"; - let dom = html_to_dom(&html); - - let opt_no_css: bool = false; - let opt_no_frames: bool = false; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_isolate: bool = false; - - assert_eq!( - stringify_document( - &dom.document, - opt_no_css, - opt_no_frames, - opt_no_js, - opt_no_images, - opt_isolate, - ), - "
" - ); - } - - #[test] - fn test_stringify_document_isolate() { - let html = "Isolated document\ -
"; - let dom = html_to_dom(&html); - - let opt_no_css: bool = false; - let opt_no_frames: bool = false; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_isolate: bool = true; - - assert_eq!( - stringify_document( - &dom.document, - opt_no_css, - opt_no_frames, - opt_no_js, - opt_no_images, - opt_isolate, - ), - "\ - \ - \ - Isolated document\ - \ - \ -
\ - " - ); - } - - #[test] - fn test_stringify_document_no_css() { - let html = "\ - Unstyled document\ - \ -
"; - let dom = html_to_dom(&html); - - let opt_no_css: bool = true; - let opt_no_frames: bool = false; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_isolate: bool = false; - - assert_eq!( - stringify_document( - &dom.document, - opt_no_css, - opt_no_frames, - opt_no_js, - opt_no_images, - opt_isolate, - ), - "\ - \ - \ - \ - Unstyled document\ - \ - \ -
\ - " - ); - } - - #[test] - fn test_stringify_document_no_frames() { - let html = "Frameless document\ -
"; - let dom = html_to_dom(&html); - - let opt_no_css: bool = false; - let opt_no_frames: bool = true; - let opt_no_js: bool = false; - let opt_no_images: bool = false; - let opt_isolate: bool = false; - - assert_eq!( - stringify_document( - &dom.document, - opt_no_css, - opt_no_frames, - opt_no_js, - opt_no_images, - opt_isolate, - ), - "\ - \ - \ - \ - Frameless document\ - \ - \ -
\ - " - ); - } -} diff --git a/src/http.rs b/src/http.rs index 50d686b..fe5e3ae 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,35 +1,7 @@ -use regex::Regex; use reqwest::header::{CONTENT_TYPE, USER_AGENT}; use reqwest::Client; use std::time::Duration; -use url::{ParseError, Url}; -use utils::data_to_dataurl; - -lazy_static! { - static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); -} - -pub fn is_data_url(url: &str) -> Result { - match Url::parse(url) { - Ok(parsed_url) => Ok(parsed_url.scheme() == "data"), - Err(err) => Err(err), - } -} - -pub fn is_valid_url(path: &str) -> bool { - REGEX_URL.is_match(path) -} - -pub fn resolve_url(from: &str, to: &str) -> Result { - let result = if is_valid_url(to) { - // (anything, http://site.com/css/main.css) - to.to_string() - } else { - Url::parse(from)?.join(to)?.to_string() - }; - - Ok(result) -} +use utils::{data_to_dataurl, is_data_url}; pub fn retrieve_asset( url: &str, @@ -79,93 +51,3 @@ pub fn retrieve_asset( } } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_is_valid_url() { - assert!(is_valid_url("https://www.rust-lang.org/")); - assert!(is_valid_url("http://kernel.org")); - assert!(!is_valid_url("//kernel.org")); - assert!(!is_valid_url("./index.html")); - assert!(!is_valid_url("some-local-page.htm")); - assert!(!is_valid_url("ftp://1.2.3.4/www/index.html")); - assert!(!is_valid_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" - )); - } - - #[test] - fn test_resolve_url() -> Result<(), ParseError> { - let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?; - assert_eq!( - resolved_url.as_str(), - "https://www.kernel.org/category/signatures.html" - ); - - let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?; - assert_eq!( - resolved_url.as_str(), - "https://www.kernel.org/category/signatures.html" - ); - - let resolved_url = resolve_url( - "saved_page.htm", - "https://www.kernel.org/category/signatures.html", - )?; - assert_eq!( - resolved_url.as_str(), - "https://www.kernel.org/category/signatures.html" - ); - - let resolved_url = resolve_url( - "https://www.kernel.org", - "//www.kernel.org/theme/images/logos/tux.png", - )?; - assert_eq!( - resolved_url.as_str(), - "https://www.kernel.org/theme/images/logos/tux.png" - ); - - let resolved_url = resolve_url( - "https://www.kernel.org", - "//another-host.org/theme/images/logos/tux.png", - )?; - assert_eq!( - resolved_url.as_str(), - "https://another-host.org/theme/images/logos/tux.png" - ); - - let resolved_url = resolve_url( - "https://www.kernel.org/category/signatures.html", - "/theme/images/logos/tux.png", - )?; - assert_eq!( - resolved_url.as_str(), - "https://www.kernel.org/theme/images/logos/tux.png" - ); - - let resolved_url = resolve_url( - "https://www.w3schools.com/html/html_iframe.asp", - "default.asp", - )?; - assert_eq!( - resolved_url.as_str(), - "https://www.w3schools.com/html/default.asp" - ); - - Ok(()) - } - - #[test] - fn test_is_data_url() { - assert!( - is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h") - .unwrap_or(false) - ); - assert!(!is_data_url("https://kernel.org").unwrap_or(false)); - assert!(!is_data_url("//kernel.org").unwrap_or(false)); - } -} diff --git a/src/js.rs b/src/js.rs new file mode 100644 index 0000000..32f946c --- /dev/null +++ b/src/js.rs @@ -0,0 +1,32 @@ +const JS_DOM_EVENT_ATTRS: [&str; 21] = [ + // Input + "onfocus", + "onblur", + "onselect", + "onchange", + "onsubmit", + "onreset", + "onkeydown", + "onkeypress", + "onkeyup", + // Mouse + "onmouseover", + "onmouseout", + "onmousedown", + "onmouseup", + "onmousemove", + // Click + "onclick", + "ondblclick", + // Load + "onload", + "onunload", + "onabort", + "onerror", + "onresize", +]; + +// Returns true if DOM attribute name matches a native JavaScript event handler +pub fn attr_is_event_handler(attr_name: &str) -> bool { + JS_DOM_EVENT_ATTRS.contains(&attr_name.to_lowercase().as_str()) +} diff --git a/src/lib.rs b/src/lib.rs index 5ac119b..89f168c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,4 +7,8 @@ extern crate url; pub mod html; pub mod http; +pub mod js; pub mod utils; + +#[cfg(test)] +pub mod tests; diff --git a/src/main.rs b/src/main.rs index 2c88e22..ff1d29c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,9 +4,10 @@ extern crate monolith; use clap::{App, Arg}; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; -use monolith::http::{is_valid_url, retrieve_asset}; +use monolith::http::retrieve_asset; +use monolith::utils::is_valid_url; -static DEFAULT_USER_AGENT: &str = +const DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"; fn main() { diff --git a/src/tests/html.rs b/src/tests/html.rs new file mode 100644 index 0000000..11de822 --- /dev/null +++ b/src/tests/html.rs @@ -0,0 +1,467 @@ +use crate::html::{ + get_node_name, get_parent_node, html_to_dom, is_icon, stringify_document, walk_and_embed_assets, +}; +use html5ever::rcdom::{Handle, NodeData}; +use html5ever::serialize::{serialize, SerializeOpts}; + +#[test] +fn test_is_icon() { + assert_eq!(is_icon("icon"), true); + assert_eq!(is_icon("Shortcut Icon"), true); + assert_eq!(is_icon("ICON"), true); + assert_eq!(is_icon("mask-icon"), true); + assert_eq!(is_icon("fluid-icon"), true); + assert_eq!(is_icon("stylesheet"), false); + assert_eq!(is_icon(""), false); +} + +#[test] +fn test_get_parent_node_name() { + let html = "

"; + let dom = html_to_dom(&html); + let mut count = 0; + + fn test_walk(node: &Handle, i: &mut i8) { + *i += 1; + + match &node.data { + NodeData::Document => { + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + NodeData::Element { ref name, .. } => { + let node_name = name.local.as_ref().to_string(); + let parent_node_name = get_node_name(&get_parent_node(node)); + if node_name == "head" || node_name == "body" { + assert_eq!(parent_node_name, "html"); + } else if node_name == "div" { + assert_eq!(parent_node_name, "body"); + } else if node_name == "p" { + assert_eq!(parent_node_name, "div"); + } + + println!("{}", node_name); + + for child in node.children.borrow().iter() { + test_walk(child, &mut *i); + } + } + _ => (), + }; + } + + test_walk(&dom.document, &mut count); + + assert_eq!(count, 7); +} + +#[test] +fn test_walk_and_embed_assets() { + let html = "

"; + let dom = html_to_dom(&html); + let url = "http://localhost"; + + let opt_no_css: bool = false; + let opt_no_frames: bool = false; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_silent = true; + let opt_insecure = false; + + walk_and_embed_assets( + &url, + &dom.document, + opt_no_css, + opt_no_js, + opt_no_images, + "", + opt_silent, + opt_insecure, + opt_no_frames, + ); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "

" + ); +} + +#[test] +fn test_walk_and_embed_assets_ensure_no_recursive_iframe() { + let html = "

"; + let dom = html_to_dom(&html); + let url = "http://localhost"; + + let opt_no_css: bool = false; + let opt_no_frames: bool = false; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_silent = true; + let opt_insecure = false; + + walk_and_embed_assets( + &url, + &dom.document, + opt_no_css, + opt_no_js, + opt_no_images, + "", + opt_silent, + opt_insecure, + opt_no_frames, + ); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "

" + ); +} + +#[test] +fn test_walk_and_embed_assets_no_css() { + let html = "\ + \ +
"; + let dom = html_to_dom(&html); + let url = "http://localhost"; + + let opt_no_css: bool = true; + let opt_no_frames: bool = false; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_silent = true; + let opt_insecure = false; + + walk_and_embed_assets( + &url, + &dom.document, + opt_no_css, + opt_no_js, + opt_no_images, + "", + opt_silent, + opt_insecure, + opt_no_frames, + ); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "\ + \ + \ + \ + \ + \ +
\ + \ + " + ); +} + +#[test] +fn test_walk_and_embed_assets_no_images() { + let html = "\ +
"; + let dom = html_to_dom(&html); + let url = "http://localhost"; + + let opt_no_css: bool = false; + let opt_no_frames: bool = false; + let opt_no_js: bool = false; + let opt_no_images: bool = true; + let opt_silent = true; + let opt_insecure = false; + + walk_and_embed_assets( + &url, + &dom.document, + opt_no_css, + opt_no_js, + opt_no_images, + "", + opt_silent, + opt_insecure, + opt_no_frames, + ); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "\ + \ + \ + \ + \ +
\ + \ +
\ + \ + " + ); +} + +#[test] +fn test_walk_and_embed_assets_no_frames() { + let html = ""; + let dom = html_to_dom(&html); + let url = "http://localhost"; + + let opt_no_css: bool = false; + let opt_no_frames: bool = true; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_silent = true; + let opt_insecure = false; + + walk_and_embed_assets( + &url, + &dom.document, + opt_no_css, + opt_no_js, + opt_no_images, + "", + opt_silent, + opt_insecure, + opt_no_frames, + ); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "" + ); +} + +#[test] +fn test_walk_and_embed_assets_no_js() { + let html = "
\ + \ + \ +
"; + let dom = html_to_dom(&html); + let url = "http://localhost"; + + let opt_no_css: bool = false; + let opt_no_frames: bool = false; + let opt_no_js: bool = true; + let opt_no_images: bool = false; + let opt_silent = true; + let opt_insecure = false; + + walk_and_embed_assets( + &url, + &dom.document, + opt_no_css, + opt_no_js, + opt_no_images, + "", + opt_silent, + opt_insecure, + opt_no_frames, + ); + + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + assert_eq!( + buf.iter().map(|&c| c as char).collect::(), + "
\ +
" + ); +} + +#[test] +fn test_stringify_document() { + let html = "
"; + let dom = html_to_dom(&html); + + let opt_no_css: bool = false; + let opt_no_frames: bool = false; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_isolate: bool = false; + + assert_eq!( + stringify_document( + &dom.document, + opt_no_css, + opt_no_frames, + opt_no_js, + opt_no_images, + opt_isolate, + ), + "
" + ); +} + +#[test] +fn test_stringify_document_isolate() { + let html = "Isolated document\ + \ + \ +
"; + let dom = html_to_dom(&html); + + let opt_no_css: bool = false; + let opt_no_frames: bool = false; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_isolate: bool = true; + + assert_eq!( + stringify_document( + &dom.document, + opt_no_css, + opt_no_frames, + opt_no_js, + opt_no_images, + opt_isolate, + ), + "\ + \ + \ + Isolated document\ + \ + \ + \ + \ +
\ + \ +
\ + \ + " + ); +} + +#[test] +fn test_stringify_document_no_css() { + let html = "\ + Unstyled document\ + \ +
"; + let dom = html_to_dom(&html); + + let opt_no_css: bool = true; + let opt_no_frames: bool = false; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_isolate: bool = false; + + assert_eq!( + stringify_document( + &dom.document, + opt_no_css, + opt_no_frames, + opt_no_js, + opt_no_images, + opt_isolate, + ), + "\ + \ + \ + \ + Unstyled document\ + \ + \ +
\ + " + ); +} + +#[test] +fn test_stringify_document_no_frames() { + let html = "\ + Frameless document\ + \ +
"; + let dom = html_to_dom(&html); + + let opt_no_css: bool = false; + let opt_no_frames: bool = true; + let opt_no_js: bool = false; + let opt_no_images: bool = false; + let opt_isolate: bool = false; + + assert_eq!( + stringify_document( + &dom.document, + opt_no_css, + opt_no_frames, + opt_no_js, + opt_no_images, + opt_isolate, + ), + "\ + \ + \ + \ + Frameless document\ + \ + \ +
\ + " + ); +} + +#[test] +fn test_stringify_document_isolate_no_frames_no_js_no_css_no_images() { + let html = "\ + no-frame no-css no-js no-image isolated document\ + \ + \ +
\ + \ + \ + \ +
"; + let dom = html_to_dom(&html); + + let opt_isolate: bool = true; + let opt_no_css: bool = true; + let opt_no_frames: bool = true; + let opt_no_js: bool = true; + let opt_no_images: bool = true; + + assert_eq!( + stringify_document( + &dom.document, + opt_no_css, + opt_no_frames, + opt_no_js, + opt_no_images, + opt_isolate, + ), + "\ + \ + \ + \ + no-frame no-css no-js no-image isolated document\ + \ + \ + \ + \ +
\ + \ + \ + \ +
\ + \ + " + ); +} diff --git a/src/tests/js.rs b/src/tests/js.rs new file mode 100644 index 0000000..30ebda8 --- /dev/null +++ b/src/tests/js.rs @@ -0,0 +1,13 @@ +use crate::js::attr_is_event_handler; + +#[test] +fn test_attr_is_event_handler() { + // succeeding + assert!(attr_is_event_handler("onBlur")); + assert!(attr_is_event_handler("onclick")); + assert!(attr_is_event_handler("onClick")); + // failing + assert!(!attr_is_event_handler("href")); + assert!(!attr_is_event_handler("")); + assert!(!attr_is_event_handler("class")); +} diff --git a/src/tests/mod.rs b/src/tests/mod.rs new file mode 100644 index 0000000..2efe36b --- /dev/null +++ b/src/tests/mod.rs @@ -0,0 +1,3 @@ +mod html; +mod js; +mod utils; diff --git a/src/tests/utils.rs b/src/tests/utils.rs new file mode 100644 index 0000000..facfd9f --- /dev/null +++ b/src/tests/utils.rs @@ -0,0 +1,160 @@ +use crate::utils::{ + data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url, url_has_protocol, +}; +use url::ParseError; + +#[test] +fn test_data_to_dataurl() { + let mime = "application/javascript"; + let data = "var word = 'hello';\nalert(word);\n"; + let datauri = data_to_dataurl(mime, data.as_bytes()); + assert_eq!( + &datauri, + "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" + ); +} + +#[test] +fn test_detect_mimetype() { + // image + assert_eq!(detect_mimetype(b"GIF87a"), "image/gif"); + assert_eq!(detect_mimetype(b"GIF89a"), "image/gif"); + assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg"); + assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png"); + assert_eq!(detect_mimetype(b" Result<(), ParseError> { + let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?; + assert_eq!( + resolved_url.as_str(), + "https://www.kernel.org/category/signatures.html" + ); + + let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?; + assert_eq!( + resolved_url.as_str(), + "https://www.kernel.org/category/signatures.html" + ); + + let resolved_url = resolve_url( + "saved_page.htm", + "https://www.kernel.org/category/signatures.html", + )?; + assert_eq!( + resolved_url.as_str(), + "https://www.kernel.org/category/signatures.html" + ); + + let resolved_url = resolve_url( + "https://www.kernel.org", + "//www.kernel.org/theme/images/logos/tux.png", + )?; + assert_eq!( + resolved_url.as_str(), + "https://www.kernel.org/theme/images/logos/tux.png" + ); + + let resolved_url = resolve_url( + "https://www.kernel.org", + "//another-host.org/theme/images/logos/tux.png", + )?; + assert_eq!( + resolved_url.as_str(), + "https://another-host.org/theme/images/logos/tux.png" + ); + + let resolved_url = resolve_url( + "https://www.kernel.org/category/signatures.html", + "/theme/images/logos/tux.png", + )?; + assert_eq!( + resolved_url.as_str(), + "https://www.kernel.org/theme/images/logos/tux.png" + ); + + let resolved_url = resolve_url( + "https://www.w3schools.com/html/html_iframe.asp", + "default.asp", + )?; + assert_eq!( + resolved_url.as_str(), + "https://www.w3schools.com/html/default.asp" + ); + + Ok(()) +} + +#[test] +fn test_is_data_url() { + // succeeding + assert!( + is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h") + .unwrap_or(false) + ); + // failing + assert!(!is_data_url("https://kernel.org").unwrap_or(false)); + assert!(!is_data_url("//kernel.org").unwrap_or(false)); + assert!(!is_data_url("").unwrap_or(false)); +} diff --git a/src/utils.rs b/src/utils.rs index 8f1e653..61a5d83 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,6 +1,13 @@ extern crate base64; use self::base64::encode; +use regex::Regex; +use url::{ParseError, Url}; + +lazy_static! { + static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); + static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); +} static MAGIC: [[&[u8]; 2]; 19] = [ // Image @@ -36,7 +43,7 @@ pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String { format!("data:{};base64,{}", mimetype, encode(data)) } -fn detect_mimetype(data: &[u8]) -> String { +pub fn detect_mimetype(data: &[u8]) -> String { let mut re = String::new(); for item in MAGIC.iter() { @@ -49,44 +56,27 @@ fn detect_mimetype(data: &[u8]) -> String { re } -#[cfg(test)] -mod tests { - use super::*; +pub fn url_has_protocol(url: &str) -> bool { + HAS_PROTOCOL.is_match(&url.to_lowercase()) +} - #[test] - fn test_data_to_dataurl() { - let mime = "application/javascript"; - let data = "var word = 'hello';\nalert(word);\n"; - let datauri = data_to_dataurl(mime, data.as_bytes()); - assert_eq!( - &datauri, - "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" - ); - } - - #[test] - fn test_detect_mimetype() { - // Image - assert_eq!(detect_mimetype(b"GIF87a"), "image/gif"); - assert_eq!(detect_mimetype(b"GIF89a"), "image/gif"); - assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg"); - assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png"); - assert_eq!(detect_mimetype(b" Result { + match Url::parse(url) { + Ok(parsed_url) => Ok(parsed_url.scheme() == "data"), + Err(err) => Err(err), } } + +pub fn is_valid_url(path: &str) -> bool { + REGEX_URL.is_match(path) +} + +pub fn resolve_url(from: &str, to: &str) -> Result { + let result = if is_valid_url(to) { + to.to_string() + } else { + Url::parse(from)?.join(to)?.to_string() + }; + + Ok(result) +}