diff --git a/src/html.rs b/src/html.rs index fa10175..3c7057f 100644 --- a/src/html.rs +++ b/src/html.rs @@ -7,6 +7,7 @@ use html5ever::tree_builder::{Attribute, TreeSink}; use html5ever::{local_name, namespace_url, ns}; use http::retrieve_asset; use js::attr_is_event_handler; +use std::collections::HashMap; use std::default::Default; use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol}; @@ -43,6 +44,7 @@ pub fn is_icon(attr_value: &str) -> bool { } pub fn walk_and_embed_assets( + cache: &mut HashMap, url: &str, node: &Handle, opt_no_css: bool, @@ -58,6 +60,7 @@ pub fn walk_and_embed_assets( // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( + cache, &url, child, opt_no_css, @@ -103,6 +106,7 @@ pub fn walk_and_embed_assets( resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let (favicon_dataurl, _) = retrieve_asset( + cache, &href_full_url, true, "", @@ -126,6 +130,7 @@ pub fn walk_and_embed_assets( resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let (css_dataurl, _) = retrieve_asset( + cache, &href_full_url, true, "text/css", @@ -168,6 +173,7 @@ pub fn walk_and_embed_assets( let src_full_url: String = resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone()); let (img_dataurl, _) = retrieve_asset( + cache, &src_full_url, true, "", @@ -201,6 +207,7 @@ pub fn walk_and_embed_assets( resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let (source_dataurl, _) = retrieve_asset( + cache, &srcset_full_url, true, "", @@ -247,6 +254,7 @@ pub fn walk_and_embed_assets( resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let (js_dataurl, _) = retrieve_asset( + cache, &src_full_url, true, "application/javascript", @@ -300,6 +308,7 @@ pub fn walk_and_embed_assets( let src_full_url: String = resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); let (iframe_data, iframe_final_url) = retrieve_asset( + cache, &src_full_url, false, "text/html", @@ -310,6 +319,7 @@ pub fn walk_and_embed_assets( .unwrap_or((EMPTY_STRING.clone(), src_full_url)); let dom = html_to_dom(&iframe_data); walk_and_embed_assets( + cache, &iframe_final_url, &dom.document, opt_no_css, @@ -344,6 +354,7 @@ pub fn walk_and_embed_assets( let poster_full_url: String = resolve_url(&url, &video_poster) .unwrap_or(EMPTY_STRING.clone()); let (poster_dataurl, _) = retrieve_asset( + cache, &poster_full_url, true, "", @@ -392,6 +403,7 @@ pub fn walk_and_embed_assets( // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( + cache, &url, child, opt_no_css, diff --git a/src/http.rs b/src/http.rs index 19bf902..97e77a7 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,9 +1,11 @@ use reqwest::header::{CONTENT_TYPE, USER_AGENT}; use reqwest::Client; +use std::collections::HashMap; use std::time::Duration; use utils::{data_to_dataurl, is_data_url}; pub fn retrieve_asset( + cache: &mut HashMap, url: &str, as_dataurl: bool, mime: &str, @@ -14,42 +16,54 @@ pub fn retrieve_asset( if is_data_url(&url).unwrap() { Ok((url.to_string(), url.to_string())) } else { - let client = Client::builder() - .timeout(Duration::from_secs(10)) - .danger_accept_invalid_certs(opt_insecure) - .build()?; - let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?; - - if !opt_silent { - if url == response.url().as_str() { - eprintln!("[ {} ]", &url); - } else { - eprintln!("[ {} -> {} ]", &url, &response.url().as_str()); + if cache.contains_key(&url.to_string()) { + // url is in cache + if !opt_silent { + eprintln!("[ {} ] (from cache)", &url); } - } - - if as_dataurl { - // Convert response into a byte array - let mut data: Vec = vec![]; - response.copy_to(&mut data)?; - - // Attempt to obtain MIME type by reading the Content-Type header - let mimetype = if mime == "" { - response - .headers() - .get(CONTENT_TYPE) - .and_then(|header| header.to_str().ok()) - .unwrap_or(&mime) - } else { - mime - }; - - Ok(( - data_to_dataurl(&mimetype, &data), - response.url().to_string(), - )) + let data = cache.get(&url.to_string()).unwrap(); + Ok((data.to_string(), url.to_string())) } else { - Ok((response.text().unwrap(), response.url().to_string())) + // url not in cache, we request it + let client = Client::builder() + .timeout(Duration::from_secs(10)) + .danger_accept_invalid_certs(opt_insecure) + .build()?; + let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?; + + if !opt_silent { + if url == response.url().as_str() { + eprintln!("[ {} ]", &url); + } else { + eprintln!("[ {} -> {} ]", &url, &response.url().as_str()); + } + } + + if as_dataurl { + // Convert response into a byte array + let mut data: Vec = vec![]; + response.copy_to(&mut data)?; + + // Attempt to obtain MIME type by reading the Content-Type header + let mimetype = if mime == "" { + response + .headers() + .get(CONTENT_TYPE) + .and_then(|header| header.to_str().ok()) + .unwrap_or(&mime) + } else { + mime + }; + let dataurl = data_to_dataurl(&mimetype, &data); + // insert in cache + cache.insert(response.url().to_string(), dataurl.to_string()); + Ok((dataurl, response.url().to_string())) + } else { + let content = response.text().unwrap(); + // insert in cache + cache.insert(response.url().to_string(), content.clone()); + Ok((content, response.url().to_string())) + } } } } diff --git a/src/main.rs b/src/main.rs index e6b72ae..50fab9e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,11 +8,14 @@ use args::AppArgs; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; use monolith::http::retrieve_asset; use monolith::utils::is_valid_url; +use std::collections::HashMap; fn main() { let app_args = AppArgs::get(); + let cache = &mut HashMap::new(); if is_valid_url(app_args.url_target.as_str()) { let (data, final_url) = retrieve_asset( + cache, app_args.url_target.as_str(), false, "", @@ -24,6 +27,7 @@ fn main() { let dom = html_to_dom(&data); walk_and_embed_assets( + cache, &final_url, &dom.document, app_args.no_css, diff --git a/src/tests/html.rs b/src/tests/html.rs index 11de822..b7a571b 100644 --- a/src/tests/html.rs +++ b/src/tests/html.rs @@ -3,6 +3,7 @@ use crate::html::{ }; use html5ever::rcdom::{Handle, NodeData}; use html5ever::serialize::{serialize, SerializeOpts}; +use std::collections::HashMap; #[test] fn test_is_icon() { @@ -58,6 +59,8 @@ fn test_get_parent_node_name() { #[test] fn test_walk_and_embed_assets() { + let cache = &mut HashMap::new(); + let html = "

"; let dom = html_to_dom(&html); let url = "http://localhost"; @@ -70,6 +73,7 @@ fn test_walk_and_embed_assets() { let opt_insecure = false; walk_and_embed_assets( + cache, &url, &dom.document, opt_no_css, @@ -95,6 +99,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() { let html = "

"; let dom = html_to_dom(&html); let url = "http://localhost"; + let cache = &mut HashMap::new(); let opt_no_css: bool = false; let opt_no_frames: bool = false; @@ -104,6 +109,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() { let opt_insecure = false; walk_and_embed_assets( + cache, &url, &dom.document, opt_no_css, @@ -131,6 +137,7 @@ fn test_walk_and_embed_assets_no_css() {
"; let dom = html_to_dom(&html); let url = "http://localhost"; + let cache = &mut HashMap::new(); let opt_no_css: bool = true; let opt_no_frames: bool = false; @@ -140,6 +147,7 @@ fn test_walk_and_embed_assets_no_css() { let opt_insecure = false; walk_and_embed_assets( + cache, &url, &dom.document, opt_no_css, @@ -174,6 +182,7 @@ fn test_walk_and_embed_assets_no_images() {
"; let dom = html_to_dom(&html); let url = "http://localhost"; + let cache = &mut HashMap::new(); let opt_no_css: bool = false; let opt_no_frames: bool = false; @@ -183,6 +192,7 @@ fn test_walk_and_embed_assets_no_images() { let opt_insecure = false; walk_and_embed_assets( + cache, &url, &dom.document, opt_no_css, @@ -219,6 +229,7 @@ fn test_walk_and_embed_assets_no_frames() { let html = ""; let dom = html_to_dom(&html); let url = "http://localhost"; + let cache = &mut HashMap::new(); let opt_no_css: bool = false; let opt_no_frames: bool = true; @@ -228,6 +239,7 @@ fn test_walk_and_embed_assets_no_frames() { let opt_insecure = false; walk_and_embed_assets( + cache, &url, &dom.document, opt_no_css, @@ -256,6 +268,7 @@ fn test_walk_and_embed_assets_no_js() { "; let dom = html_to_dom(&html); let url = "http://localhost"; + let cache = &mut HashMap::new(); let opt_no_css: bool = false; let opt_no_frames: bool = false; @@ -265,6 +278,7 @@ fn test_walk_and_embed_assets_no_js() { let opt_insecure = false; walk_and_embed_assets( + cache, &url, &dom.document, opt_no_css, diff --git a/src/tests/http.rs b/src/tests/http.rs index 003919b..b603c2b 100644 --- a/src/tests/http.rs +++ b/src/tests/http.rs @@ -1,13 +1,23 @@ use crate::http::retrieve_asset; - +use std::collections::HashMap; #[test] fn test_retrieve_asset() { - let (data, final_url) = - retrieve_asset("data:text/html;base64,...", true, "", "", true, false).unwrap(); + let cache = &mut HashMap::new(); + let (data, final_url) = retrieve_asset( + cache, + "data:text/html;base64,...", + true, + "", + "", + true, + false, + ) + .unwrap(); assert_eq!(&data, "data:text/html;base64,..."); assert_eq!(&final_url, "data:text/html;base64,..."); let (data, final_url) = retrieve_asset( + cache, "data:text/html;base64,...", true, "image/png",