diff --git a/src/html.rs b/src/html.rs
index fa10175..3c7057f 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -7,6 +7,7 @@ use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset;
use js::attr_is_event_handler;
+use std::collections::HashMap;
use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
@@ -43,6 +44,7 @@ pub fn is_icon(attr_value: &str) -> bool {
}
pub fn walk_and_embed_assets(
+ cache: &mut HashMap,
url: &str,
node: &Handle,
opt_no_css: bool,
@@ -58,6 +60,7 @@ pub fn walk_and_embed_assets(
// Dig deeper
for child in node.children.borrow().iter() {
walk_and_embed_assets(
+ cache,
&url,
child,
opt_no_css,
@@ -103,6 +106,7 @@ pub fn walk_and_embed_assets(
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let (favicon_dataurl, _) = retrieve_asset(
+ cache,
&href_full_url,
true,
"",
@@ -126,6 +130,7 @@ pub fn walk_and_embed_assets(
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let (css_dataurl, _) = retrieve_asset(
+ cache,
&href_full_url,
true,
"text/css",
@@ -168,6 +173,7 @@ pub fn walk_and_embed_assets(
let src_full_url: String =
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
let (img_dataurl, _) = retrieve_asset(
+ cache,
&src_full_url,
true,
"",
@@ -201,6 +207,7 @@ pub fn walk_and_embed_assets(
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let (source_dataurl, _) = retrieve_asset(
+ cache,
&srcset_full_url,
true,
"",
@@ -247,6 +254,7 @@ pub fn walk_and_embed_assets(
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let (js_dataurl, _) = retrieve_asset(
+ cache,
&src_full_url,
true,
"application/javascript",
@@ -300,6 +308,7 @@ pub fn walk_and_embed_assets(
let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
let (iframe_data, iframe_final_url) = retrieve_asset(
+ cache,
&src_full_url,
false,
"text/html",
@@ -310,6 +319,7 @@ pub fn walk_and_embed_assets(
.unwrap_or((EMPTY_STRING.clone(), src_full_url));
let dom = html_to_dom(&iframe_data);
walk_and_embed_assets(
+ cache,
&iframe_final_url,
&dom.document,
opt_no_css,
@@ -344,6 +354,7 @@ pub fn walk_and_embed_assets(
let poster_full_url: String = resolve_url(&url, &video_poster)
.unwrap_or(EMPTY_STRING.clone());
let (poster_dataurl, _) = retrieve_asset(
+ cache,
&poster_full_url,
true,
"",
@@ -392,6 +403,7 @@ pub fn walk_and_embed_assets(
// Dig deeper
for child in node.children.borrow().iter() {
walk_and_embed_assets(
+ cache,
&url,
child,
opt_no_css,
diff --git a/src/http.rs b/src/http.rs
index 19bf902..97e77a7 100644
--- a/src/http.rs
+++ b/src/http.rs
@@ -1,9 +1,11 @@
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
use reqwest::Client;
+use std::collections::HashMap;
use std::time::Duration;
use utils::{data_to_dataurl, is_data_url};
pub fn retrieve_asset(
+ cache: &mut HashMap,
url: &str,
as_dataurl: bool,
mime: &str,
@@ -14,42 +16,54 @@ pub fn retrieve_asset(
if is_data_url(&url).unwrap() {
Ok((url.to_string(), url.to_string()))
} else {
- let client = Client::builder()
- .timeout(Duration::from_secs(10))
- .danger_accept_invalid_certs(opt_insecure)
- .build()?;
- let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
-
- if !opt_silent {
- if url == response.url().as_str() {
- eprintln!("[ {} ]", &url);
- } else {
- eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
+ if cache.contains_key(&url.to_string()) {
+ // Cache hit: return the stored payload, with the requested `url` as the final URL, without building a client or sending a request
+ if !opt_silent {
+ eprintln!("[ {} ] (from cache)", &url);
}
- }
-
- if as_dataurl {
- // Convert response into a byte array
- let mut data: Vec = vec![];
- response.copy_to(&mut data)?;
-
- // Attempt to obtain MIME type by reading the Content-Type header
- let mimetype = if mime == "" {
- response
- .headers()
- .get(CONTENT_TYPE)
- .and_then(|header| header.to_str().ok())
- .unwrap_or(&mime)
- } else {
- mime
- };
-
- Ok((
- data_to_dataurl(&mimetype, &data),
- response.url().to_string(),
- ))
+ let data = cache.get(&url.to_string()).unwrap();
+ Ok((data.to_string(), url.to_string()))
} else {
- Ok((response.text().unwrap(), response.url().to_string()))
+ // Cache miss: fetch the asset over HTTP (10-second client timeout) and record the result in `cache` before returning
+ let client = Client::builder()
+ .timeout(Duration::from_secs(10))
+ .danger_accept_invalid_certs(opt_insecure)
+ .build()?;
+ let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
+
+ if !opt_silent {
+ if url == response.url().as_str() {
+ eprintln!("[ {} ]", &url);
+ } else {
+ eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
+ }
+ }
+
+ if as_dataurl {
+ // Copy the whole response body into the `data` buffer
+ let mut data: Vec = vec![];
+ response.copy_to(&mut data)?;
+
+ // Prefer the caller-supplied `mime`; only when it is empty, try the Content-Type response header (falling back to the empty `mime`)
+ let mimetype = if mime == "" {
+ response
+ .headers()
+ .get(CONTENT_TYPE)
+ .and_then(|header| header.to_str().ok())
+ .unwrap_or(&mime)
+ } else {
+ mime
+ };
+ let dataurl = data_to_dataurl(&mimetype, &data);
+ // Cache the data URL. NOTE(review): the key is `response.url()`, but the lookup above uses the requested `url`; when the two differ the entry is never hit — confirm intent
+ cache.insert(response.url().to_string(), dataurl.to_string());
+ Ok((dataurl, response.url().to_string()))
+ } else {
+ let content = response.text().unwrap();
+ // Cache the body text in the same map as data URLs. NOTE(review): the lookup ignores `as_dataurl`, so a later hit may return the other representation — confirm
+ cache.insert(response.url().to_string(), content.clone());
+ Ok((content, response.url().to_string()))
+ }
}
}
}
diff --git a/src/main.rs b/src/main.rs
index e6b72ae..50fab9e 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -8,11 +8,14 @@ use args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url;
+use std::collections::HashMap;
fn main() {
let app_args = AppArgs::get();
+ let cache = &mut HashMap::new();
if is_valid_url(app_args.url_target.as_str()) {
let (data, final_url) = retrieve_asset(
+ cache,
app_args.url_target.as_str(),
false,
"",
@@ -24,6 +27,7 @@ fn main() {
let dom = html_to_dom(&data);
walk_and_embed_assets(
+ cache,
&final_url,
&dom.document,
app_args.no_css,
diff --git a/src/tests/html.rs b/src/tests/html.rs
index 11de822..b7a571b 100644
--- a/src/tests/html.rs
+++ b/src/tests/html.rs
@@ -3,6 +3,7 @@ use crate::html::{
};
use html5ever::rcdom::{Handle, NodeData};
use html5ever::serialize::{serialize, SerializeOpts};
+use std::collections::HashMap;
#[test]
fn test_is_icon() {
@@ -58,6 +59,8 @@ fn test_get_parent_node_name() {
#[test]
fn test_walk_and_embed_assets() {
+ let cache = &mut HashMap::new();
+
let html = "";
let dom = html_to_dom(&html);
let url = "http://localhost";
@@ -70,6 +73,7 @@ fn test_walk_and_embed_assets() {
let opt_insecure = false;
walk_and_embed_assets(
+ cache,
&url,
&dom.document,
opt_no_css,
@@ -95,6 +99,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
let html = "";
let dom = html_to_dom(&html);
let url = "http://localhost";
+ let cache = &mut HashMap::new();
let opt_no_css: bool = false;
let opt_no_frames: bool = false;
@@ -104,6 +109,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
let opt_insecure = false;
walk_and_embed_assets(
+ cache,
&url,
&dom.document,
opt_no_css,
@@ -131,6 +137,7 @@ fn test_walk_and_embed_assets_no_css() {
";
let dom = html_to_dom(&html);
let url = "http://localhost";
+ let cache = &mut HashMap::new();
let opt_no_css: bool = true;
let opt_no_frames: bool = false;
@@ -140,6 +147,7 @@ fn test_walk_and_embed_assets_no_css() {
let opt_insecure = false;
walk_and_embed_assets(
+ cache,
&url,
&dom.document,
opt_no_css,
@@ -174,6 +182,7 @@ fn test_walk_and_embed_assets_no_images() {
";
let dom = html_to_dom(&html);
let url = "http://localhost";
+ let cache = &mut HashMap::new();
let opt_no_css: bool = false;
let opt_no_frames: bool = false;
@@ -183,6 +192,7 @@ fn test_walk_and_embed_assets_no_images() {
let opt_insecure = false;
walk_and_embed_assets(
+ cache,
&url,
&dom.document,
opt_no_css,
@@ -219,6 +229,7 @@ fn test_walk_and_embed_assets_no_frames() {
let html = "";
let dom = html_to_dom(&html);
let url = "http://localhost";
+ let cache = &mut HashMap::new();
let opt_no_css: bool = false;
let opt_no_frames: bool = true;
@@ -228,6 +239,7 @@ fn test_walk_and_embed_assets_no_frames() {
let opt_insecure = false;
walk_and_embed_assets(
+ cache,
&url,
&dom.document,
opt_no_css,
@@ -256,6 +268,7 @@ fn test_walk_and_embed_assets_no_js() {
";
let dom = html_to_dom(&html);
let url = "http://localhost";
+ let cache = &mut HashMap::new();
let opt_no_css: bool = false;
let opt_no_frames: bool = false;
@@ -265,6 +278,7 @@ fn test_walk_and_embed_assets_no_js() {
let opt_insecure = false;
walk_and_embed_assets(
+ cache,
&url,
&dom.document,
opt_no_css,
diff --git a/src/tests/http.rs b/src/tests/http.rs
index 003919b..b603c2b 100644
--- a/src/tests/http.rs
+++ b/src/tests/http.rs
@@ -1,13 +1,23 @@
use crate::http::retrieve_asset;
-
+use std::collections::HashMap;
#[test]
fn test_retrieve_asset() {
- let (data, final_url) =
- retrieve_asset("data:text/html;base64,...", true, "", "", true, false).unwrap();
+ let cache = &mut HashMap::new();
+ let (data, final_url) = retrieve_asset(
+ cache,
+ "data:text/html;base64,...",
+ true,
+ "",
+ "",
+ true,
+ false,
+ )
+ .unwrap();
assert_eq!(&data, "data:text/html;base64,...");
assert_eq!(&final_url, "data:text/html;base64,...");
let (data, final_url) = retrieve_asset(
+ cache,
"data:text/html;base64,...",
true,
"image/png",