Use HashMap as cache to minimize the number of HTTP requests (#75)
Use HashMap as cache to minimize the number of HTTP requests
This commit is contained in:
parent
8add3a8746
commit
1ff5e91087
5 changed files with 91 additions and 37 deletions
12
src/html.rs
12
src/html.rs
|
@ -7,6 +7,7 @@ use html5ever::tree_builder::{Attribute, TreeSink};
|
|||
use html5ever::{local_name, namespace_url, ns};
|
||||
use http::retrieve_asset;
|
||||
use js::attr_is_event_handler;
|
||||
use std::collections::HashMap;
|
||||
use std::default::Default;
|
||||
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
|
||||
|
||||
|
@ -43,6 +44,7 @@ pub fn is_icon(attr_value: &str) -> bool {
|
|||
}
|
||||
|
||||
pub fn walk_and_embed_assets(
|
||||
cache: &mut HashMap<String, String>,
|
||||
url: &str,
|
||||
node: &Handle,
|
||||
opt_no_css: bool,
|
||||
|
@ -58,6 +60,7 @@ pub fn walk_and_embed_assets(
|
|||
// Dig deeper
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
|
@ -103,6 +106,7 @@ pub fn walk_and_embed_assets(
|
|||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let (favicon_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
&href_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -126,6 +130,7 @@ pub fn walk_and_embed_assets(
|
|||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let (css_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
&href_full_url,
|
||||
true,
|
||||
"text/css",
|
||||
|
@ -168,6 +173,7 @@ pub fn walk_and_embed_assets(
|
|||
let src_full_url: String =
|
||||
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
|
||||
let (img_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
&src_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -201,6 +207,7 @@ pub fn walk_and_embed_assets(
|
|||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let (source_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
&srcset_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -247,6 +254,7 @@ pub fn walk_and_embed_assets(
|
|||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let (js_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
&src_full_url,
|
||||
true,
|
||||
"application/javascript",
|
||||
|
@ -300,6 +308,7 @@ pub fn walk_and_embed_assets(
|
|||
let src_full_url: String =
|
||||
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
|
||||
let (iframe_data, iframe_final_url) = retrieve_asset(
|
||||
cache,
|
||||
&src_full_url,
|
||||
false,
|
||||
"text/html",
|
||||
|
@ -310,6 +319,7 @@ pub fn walk_and_embed_assets(
|
|||
.unwrap_or((EMPTY_STRING.clone(), src_full_url));
|
||||
let dom = html_to_dom(&iframe_data);
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&iframe_final_url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
|
@ -344,6 +354,7 @@ pub fn walk_and_embed_assets(
|
|||
let poster_full_url: String = resolve_url(&url, &video_poster)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let (poster_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
&poster_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -392,6 +403,7 @@ pub fn walk_and_embed_assets(
|
|||
// Dig deeper
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
|
|
82
src/http.rs
82
src/http.rs
|
@ -1,9 +1,11 @@
|
|||
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use utils::{data_to_dataurl, is_data_url};
|
||||
|
||||
pub fn retrieve_asset(
|
||||
cache: &mut HashMap<String, String>,
|
||||
url: &str,
|
||||
as_dataurl: bool,
|
||||
mime: &str,
|
||||
|
@ -14,42 +16,54 @@ pub fn retrieve_asset(
|
|||
if is_data_url(&url).unwrap() {
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
} else {
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(opt_insecure)
|
||||
.build()?;
|
||||
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
|
||||
|
||||
if !opt_silent {
|
||||
if url == response.url().as_str() {
|
||||
eprintln!("[ {} ]", &url);
|
||||
} else {
|
||||
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
|
||||
if cache.contains_key(&url.to_string()) {
|
||||
// url is in cache
|
||||
if !opt_silent {
|
||||
eprintln!("[ {} ] (from cache)", &url);
|
||||
}
|
||||
}
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
response.copy_to(&mut data)?;
|
||||
|
||||
// Attempt to obtain MIME type by reading the Content-Type header
|
||||
let mimetype = if mime == "" {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.unwrap_or(&mime)
|
||||
} else {
|
||||
mime
|
||||
};
|
||||
|
||||
Ok((
|
||||
data_to_dataurl(&mimetype, &data),
|
||||
response.url().to_string(),
|
||||
))
|
||||
let data = cache.get(&url.to_string()).unwrap();
|
||||
Ok((data.to_string(), url.to_string()))
|
||||
} else {
|
||||
Ok((response.text().unwrap(), response.url().to_string()))
|
||||
// url not in cache, we request it
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(opt_insecure)
|
||||
.build()?;
|
||||
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
|
||||
|
||||
if !opt_silent {
|
||||
if url == response.url().as_str() {
|
||||
eprintln!("[ {} ]", &url);
|
||||
} else {
|
||||
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
|
||||
}
|
||||
}
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
response.copy_to(&mut data)?;
|
||||
|
||||
// Attempt to obtain MIME type by reading the Content-Type header
|
||||
let mimetype = if mime == "" {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.unwrap_or(&mime)
|
||||
} else {
|
||||
mime
|
||||
};
|
||||
let dataurl = data_to_dataurl(&mimetype, &data);
|
||||
// insert in cache
|
||||
cache.insert(response.url().to_string(), dataurl.to_string());
|
||||
Ok((dataurl, response.url().to_string()))
|
||||
} else {
|
||||
let content = response.text().unwrap();
|
||||
// insert in cache
|
||||
cache.insert(response.url().to_string(), content.clone());
|
||||
Ok((content, response.url().to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,11 +8,14 @@ use args::AppArgs;
|
|||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||
use monolith::http::retrieve_asset;
|
||||
use monolith::utils::is_valid_url;
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn main() {
|
||||
let app_args = AppArgs::get();
|
||||
let cache = &mut HashMap::new();
|
||||
if is_valid_url(app_args.url_target.as_str()) {
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
app_args.url_target.as_str(),
|
||||
false,
|
||||
"",
|
||||
|
@ -24,6 +27,7 @@ fn main() {
|
|||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&final_url,
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
|
|
|
@ -3,6 +3,7 @@ use crate::html::{
|
|||
};
|
||||
use html5ever::rcdom::{Handle, NodeData};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
|
||||
fn test_is_icon() {
|
||||
|
@ -58,6 +59,8 @@ fn test_get_parent_node_name() {
|
|||
|
||||
#[test]
|
||||
fn test_walk_and_embed_assets() {
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
let html = "<div><P></P></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
|
@ -70,6 +73,7 @@ fn test_walk_and_embed_assets() {
|
|||
let opt_insecure = false;
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
|
@ -95,6 +99,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
|
|||
let html = "<div><P></P><iframe src=\"\"></iframe></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
let opt_no_css: bool = false;
|
||||
let opt_no_frames: bool = false;
|
||||
|
@ -104,6 +109,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
|
|||
let opt_insecure = false;
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
|
@ -131,6 +137,7 @@ fn test_walk_and_embed_assets_no_css() {
|
|||
<div style=\"display: none;\"></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
let opt_no_css: bool = true;
|
||||
let opt_no_frames: bool = false;
|
||||
|
@ -140,6 +147,7 @@ fn test_walk_and_embed_assets_no_css() {
|
|||
let opt_insecure = false;
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
|
@ -174,6 +182,7 @@ fn test_walk_and_embed_assets_no_images() {
|
|||
<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
let opt_no_css: bool = false;
|
||||
let opt_no_frames: bool = false;
|
||||
|
@ -183,6 +192,7 @@ fn test_walk_and_embed_assets_no_images() {
|
|||
let opt_insecure = false;
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
|
@ -219,6 +229,7 @@ fn test_walk_and_embed_assets_no_frames() {
|
|||
let html = "<iframe src=\"http://trackbook.com\"></iframe>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
let opt_no_css: bool = false;
|
||||
let opt_no_frames: bool = true;
|
||||
|
@ -228,6 +239,7 @@ fn test_walk_and_embed_assets_no_frames() {
|
|||
let opt_insecure = false;
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
|
@ -256,6 +268,7 @@ fn test_walk_and_embed_assets_no_js() {
|
|||
</div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
let opt_no_css: bool = false;
|
||||
let opt_no_frames: bool = false;
|
||||
|
@ -265,6 +278,7 @@ fn test_walk_and_embed_assets_no_js() {
|
|||
let opt_insecure = false;
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
|
|
|
@ -1,13 +1,23 @@
|
|||
use crate::http::retrieve_asset;
|
||||
|
||||
use std::collections::HashMap;
|
||||
#[test]
|
||||
fn test_retrieve_asset() {
|
||||
let (data, final_url) =
|
||||
retrieve_asset("data:text/html;base64,...", true, "", "", true, false).unwrap();
|
||||
let cache = &mut HashMap::new();
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
"data:text/html;base64,...",
|
||||
true,
|
||||
"",
|
||||
"",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(&data, "data:text/html;base64,...");
|
||||
assert_eq!(&final_url, "data:text/html;base64,...");
|
||||
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
"data:text/html;base64,...",
|
||||
true,
|
||||
"image/png",
|
||||
|
|
Loading…
Reference in a new issue