Merge pull request #81 from Alch-Emi/shared-client

Use a shared client for HTTP requests
This commit is contained in:
Sunshine 2019-12-13 03:51:19 -05:00 committed by GitHub
commit b0fc24d77f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 64 additions and 79 deletions

View file

@ -7,6 +7,7 @@ use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns}; use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset; use http::retrieve_asset;
use js::attr_is_event_handler; use js::attr_is_event_handler;
use reqwest::Client;
use std::collections::HashMap; use std::collections::HashMap;
use std::default::Default; use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol}; use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
@ -45,14 +46,13 @@ pub fn is_icon(attr_value: &str) -> bool {
pub fn walk_and_embed_assets( pub fn walk_and_embed_assets(
cache: &mut HashMap<String, String>, cache: &mut HashMap<String, String>,
client: &Client,
url: &str, url: &str,
node: &Handle, node: &Handle,
opt_no_css: bool, opt_no_css: bool,
opt_no_js: bool, opt_no_js: bool,
opt_no_images: bool, opt_no_images: bool,
opt_user_agent: &str,
opt_silent: bool, opt_silent: bool,
opt_insecure: bool,
opt_no_frames: bool, opt_no_frames: bool,
) { ) {
match node.data { match node.data {
@ -61,14 +61,13 @@ pub fn walk_and_embed_assets(
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client,
&url, &url,
child, child,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
} }
@ -107,12 +106,11 @@ pub fn walk_and_embed_assets(
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let (favicon_dataurl, _) = retrieve_asset( let (favicon_dataurl, _) = retrieve_asset(
cache, cache,
client,
&href_full_url, &href_full_url,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
@ -131,23 +129,21 @@ pub fn walk_and_embed_assets(
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let replacement_text = match retrieve_asset( let replacement_text = match retrieve_asset(
cache, cache,
client,
&href_full_url, &href_full_url,
false, false,
"text/css", "text/css",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) { ) {
// On successful retrieval, traverse CSS // On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports( Ok((css_data, _)) => resolve_css_imports(
cache, cache,
client,
&css_data, &css_data,
true, true,
&href_full_url, &href_full_url,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
), ),
// If a network error occurred, warn // If a network error occurred, warn
@ -194,12 +190,11 @@ pub fn walk_and_embed_assets(
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
let (img_dataurl, _) = retrieve_asset( let (img_dataurl, _) = retrieve_asset(
cache, cache,
client,
&src_full_url, &src_full_url,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
@ -228,12 +223,11 @@ pub fn walk_and_embed_assets(
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let (source_dataurl, _) = retrieve_asset( let (source_dataurl, _) = retrieve_asset(
cache, cache,
client,
&srcset_full_url, &srcset_full_url,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
@ -275,12 +269,11 @@ pub fn walk_and_embed_assets(
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let (js_dataurl, _) = retrieve_asset( let (js_dataurl, _) = retrieve_asset(
cache, cache,
client,
&src_full_url, &src_full_url,
true, true,
"application/javascript", "application/javascript",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
@ -299,13 +292,12 @@ pub fn walk_and_embed_assets(
let mut tendril = contents.borrow_mut(); let mut tendril = contents.borrow_mut();
let replacement = resolve_css_imports( let replacement = resolve_css_imports(
cache, cache,
client,
tendril.as_ref(), tendril.as_ref(),
false, false,
&url, &url,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
); );
tendril.clear(); tendril.clear();
tendril.push_slice(&replacement); tendril.push_slice(&replacement);
@ -347,25 +339,23 @@ pub fn walk_and_embed_assets(
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
let (iframe_data, iframe_final_url) = retrieve_asset( let (iframe_data, iframe_final_url) = retrieve_asset(
cache, cache,
client,
&src_full_url, &src_full_url,
false, false,
"text/html", "text/html",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), src_full_url)); .unwrap_or((EMPTY_STRING.clone(), src_full_url));
let dom = html_to_dom(&iframe_data); let dom = html_to_dom(&iframe_data);
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client,
&iframe_final_url, &iframe_final_url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
@ -393,12 +383,11 @@ pub fn walk_and_embed_assets(
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let (poster_dataurl, _) = retrieve_asset( let (poster_dataurl, _) = retrieve_asset(
cache, cache,
client,
&poster_full_url, &poster_full_url,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((poster_full_url, EMPTY_STRING.clone())); .unwrap_or((poster_full_url, EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
@ -431,13 +420,12 @@ pub fn walk_and_embed_assets(
{ {
let replacement = resolve_css_imports( let replacement = resolve_css_imports(
cache, cache,
client,
attribute.value.as_ref(), attribute.value.as_ref(),
false, false,
&url, &url,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
); );
attribute.value.clear(); attribute.value.clear();
attribute.value.push_slice(&replacement); attribute.value.push_slice(&replacement);
@ -462,14 +450,13 @@ pub fn walk_and_embed_assets(
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client,
&url, &url,
child, child,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
} }

View file

@ -1,17 +1,15 @@
use reqwest::header::{CONTENT_TYPE, USER_AGENT}; use reqwest::header::CONTENT_TYPE;
use reqwest::Client; use reqwest::Client;
use std::collections::HashMap; use std::collections::HashMap;
use std::time::Duration;
use utils::{data_to_dataurl, is_data_url}; use utils::{data_to_dataurl, is_data_url};
pub fn retrieve_asset( pub fn retrieve_asset(
cache: &mut HashMap<String, String>, cache: &mut HashMap<String, String>,
client: &Client,
url: &str, url: &str,
as_dataurl: bool, as_dataurl: bool,
mime: &str, mime: &str,
opt_user_agent: &str,
opt_silent: bool, opt_silent: bool,
opt_insecure: bool,
) -> Result<(String, String), reqwest::Error> { ) -> Result<(String, String), reqwest::Error> {
if is_data_url(&url).unwrap() { if is_data_url(&url).unwrap() {
Ok((url.to_string(), url.to_string())) Ok((url.to_string(), url.to_string()))
@ -25,11 +23,7 @@ pub fn retrieve_asset(
Ok((data.to_string(), url.to_string())) Ok((data.to_string(), url.to_string()))
} else { } else {
// url not in cache, we request it // url not in cache, we request it
let client = Client::builder() let mut response = client.get(url).send()?;
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(opt_insecure)
.build()?;
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
if !opt_silent { if !opt_silent {
if url == response.url().as_str() { if url == response.url().as_str() {

View file

@ -1,6 +1,7 @@
#[macro_use] #[macro_use]
extern crate clap; extern crate clap;
extern crate monolith; extern crate monolith;
extern crate reqwest;
mod args; mod args;
@ -8,34 +9,50 @@ use args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset; use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url; use monolith::utils::is_valid_url;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap; use std::collections::HashMap;
use std::time::Duration;
fn main() { fn main() {
let app_args = AppArgs::get(); let app_args = AppArgs::get();
let cache = &mut HashMap::new(); let cache = &mut HashMap::new();
if is_valid_url(app_args.url_target.as_str()) { if is_valid_url(app_args.url_target.as_str()) {
// Initialize client
let mut header_map = HeaderMap::new();
match HeaderValue::from_str(&app_args.user_agent) {
Ok(header) => header_map.insert(USER_AGENT, header),
Err(err) => {
eprintln!("Invalid user agent! {}", err);
return;
}
};
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
let (data, final_url) = retrieve_asset( let (data, final_url) = retrieve_asset(
cache, cache,
&client,
app_args.url_target.as_str(), app_args.url_target.as_str(),
false, false,
"", "",
app_args.user_agent.as_str(),
app_args.silent, app_args.silent,
app_args.insecure,
) )
.unwrap(); .unwrap();
let dom = html_to_dom(&data); let dom = html_to_dom(&data);
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&final_url, &final_url,
&dom.document, &dom.document,
app_args.no_css, app_args.no_css,
app_args.no_js, app_args.no_js,
app_args.no_images, app_args.no_images,
app_args.user_agent.as_str(),
app_args.silent, app_args.silent,
app_args.insecure,
app_args.no_frames, app_args.no_frames,
); );

View file

@ -70,18 +70,18 @@ fn test_walk_and_embed_assets() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -106,18 +106,18 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -144,18 +144,17 @@ fn test_walk_and_embed_assets_no_css() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false; let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -189,18 +188,18 @@ fn test_walk_and_embed_assets_no_images() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = true; let opt_no_images: bool = true;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -236,18 +235,17 @@ fn test_walk_and_embed_assets_no_frames() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false; let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -275,18 +273,18 @@ fn test_walk_and_embed_assets_no_js() {
let opt_no_js: bool = true; let opt_no_js: bool = true;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );

View file

@ -3,26 +3,18 @@ use std::collections::HashMap;
#[test] #[test]
fn test_retrieve_asset() { fn test_retrieve_asset() {
let cache = &mut HashMap::new(); let cache = &mut HashMap::new();
let (data, final_url) = retrieve_asset( let client = reqwest::Client::new();
cache, let (data, final_url) =
"data:text/html;base64,...", retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap();
true,
"",
"",
true,
false,
)
.unwrap();
assert_eq!(&data, "data:text/html;base64,..."); assert_eq!(&data, "data:text/html;base64,...");
assert_eq!(&final_url, "data:text/html;base64,..."); assert_eq!(&final_url, "data:text/html;base64,...");
let (data, final_url) = retrieve_asset( let (data, final_url) = retrieve_asset(
cache, cache,
&client,
"data:text/html;base64,...", "data:text/html;base64,...",
true, true,
"image/png", "image/png",
"",
true,
false, false,
) )
.unwrap(); .unwrap();

View file

@ -3,6 +3,7 @@ extern crate base64;
use self::base64::encode; use self::base64::encode;
use http::retrieve_asset; use http::retrieve_asset;
use regex::Regex; use regex::Regex;
use reqwest::Client;
use std::collections::HashMap; use std::collections::HashMap;
use url::{ParseError, Url}; use url::{ParseError, Url};
@ -112,13 +113,12 @@ pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<Strin
pub fn resolve_css_imports( pub fn resolve_css_imports(
cache: &mut HashMap<String, String>, cache: &mut HashMap<String, String>,
client: &Client,
css_string: &str, css_string: &str,
as_dataurl: bool, as_dataurl: bool,
href: &str, href: &str,
opt_no_images: bool, opt_no_images: bool,
opt_user_agent: &str,
opt_silent: bool, opt_silent: bool,
opt_insecure: bool,
) -> String { ) -> String {
let mut resolved_css = String::from(css_string); let mut resolved_css = String::from(css_string);
@ -141,35 +141,32 @@ pub fn resolve_css_imports(
// The link is an @import link // The link is an @import link
retrieve_asset( retrieve_asset(
cache, cache,
client,
&embedded_url, &embedded_url,
false, // Formatting as data URL will be done later false, // Formatting as data URL will be done later
"text/css", // Expect CSS "text/css", // Expect CSS
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.map(|(content, _)| { .map(|(content, _)| {
resolve_css_imports( resolve_css_imports(
cache, cache,
client,
&content, &content,
true, // Finally, convert to a dataurl true, // Finally, convert to a dataurl
&embedded_url, &embedded_url,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
}) })
} else if (is_image && !opt_no_images) || is_font { } else if (is_image && !opt_no_images) || is_font {
// The link is some other, non-@import link // The link is some other, non-@import link
retrieve_asset( retrieve_asset(
cache, cache,
client,
&embedded_url, &embedded_url,
true, // Format as data URL true, // Format as data URL
"", // Unknown MIME type "", // Unknown MIME type
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.map(|(a, _)| a) .map(|(a, _)| a)
} else { } else {