diff --git a/README.md b/README.md index 2ba7048..175f15a 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,10 @@ The guide can be found [here](docs/containers.md) --------------------------------------------------- ## Options - - `-c`: Ignore styles - - `-f`: Exclude frames - - `-F`: Omit web fonts + - `-c`: Exclude CSS + - `-e`: Ignore network errors + - `-f`: Omit frames + - `-F`: Exclude web fonts - `-i`: Remove images - `-I`: Isolate the document - `-j`: Exclude JavaScript diff --git a/docs/arch/0003-network-request-timeout.md b/docs/arch/0003-network-request-timeout.md index d1e9e59..3fc1d7b 100644 --- a/docs/arch/0003-network-request-timeout.md +++ b/docs/arch/0003-network-request-timeout.md @@ -1,4 +1,4 @@ -# 2. Network request timeout +# 3. Network request timeout Date: 2020-02-15 diff --git a/docs/arch/0005-asset-minimization.md b/docs/arch/0005-asset-minimization.md index 0c5e825..75890dd 100644 --- a/docs/arch/0005-asset-minimization.md +++ b/docs/arch/0005-asset-minimization.md @@ -1,4 +1,4 @@ -# 4. Asset Minimization +# 5. Asset Minimization Date: 2020-03-14 diff --git a/docs/arch/0006-reload-and-location-meta-tags.md b/docs/arch/0006-reload-and-location-meta-tags.md index 9605c9a..6f4d3d7 100644 --- a/docs/arch/0006-reload-and-location-meta-tags.md +++ b/docs/arch/0006-reload-and-location-meta-tags.md @@ -1,4 +1,4 @@ -# 4. Reload and location `meta` tags +# 6. Reload and location `meta` tags Date: 2020-06-25 diff --git a/docs/arch/0007-network-errors.md b/docs/arch/0007-network-errors.md new file mode 100644 index 0000000..1ca797d --- /dev/null +++ b/docs/arch/0007-network-errors.md @@ -0,0 +1,19 @@ +# 7. Network errors + +Date: 2020-11-22 + +## Status + +Accepted + +## Context + +Servers may return information with HTTP response codes other than `200`, however those responses may still contain useful data. + +## Decision + +Fail by default, notifying of the network error. Add option to continue retrieving assets by treating all response codes as `200`. + +## Consequences + +Monolith will fail to obtain resources with status other than `200`, unless told to ignore network errors. diff --git a/src/css.rs b/src/css.rs index 2a646fe..2fe6148 100644 --- a/src/css.rs +++ b/src/css.rs @@ -176,7 +176,7 @@ pub fn process_css<'a>( client, &parent_url, &import_full_url, - options.silent, + options, depth + 1, ) { Ok((import_contents, import_final_url, _import_media_type)) => { @@ -227,7 +227,7 @@ pub fn process_css<'a>( client, &parent_url, &resolved_url, - options.silent, + options, depth + 1, ) { Ok((data, final_url, media_type)) => { @@ -315,14 +315,8 @@ pub fn process_css<'a>( if is_import { let full_url = resolve_url(&parent_url, value).unwrap_or_default(); let url_fragment = get_url_fragment(full_url.clone()); - match retrieve_asset( - cache, - client, - &parent_url, - &full_url, - options.silent, - depth + 1, - ) { + match retrieve_asset(cache, client, &parent_url, &full_url, options, depth + 1) + { Ok((css, final_url, _media_type)) => { let data_url = data_to_data_url( "text/css", @@ -361,7 +355,7 @@ pub fn process_css<'a>( client, &parent_url, &full_url, - options.silent, + options, depth + 1, ) { Ok((data, final_url, media_type)) => { diff --git a/src/html.rs b/src/html.rs index 5a1ac4c..d16cba4 100644 --- a/src/html.rs +++ b/src/html.rs @@ -148,7 +148,7 @@ pub fn embed_srcset( client, &parent_url, &image_full_url, - options.silent, + options, depth + 1, ) { Ok((image_data, image_final_url, image_media_type)) => { @@ -539,7 +539,7 @@ pub fn walk_and_embed_assets( client, &url, &link_href_full_url, - options.silent, + options, depth + 1, ) { Ok(( @@ -612,7 +612,7 @@ pub fn walk_and_embed_assets( client, &url, &link_href_full_url, - options.silent, + options, depth + 1, ) { Ok(( @@ -723,7 +723,7 @@ pub fn walk_and_embed_assets( client, &url, &background_full_url, - options.silent, + options, depth + 1, ) { Ok((background_data, background_final_url, background_media_type)) => { @@ -807,7 +807,7 @@ pub fn walk_and_embed_assets( client, &url, &img_full_url, - options.silent, + options, depth + 1, ) { Ok((img_data, img_final_url, img_media_type)) => { @@ -898,7 +898,7 @@ pub fn walk_and_embed_assets( client, &url, &input_image_full_url, - options.silent, + options, depth + 1, ) { Ok(( @@ -961,7 +961,7 @@ pub fn walk_and_embed_assets( client, &url, &image_full_url, - options.silent, + options, depth + 1, ) { Ok((image_data, image_final_url, image_media_type)) => { @@ -1020,7 +1020,7 @@ pub fn walk_and_embed_assets( client, &url, &srcset_full_url, - options.silent, + options, depth + 1, ) { Ok((srcset_data, srcset_final_url, srcset_media_type)) => { @@ -1103,7 +1103,7 @@ pub fn walk_and_embed_assets( client, &url, &script_full_url, - options.silent, + options, depth + 1, ) { Ok((script_data, script_final_url, _script_media_type)) => { @@ -1196,7 +1196,7 @@ pub fn walk_and_embed_assets( client, &url, &frame_full_url, - options.silent, + options, depth + 1, ) { Ok((frame_data, frame_final_url, frame_media_type)) => { @@ -1269,7 +1269,7 @@ pub fn walk_and_embed_assets( client, &url, &video_poster_full_url, - options.silent, + options, depth + 1, ) { Ok(( diff --git a/src/main.rs b/src/main.rs index e8bba9e..4930d6a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -113,20 +113,15 @@ fn main() { // Retrieve target document if is_file_url(target_url) || is_http_url(target_url) { - match retrieve_asset( - &mut cache, - &client, - target_url, - target_url, - options.silent, - 0, - ) { + match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) { Ok((data, final_url, _media_type)) => { base_url = final_url; dom = html_to_dom(&String::from_utf8_lossy(&data)); } Err(_) => { - eprintln!("Could not retrieve target document"); + if !options.silent { + eprintln!("Could not retrieve target document"); + } process::exit(1); } } @@ -159,7 +154,7 @@ fn main() { &client, &base_url, &favicon_ico_url, - options.silent, + &options, 0, ) { Ok((data, final_url, media_type)) => { diff --git a/src/opts.rs b/src/opts.rs index aa8b239..81ae9e7 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -4,6 +4,7 @@ use clap::{App, Arg}; pub struct Options { pub target: String, pub no_css: bool, + pub ignore_errors: bool, pub no_fonts: bool, pub no_frames: bool, pub no_images: bool, @@ -45,6 +46,7 @@ impl Options { ) // .args_from_usage("-a, --no-audio 'Removes audio sources'") .args_from_usage("-c, --no-css 'Removes CSS'") + .args_from_usage("-e, --ignore-errors 'Ignore network errors'") .args_from_usage("-f, --no-frames 'Removes frames and iframes'") .args_from_usage("-F, --no-fonts 'Removes fonts'") .args_from_usage("-i, --no-images 'Removes images'") @@ -66,6 +68,7 @@ impl Options { .expect("please set target") .to_string(); options.no_css = app.is_present("no-css"); + options.ignore_errors = app.is_present("ignore-errors"); options.no_frames = app.is_present("no-frames"); options.no_fonts = app.is_present("no-fonts"); options.no_images = app.is_present("no-images"); diff --git a/src/tests/utils/retrieve_asset.rs b/src/tests/utils/retrieve_asset.rs index 023f2e3..919165d 100644 --- a/src/tests/utils/retrieve_asset.rs +++ b/src/tests/utils/retrieve_asset.rs @@ -11,6 +11,7 @@ mod passing { use std::collections::HashMap; use std::env; + use crate::opts::Options; use crate::url; use crate::utils; @@ -19,6 +20,9 @@ mod passing { let cache = &mut HashMap::new(); let client = Client::new(); + let mut options = Options::default(); + options.silent = true; + // If both source and target are data URLs, // ensure the result contains target data URL let (data, final_url, media_type) = utils::retrieve_asset( @@ -26,7 +30,7 @@ mod passing { &client, "data:text/html;base64,c291cmNl", "data:text/html;base64,dGFyZ2V0", - false, + &options, 0, ) .unwrap(); @@ -46,6 +50,9 @@ mod passing { let cache = &mut HashMap::new(); let client = Client::new(); + let mut options = Options::default(); + options.silent = true; + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; // Inclusion of local assets from local sources should be allowed @@ -63,7 +70,7 @@ mod passing { file = file_url_protocol, cwd = cwd.to_str().unwrap() ), - false, + &options, 0, ) .unwrap(); @@ -91,6 +98,7 @@ mod failing { use reqwest::blocking::Client; use std::collections::HashMap; + use crate::opts::Options; use crate::utils; #[test] @@ -98,13 +106,16 @@ mod failing { let cache = &mut HashMap::new(); let client = Client::new(); + let mut options = Options::default(); + options.silent = true; + // Inclusion of local assets from data URL sources should not be allowed match utils::retrieve_asset( cache, &client, "data:text/html;base64,SoUrCe", "file:///etc/passwd", - false, + &options, 0, ) { Ok((..)) => { @@ -121,13 +132,16 @@ mod failing { let cache = &mut HashMap::new(); let client = Client::new(); + let mut options = Options::default(); + options.silent = true; + // Inclusion of local assets from remote sources should not be allowed match utils::retrieve_asset( cache, &client, "https://kernel.org/", "file:///etc/passwd", - false, + &options, 0, ) { Ok((..)) => { diff --git a/src/utils.rs b/src/utils.rs index 9016b94..014ee6e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use std::fs; use std::path::Path; +use crate::opts::Options; use crate::url::{clean_url, data_url_to_data, file_url_to_fs_path, is_data_url, is_file_url}; const INDENT: &str = " "; @@ -73,7 +74,7 @@ pub fn retrieve_asset( client: &Client, parent_url: &str, url: &str, - opt_silent: bool, + options: &Options, depth: u32, ) -> Result<(Vec, String, String), reqwest::Error> { if url.len() == 0 { @@ -95,7 +96,7 @@ pub fn retrieve_asset( let fs_file_path: String = file_url_to_fs_path(url); let path = Path::new(&fs_file_path); if path.exists() { - if !opt_silent { + if !options.silent { eprintln!("{}{}", indent(depth).as_str(), &url); } @@ -109,7 +110,7 @@ pub fn retrieve_asset( if cache.contains_key(&cache_key) { // URL is in cache, we get and return it - if !opt_silent { + if !options.silent { eprintln!("{}{} (from cache)", indent(depth).as_str(), &url); } @@ -120,34 +121,46 @@ pub fn retrieve_asset( )) } else { // URL not in cache, we retrieve the file - let mut response = client.get(url).send()?; - let res_url = response.url().to_string(); + match client.get(url).send() { + Ok(mut response) => { + if !options.ignore_errors && response.status() != 200 { + if !options.silent { + eprintln!("Unable to retrieve {} ({})", &url, response.status()); + } + // Provoke error + return Err(client.get("").send().unwrap_err()); + } - if !opt_silent { - if url == res_url { - eprintln!("{}{}", indent(depth).as_str(), &url); - } else { - eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &res_url); + let res_url = response.url().to_string(); + + if !options.silent { + if url == res_url { + eprintln!("{}{}", indent(depth).as_str(), &url); + } else { + eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &res_url); + } + } + + let new_cache_key: String = clean_url(&res_url); + + // Convert response into a byte array + let mut data: Vec = vec![]; + response.copy_to(&mut data)?; + + // Attempt to obtain media type by reading the Content-Type header + let media_type = response + .headers() + .get(CONTENT_TYPE) + .and_then(|header| header.to_str().ok()) + .unwrap_or(""); + + // Add retrieved resource to cache + cache.insert(new_cache_key, data.clone()); + + Ok((data, res_url, media_type.to_string())) } + Err(error) => Err(error), } - - let new_cache_key: String = clean_url(&res_url); - - // Convert response into a byte array - let mut data: Vec = vec![]; - response.copy_to(&mut data)?; - - // Attempt to obtain media type by reading the Content-Type header - let media_type = response - .headers() - .get(CONTENT_TYPE) - .and_then(|header| header.to_str().ok()) - .unwrap_or(""); - - // Add to cache - cache.insert(new_cache_key, data.clone()); - - Ok((data, res_url, media_type.to_string())) } } }