From e89b79492b7197b68d9342ae7de5370aa03c131a Mon Sep 17 00:00:00 2001
From: Sunshine
Date: Sun, 27 Jun 2021 03:26:52 -1000
Subject: [PATCH] add option to exclude specific domains

---
 README.md                              |  2 +
 src/opts.rs                            | 10 +++
 src/utils.rs                           | 41 +++++++++++-
 tests/utils/domain_is_within_domain.rs | 91 ++++++++++++++++++++++++++
 tests/utils/mod.rs                     |  1 +
 5 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 tests/utils/domain_is_within_domain.rs

diff --git a/README.md b/README.md
index 722a53f..eaf8662 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,9 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
  - `-b`: Use custom `base URL`
  - `-c`: Exclude CSS
  - `-C`: Save document using custom `charset`
+ - `-D`: Allow retrieving assets only from specified `domains`
  - `-e`: Ignore network errors
+ - `-E`: Treat the list of specified `domains` as a blacklist (exclude assets located within them)
  - `-f`: Omit frames
  - `-F`: Exclude web fonts
  - `-i`: Remove images
diff --git a/src/opts.rs b/src/opts.rs
index f2618af..8ef0d2c 100644
--- a/src/opts.rs
+++ b/src/opts.rs
@@ -7,7 +7,9 @@ pub struct Options {
     pub base_url: Option<String>,
     pub no_css: bool,
     pub charset: Option<String>,
+    pub domains: Option<Vec<String>>,
     pub ignore_errors: bool,
+    pub exclude_domains: bool,
     pub no_frames: bool,
     pub no_fonts: bool,
     pub no_images: bool,
@@ -50,7 +52,11 @@ impl Options {
             .args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
             .args_from_usage("-c, --no-css 'Removes CSS'")
             .args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
+            .args_from_usage(
+                "-D, --domains=[bad.org,ads.site,0.0.0.0,127.0.0.0:8080] 'Whitelist of domains'",
+            )
             .args_from_usage("-e, --ignore-errors 'Ignore network errors'")
+            .args_from_usage("-E, --exclude-domains 'Treat list of specified domains as blacklist'")
             .args_from_usage("-f, --no-frames 'Removes frames and iframes'")
             .args_from_usage("-F, --no-fonts 'Removes fonts'")
             .args_from_usage("-i, --no-images 'Removes images'")
@@ -91,7 +97,11 @@ impl Options {
         if let Some(charset) = app.value_of("charset") {
             options.charset = Some(charset.to_string());
         }
+        if let Some(domains) = app.value_of("domains") {
+            options.domains = Some(domains.split(",").map(|s| s.to_string()).collect());
+        }
         options.ignore_errors = app.is_present("ignore-errors");
+        options.exclude_domains = app.is_present("exclude-domains");
         options.no_frames = app.is_present("no-frames");
         options.no_fonts = app.is_present("no-fonts");
         options.no_images = app.is_present("no-images");
diff --git a/src/utils.rs b/src/utils.rs
index 5d66a53..6ea15b7 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -92,6 +92,31 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
     mime.to_string()
 }
 
+/// Checks whether `domain` equals, or is a sub-domain of, `domain_to_match_against`.
+///
+/// Labels are compared from right to left, ignoring ASCII case. Leading dots of
+/// `domain_to_match_against` are ignored (`.example.com` behaves like `example.com`),
+/// and an empty pattern (`""` or `"."`) never matches.
+pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
+    let pattern: &str = domain_to_match_against.trim_start_matches('.');
+
+    // Without this guard an empty pattern would match hosts that end with a dot
+    if pattern.is_empty() {
+        return false;
+    }
+
+    let mut domain_labels = domain.rsplit('.');
+
+    // Each label of the pattern has to equal the label of the domain that is located
+    // at the same position (counted from the right); a domain that runs out of labels
+    // first is shorter than the pattern and therefore can't be within it
+    pattern.rsplit('.').all(|pattern_label| {
+        domain_labels
+            .next()
+            .map_or(false, |label| label.eq_ignore_ascii_case(pattern_label))
+    })
+}
+
 pub fn indent(level: u32) -> String {
     let mut result: String = String::new();
     let mut l: u32 = level;
@@ -148,7 +173,7 @@ pub fn retrieve_asset(
         let (media_type, charset, data) = parse_data_url(url);
         Ok((data, url.clone(), media_type, charset))
     } else if url.scheme() == "file" {
-        // Check if parent_url is also file:/// (if not, then we don't embed the asset)
+        // Check if parent_url is also a file: URL (if not, then we don't embed the asset)
         if parent_url.scheme() != "file" {
             if !options.silent {
                 eprintln!(
@@ -236,6 +261,20 @@ pub fn retrieve_asset(
                 "".to_string(),
             ))
         } else {
+            if let Some(domains) = &options.domains {
+                // URLs that have no host can't match any of the specified domains
+                let domain_matches: bool = url.host_str().map_or(false, |host| {
+                    domains
+                        .iter()
+                        .any(|d| domain_is_within_domain(host, d.trim()))
+                });
+
+                // Blacklist mode (-E) skips matching hosts, whitelist mode skips all others
+                if domain_matches == options.exclude_domains {
+                    return Err(client.get("").send().unwrap_err());
+                }
+            }
+
             // URL not in cache, we retrieve the file
             match client.get(url.as_str()).send() {
                 Ok(response) => {
diff --git a/tests/utils/domain_is_within_domain.rs b/tests/utils/domain_is_within_domain.rs
new file mode 100644
index 0000000..8a1ccea
--- /dev/null
+++ b/tests/utils/domain_is_within_domain.rs
@@ -0,0 +1,91 @@
+//  ██████╗  █████╗ ███████╗███████╗██╗███╗   ██╗ ██████╗
+//  ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗  ██║██╔════╝
+//  ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║  ███╗
+//  ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║   ██║
+//  ██║     ██║  ██║███████║███████║██║██║ ╚████║╚██████╔╝
+//  ╚═╝     ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═══╝ ╚═════╝
+
+#[cfg(test)]
+mod passing {
+    use monolith::utils;
+
+    #[test]
+    fn sub_domain_is_within_domain() {
+        assert!(utils::domain_is_within_domain(
+            "news.ycombinator.com",
+            "ycombinator.com"
+        ));
+    }
+
+    #[test]
+    fn sub_domain_is_within_dotted_domain() {
+        assert!(utils::domain_is_within_domain(
+            "news.ycombinator.com",
+            ".ycombinator.com"
+        ));
+    }
+
+    #[test]
+    fn domain_is_within_top_level_domain() {
+        assert!(utils::domain_is_within_domain("ycombinator.com", "com"));
+    }
+
+    #[test]
+    fn domain_is_within_itself() {
+        assert!(utils::domain_is_within_domain(
+            "ycombinator.com",
+            "ycombinator.com"
+        ));
+    }
+
+    #[test]
+    fn domain_is_within_dotted_itself() {
+        assert!(utils::domain_is_within_domain(
+            "ycombinator.com",
+            ".ycombinator.com"
+        ));
+    }
+}
+
+//  ███████╗ █████╗ ██╗██╗     ██╗███╗   ██╗ ██████╗
+//  ██╔════╝██╔══██╗██║██║     ██║████╗  ██║██╔════╝
+//  █████╗  ███████║██║██║     ██║██╔██╗ ██║██║  ███╗
+//  ██╔══╝  ██╔══██║██║██║     ██║██║╚██╗██║██║   ██║
+//  ██║     ██║  ██║██║███████╗██║██║ ╚████║╚██████╔╝
+//  ╚═╝     ╚═╝  ╚═╝╚═╝╚══════╝╚═╝╚═╝  ╚═══╝ ╚═════╝
+
+#[cfg(test)]
+mod failing {
+    use monolith::utils;
+
+    #[test]
+    fn sub_domain_is_not_within_domain() {
+        assert!(!utils::domain_is_within_domain(
+            "news.ycombinator.com",
+            "kernel.org"
+        ));
+    }
+
+    #[test]
+    fn sub_domain_is_not_within_top_level_domain() {
+        assert!(!utils::domain_is_within_domain(
+            "news.ycombinator.com",
+            "org"
+        ));
+    }
+
+    #[test]
+    fn no_domain_is_not_within_dot() {
+        assert!(!utils::domain_is_within_domain("news.ycombinator.com", "."));
+    }
+
+    #[test]
+    fn no_domain_is_not_within_empty_domain() {
+        assert!(!utils::domain_is_within_domain("news.ycombinator.com", ""));
+    }
+
+    #[test]
+    fn domain_with_trailing_dot_is_not_within_empty_domain() {
+        assert!(!utils::domain_is_within_domain("ycombinator.com.", ""));
+    }
+}
diff --git a/tests/utils/mod.rs b/tests/utils/mod.rs
index e7c7739..60a3ce6 100644
--- a/tests/utils/mod.rs
+++ b/tests/utils/mod.rs
@@ -1,4 +1,5 @@
 mod detect_media_type;
+mod domain_is_within_domain;
 mod indent;
 mod parse_content_type;
 mod retrieve_asset;