add option to blacklist/whitelist domains

This commit is contained in:
Sunshine 2022-09-01 19:35:52 -04:00 committed by GitHub
parent 54609b10e5
commit 89ce5029b9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 243 additions and 2 deletions

View file

@ -113,7 +113,9 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
- `-b`: Use custom `base URL` - `-b`: Use custom `base URL`
- `-c`: Exclude CSS - `-c`: Exclude CSS
- `-C`: Save document using custom `charset` - `-C`: Save document using custom `charset`
- `-d`: Allow retrieving assets only from specified `domain(s)`
- `-e`: Ignore network errors - `-e`: Ignore network errors
- `-E`: Treat the domain(s) specified via `-d` as a blacklist (avoid retrieving assets located within them)
- `-f`: Omit frames - `-f`: Omit frames
- `-F`: Exclude web fonts - `-F`: Exclude web fonts
- `-i`: Remove images - `-i`: Remove images

View file

@ -1,4 +1,4 @@
use clap::{App, Arg}; use clap::{App, Arg, ArgAction};
use std::env; use std::env;
#[derive(Default)] #[derive(Default)]
@ -7,7 +7,9 @@ pub struct Options {
pub base_url: Option<String>, pub base_url: Option<String>,
pub no_css: bool, pub no_css: bool,
pub charset: Option<String>, pub charset: Option<String>,
pub domains: Option<Vec<String>>,
pub ignore_errors: bool, pub ignore_errors: bool,
pub exclude_domains: bool,
pub no_frames: bool, pub no_frames: bool,
pub no_fonts: bool, pub no_fonts: bool,
pub no_images: bool, pub no_images: bool,
@ -50,7 +52,17 @@ impl Options {
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'") .args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
.args_from_usage("-c, --no-css 'Removes CSS'") .args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'") .args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
.arg(
Arg::with_name("domains")
.short('d')
.long("domains")
.takes_value(true)
.value_name("DOMAINS")
.action(ArgAction::Append)
.help("Whitelist of domains"),
)
.args_from_usage("-e, --ignore-errors 'Ignore network errors'") .args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-E, --exclude-domains 'Treat specified domains as blacklist'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'") .args_from_usage("-f, --no-frames 'Removes frames and iframes'")
.args_from_usage("-F, --no-fonts 'Removes fonts'") .args_from_usage("-F, --no-fonts 'Removes fonts'")
.args_from_usage("-i, --no-images 'Removes images'") .args_from_usage("-i, --no-images 'Removes images'")
@ -91,7 +103,12 @@ impl Options {
if let Some(charset) = app.value_of("charset") { if let Some(charset) = app.value_of("charset") {
options.charset = Some(charset.to_string()); options.charset = Some(charset.to_string());
} }
if let Some(domains) = app.get_many::<String>("domains") {
let list_of_domains: Vec<String> = domains.map(|v| v.clone()).collect::<Vec<_>>();
options.domains = Some(list_of_domains);
}
options.ignore_errors = app.is_present("ignore-errors"); options.ignore_errors = app.is_present("ignore-errors");
options.exclude_domains = app.is_present("exclude-domains");
options.no_frames = app.is_present("no-frames"); options.no_frames = app.is_present("no-frames");
options.no_fonts = app.is_present("no-fonts"); options.no_fonts = app.is_present("no-fonts");
options.no_images = app.is_present("no-images"); options.no_images = app.is_present("no-images");

View file

@ -92,6 +92,62 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
mime.to_string() mime.to_string()
} }
/// Checks whether `domain` is covered by the pattern `domain_to_match_against`.
///
/// Trailing dots are stripped from both inputs, which are then split into dot-separated labels
/// and compared from right to left (top-level label first), ignoring ASCII case.
///
/// Matching rules:
/// - an empty pattern matches nothing;
/// - the pattern `"."` matches everything, including an empty `domain`;
/// - a pattern without a leading dot (e.g. `"example.com"`) never matches a `domain` that has
///   more labels than the pattern, so sub-domains are rejected;
/// - a pattern with a leading dot (e.g. `".example.com"`) matches the domain itself as well as
///   any `domain` carrying extra labels to the left;
/// - an empty label inside the pattern accepts any label (or a missing one) at that position.
pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
    match domain_to_match_against {
        "" => return false,
        "." => return true,
        _ => {}
    }

    // Labels are stored right to left so that index 0 is always the top-level label.
    let candidate_labels: Vec<&str> = domain.trim_end_matches('.').rsplit('.').collect();
    let pattern_labels: Vec<&str> = domain_to_match_against
        .trim_end_matches('.')
        .rsplit('.')
        .collect();

    // Only a pattern that starts with a dot tolerates a candidate with more labels than itself.
    let accepts_extra_labels = domain_to_match_against.starts_with('.');
    if !accepts_extra_labels && candidate_labels.len() > pattern_labels.len() {
        return false;
    }

    // Pad the candidate with empty labels so every pattern label has a counterpart; positions
    // beyond the end of the pattern impose no constraint and therefore need no check.
    let padded_candidate = candidate_labels
        .iter()
        .copied()
        .chain(std::iter::repeat(""));

    pattern_labels
        .iter()
        .zip(padded_candidate)
        .all(|(expected, actual)| expected.is_empty() || expected.eq_ignore_ascii_case(actual))
}
pub fn indent(level: u32) -> String { pub fn indent(level: u32) -> String {
let mut result: String = String::new(); let mut result: String = String::new();
let mut l: u32 = level; let mut l: u32 = level;
@ -148,7 +204,7 @@ pub fn retrieve_asset(
let (media_type, charset, data) = parse_data_url(url); let (media_type, charset, data) = parse_data_url(url);
Ok((data, url.clone(), media_type, charset)) Ok((data, url.clone(), media_type, charset))
} else if url.scheme() == "file" { } else if url.scheme() == "file" {
// Check if parent_url is also file:/// (if not, then we don't embed the asset) // Check if parent_url is also a file: URL (if not, then we don't embed the asset)
if parent_url.scheme() != "file" { if parent_url.scheme() != "file" {
if !options.silent { if !options.silent {
eprintln!( eprintln!(
@ -236,6 +292,17 @@ pub fn retrieve_asset(
"".to_string(), "".to_string(),
)) ))
} else { } else {
if let Some(domains) = &options.domains {
let domain_matches = domains
.iter()
.any(|d| domain_is_within_domain(url.host_str().unwrap(), &d.trim()));
if (options.exclude_domains && domain_matches)
|| (!options.exclude_domains && !domain_matches)
{
return Err(client.get("").send().unwrap_err());
}
}
// URL not in cache, we retrieve the file // URL not in cache, we retrieve the file
match client.get(url.as_str()).send() { match client.get(url.as_str()).send() {
Ok(response) => { Ok(response) => {

View file

@ -0,0 +1,154 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::utils;

    /// Asserts that `domain` is reported as being within `parent`.
    ///
    /// Marked `#[track_caller]` so a failure is attributed to the calling test.
    #[track_caller]
    fn assert_within(domain: &str, parent: &str) {
        assert!(utils::domain_is_within_domain(domain, parent));
    }

    /// A sub-domain matches the same sub-domain written with a leading dot.
    #[test]
    fn sub_domain_is_within_dotted_sub_domain() {
        assert_within("news.ycombinator.com", ".news.ycombinator.com");
    }

    /// A domain matches itself written with a leading dot.
    #[test]
    fn domain_is_within_dotted_domain() {
        assert_within("ycombinator.com", ".ycombinator.com");
    }

    /// A sub-domain matches its parent domain written with a leading dot.
    #[test]
    fn sub_domain_is_within_dotted_domain() {
        assert_within("news.ycombinator.com", ".ycombinator.com");
    }

    /// A sub-domain matches its top-level domain written with a leading dot.
    #[test]
    fn sub_domain_is_within_dotted_top_level_domain() {
        assert_within("news.ycombinator.com", ".com");
    }

    /// A domain matches an identical, undotted domain.
    #[test]
    fn domain_is_within_itself() {
        assert_within("ycombinator.com", "ycombinator.com");
    }

    /// A trailing dot on the tested domain does not prevent a match.
    #[test]
    fn domain_with_trailing_dot_is_within_itself() {
        assert_within("ycombinator.com.", "ycombinator.com");
    }

    /// A single dot matches a domain that ends with a dot.
    #[test]
    fn domain_with_trailing_dot_is_within_single_dot() {
        assert_within("ycombinator.com.", ".");
    }

    /// A single dot matches an ordinary domain.
    #[test]
    fn domain_matches_single_dot() {
        assert_within("ycombinator.com", ".");
    }

    /// A domain with a leading dot matches the identical dotted domain.
    #[test]
    fn dotted_domain_must_be_within_dotted_domain() {
        assert_within(".ycombinator.com", ".ycombinator.com");
    }

    /// A single dot matches even an empty domain.
    #[test]
    fn empty_is_within_dot() {
        assert_within("", ".");
    }

    /// A single dot matches a single dot.
    #[test]
    fn both_dots() {
        assert_within(".", ".");
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::utils;

    /// Asserts that `domain` is reported as NOT being within `parent`.
    ///
    /// Marked `#[track_caller]` so a failure is attributed to the calling test.
    #[track_caller]
    fn assert_not_within(domain: &str, parent: &str) {
        assert!(!utils::domain_is_within_domain(domain, parent));
    }

    /// Without a leading dot, a parent domain does not cover its sub-domains.
    #[test]
    fn sub_domain_must_not_be_within_domain() {
        assert_not_within("news.ycombinator.com", "ycombinator.com");
    }

    /// Without a leading dot, a top-level domain does not cover domains under it.
    #[test]
    fn domain_must_not_be_within_top_level_domain() {
        assert_not_within("ycombinator.com", "com");
    }

    /// Unrelated domains never match.
    #[test]
    fn different_domains_must_not_be_within_one_another() {
        assert_not_within("news.ycombinator.com", "kernel.org");
    }

    /// A sub-domain does not match a different top-level domain.
    #[test]
    fn sub_domain_is_not_within_wrong_top_level_domain() {
        assert_not_within("news.ycombinator.com", "org");
    }

    /// A domain with a leading dot does not match the undotted form of itself.
    #[test]
    fn dotted_domain_is_not_within_domain() {
        assert_not_within(".ycombinator.com", "ycombinator.com");
    }

    /// A leading dot on the pattern does not make it match a different domain.
    #[test]
    fn different_domain_is_not_within_dotted_domain() {
        assert_not_within("www.doodleoptimize.com", ".ycombinator.com");
    }

    /// An empty pattern matches nothing.
    #[test]
    fn no_domain_can_be_within_empty_domain() {
        assert_not_within("ycombinator.com", "");
    }

    /// An empty pattern does not match an empty domain either.
    #[test]
    fn both_can_not_be_empty() {
        assert_not_within("", "");
    }
}

View file

@ -1,4 +1,5 @@
mod detect_media_type; mod detect_media_type;
mod domain_is_within_domain;
mod indent; mod indent;
mod parse_content_type; mod parse_content_type;
mod retrieve_asset; mod retrieve_asset;