add option to exclude specific domains
This commit is contained in:
parent
9a782d5e9c
commit
e89b79492b
5 changed files with 139 additions and 1 deletions
|
@ -113,7 +113,9 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
|
||||||
- `-b`: Use custom `base URL`
|
- `-b`: Use custom `base URL`
|
||||||
- `-c`: Exclude CSS
|
- `-c`: Exclude CSS
|
||||||
- `-C`: Save document using custom `charset`
|
- `-C`: Save document using custom `charset`
|
||||||
|
- `-D`: Allow retrieving assets only from specified `domains`
|
||||||
- `-e`: Ignore network errors
|
- `-e`: Ignore network errors
|
||||||
|
- `-E`: Treat the list of `domains` given via `-D` as a blacklist (exclude all assets located within those domains)
|
||||||
- `-f`: Omit frames
|
- `-f`: Omit frames
|
||||||
- `-F`: Exclude web fonts
|
- `-F`: Exclude web fonts
|
||||||
- `-i`: Remove images
|
- `-i`: Remove images
|
||||||
|
|
10
src/opts.rs
10
src/opts.rs
|
@ -7,7 +7,9 @@ pub struct Options {
|
||||||
pub base_url: Option<String>,
|
pub base_url: Option<String>,
|
||||||
pub no_css: bool,
|
pub no_css: bool,
|
||||||
pub charset: Option<String>,
|
pub charset: Option<String>,
|
||||||
|
pub domains: Option<Vec<String>>,
|
||||||
pub ignore_errors: bool,
|
pub ignore_errors: bool,
|
||||||
|
pub exclude_domains: bool,
|
||||||
pub no_frames: bool,
|
pub no_frames: bool,
|
||||||
pub no_fonts: bool,
|
pub no_fonts: bool,
|
||||||
pub no_images: bool,
|
pub no_images: bool,
|
||||||
|
@ -50,7 +52,11 @@ impl Options {
|
||||||
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
|
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
|
||||||
.args_from_usage("-c, --no-css 'Removes CSS'")
|
.args_from_usage("-c, --no-css 'Removes CSS'")
|
||||||
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
|
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
|
||||||
|
.args_from_usage(
|
||||||
|
"-D, --domains=[bad.org,ads.site,0.0.0.0,127.0.0.0:8080] 'Whitelist of domains'",
|
||||||
|
)
|
||||||
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
|
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
|
||||||
|
.args_from_usage("-E, --exclude-domains 'Treat list of specified domains as blacklist'")
|
||||||
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
||||||
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
||||||
.args_from_usage("-i, --no-images 'Removes images'")
|
.args_from_usage("-i, --no-images 'Removes images'")
|
||||||
|
@ -91,7 +97,11 @@ impl Options {
|
||||||
if let Some(charset) = app.value_of("charset") {
|
if let Some(charset) = app.value_of("charset") {
|
||||||
options.charset = Some(charset.to_string());
|
options.charset = Some(charset.to_string());
|
||||||
}
|
}
|
||||||
|
if let Some(domains) = app.value_of("domains") {
|
||||||
|
options.domains = Some(domains.split(",").map(|s| s.to_string()).collect());
|
||||||
|
}
|
||||||
options.ignore_errors = app.is_present("ignore-errors");
|
options.ignore_errors = app.is_present("ignore-errors");
|
||||||
|
options.exclude_domains = app.is_present("exclude-domains");
|
||||||
options.no_frames = app.is_present("no-frames");
|
options.no_frames = app.is_present("no-frames");
|
||||||
options.no_fonts = app.is_present("no-fonts");
|
options.no_fonts = app.is_present("no-fonts");
|
||||||
options.no_images = app.is_present("no-images");
|
options.no_images = app.is_present("no-images");
|
||||||
|
|
41
src/utils.rs
41
src/utils.rs
|
@ -92,6 +92,36 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
|
||||||
mime.to_string()
|
mime.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reports whether `domain` equals `domain_to_match_against` or is one of its
/// sub-domains.
///
/// Both names are split on `.` and compared label by label, starting from the
/// right-most label, ignoring ASCII case. Leading dots in
/// `domain_to_match_against` are ignored, so `".example.com"` behaves like
/// `"example.com"`.
///
/// Returns `false` when `domain_to_match_against` is empty or consists only of
/// dots: an empty pattern names no domain, so nothing can be within it.
pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
    let pattern = domain_to_match_against.trim_start_matches('.');

    // Without this guard an empty pattern would split into a single empty
    // label and match any host whose last label is empty (e.g. "example.com."
    // or ""), which can happen with a stray comma in the domain list.
    if pattern.is_empty() {
        return false;
    }

    // Walk both names from the top-level label inwards; every label of the
    // pattern must have an equal counterpart in `domain`. Running out of
    // labels in `domain` first means it is shorter than the pattern.
    let mut domain_labels = domain.rsplit('.');
    pattern.rsplit('.').all(|expected| {
        domain_labels
            .next()
            .map_or(false, |actual| actual.eq_ignore_ascii_case(expected))
    })
}
|
||||||
|
|
||||||
pub fn indent(level: u32) -> String {
|
pub fn indent(level: u32) -> String {
|
||||||
let mut result: String = String::new();
|
let mut result: String = String::new();
|
||||||
let mut l: u32 = level;
|
let mut l: u32 = level;
|
||||||
|
@ -148,7 +178,7 @@ pub fn retrieve_asset(
|
||||||
let (media_type, charset, data) = parse_data_url(url);
|
let (media_type, charset, data) = parse_data_url(url);
|
||||||
Ok((data, url.clone(), media_type, charset))
|
Ok((data, url.clone(), media_type, charset))
|
||||||
} else if url.scheme() == "file" {
|
} else if url.scheme() == "file" {
|
||||||
// Check if parent_url is also file:/// (if not, then we don't embed the asset)
|
// Check if parent_url is also a file: URL (if not, then we don't embed the asset)
|
||||||
if parent_url.scheme() != "file" {
|
if parent_url.scheme() != "file" {
|
||||||
if !options.silent {
|
if !options.silent {
|
||||||
eprintln!(
|
eprintln!(
|
||||||
|
@ -236,6 +266,15 @@ pub fn retrieve_asset(
|
||||||
"".to_string(),
|
"".to_string(),
|
||||||
))
|
))
|
||||||
} else {
|
} else {
|
||||||
|
if let Some(domains) = &options.domains {
|
||||||
|
if domains
|
||||||
|
.iter()
|
||||||
|
.any(|d| domain_is_within_domain(url.host_str().unwrap(), &d.trim()))
|
||||||
|
{
|
||||||
|
return Err(client.get("").send().unwrap_err());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// URL not in cache, we retrieve the file
|
// URL not in cache, we retrieve the file
|
||||||
match client.get(url.as_str()).send() {
|
match client.get(url.as_str()).send() {
|
||||||
Ok(response) => {
|
Ok(response) => {
|
||||||
|
|
86
tests/utils/domain_is_within_domain.rs
Normal file
86
tests/utils/domain_is_within_domain.rs
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
|
||||||
|
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
|
||||||
|
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
|
||||||
|
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
|
||||||
|
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
|
||||||
|
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||||
|
|
||||||
|
#[cfg(test)]
mod passing {
    use monolith::utils::domain_is_within_domain;

    /// A sub-domain is reported as being within its parent domain.
    #[test]
    fn sub_domain_is_within_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", "ycombinator.com");
        assert!(within);
    }

    /// A leading dot on the domain to match against does not prevent a match.
    #[test]
    fn sub_domain_is_within_dotted_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", ".ycombinator.com");
        assert!(within);
    }

    /// A domain is reported as being within its top-level domain.
    #[test]
    fn domain_is_within_top_level_domain() {
        let within = domain_is_within_domain("ycombinator.com", "com");
        assert!(within);
    }

    /// A domain is reported as being within itself.
    #[test]
    fn domain_is_within_itself() {
        let within = domain_is_within_domain("ycombinator.com", "ycombinator.com");
        assert!(within);
    }

    /// A domain is reported as being within the dot-prefixed form of itself.
    #[test]
    fn sub_domain_is_within_dotted_itself() {
        let within = domain_is_within_domain("ycombinator.com", ".ycombinator.com");
        assert!(within);
    }
}
|
||||||
|
|
||||||
|
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
|
||||||
|
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
|
||||||
|
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
|
||||||
|
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
|
||||||
|
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
|
||||||
|
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||||
|
|
||||||
|
#[cfg(test)]
mod failing {
    use monolith::utils::domain_is_within_domain;

    /// A sub-domain is not within an unrelated domain.
    #[test]
    fn sub_domain_is_not_within_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", "kernel.org");
        assert!(!within);
    }

    /// A sub-domain is not within a top-level domain other than its own.
    #[test]
    fn sub_domain_is_not_within_top_level_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", "org");
        assert!(!within);
    }

    /// A lone dot is not a domain that anything can be within.
    #[test]
    fn no_domain_is_not_within_dot() {
        let within = domain_is_within_domain("news.ycombinator.com", ".");
        assert!(!within);
    }

    /// An empty string is not a domain that anything can be within.
    #[test]
    fn no_domain_is_within_empty_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", "");
        assert!(!within);
    }
}
|
|
@ -1,4 +1,5 @@
|
||||||
mod detect_media_type;
|
mod detect_media_type;
|
||||||
|
mod domain_is_within_domain;
|
||||||
mod indent;
|
mod indent;
|
||||||
mod parse_content_type;
|
mod parse_content_type;
|
||||||
mod retrieve_asset;
|
mod retrieve_asset;
|
||||||
|
|
Loading…
Reference in a new issue