Compare commits
5 Commits
Author | SHA1 | Date |
---|---|---|
Jakub Jirutka | 7c61b462dd | |
Simone Mosciatti | ef3684025b | |
Simone Mosciatti | db7ee697b3 | |
Sunshine | 89ce5029b9 | |
dependabot[bot] | 54609b10e5 |
|
@ -8,15 +8,6 @@ version = "1.0.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.7.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "alloc-no-stdlib"
|
||||
version = "2.0.3"
|
||||
|
@ -599,9 +590,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.44"
|
||||
version = "0.1.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "808cf7d67cf4a22adc5be66e75ebdf769b3f2ea032041437a7061f97a63dad4b"
|
||||
checksum = "ad2bfd338099682614d3ee3fe0cd72e0b6a41ca6a87f6a74a3bd593c91650501"
|
||||
dependencies = [
|
||||
"android_system_properties",
|
||||
"core-foundation-sys",
|
||||
|
@ -1211,8 +1202,6 @@ version = "1.6.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
|
|
|
@ -30,10 +30,15 @@ cssparser = "0.29.6"
|
|||
encoding_rs = "0.8.31"
|
||||
html5ever = "0.24.1"
|
||||
percent-encoding = "2.1.0"
|
||||
regex = "1.6.0" # Used for parsing srcset and NOSCRIPT
|
||||
sha2 = "0.10.2" # Used for calculating checksums during integrity checks
|
||||
url = "2.2.2"
|
||||
|
||||
# Used for parsing srcset and NOSCRIPT
|
||||
[dependencies.regex]
|
||||
version = "1.6.0"
|
||||
default-features = false
|
||||
features = ["std", "perf-dfa", "unicode-perl"]
|
||||
|
||||
[dependencies.reqwest]
|
||||
version = "0.11.11"
|
||||
default-features = false
|
||||
|
|
|
@ -113,7 +113,9 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
|
|||
- `-b`: Use custom `base URL`
|
||||
- `-c`: Exclude CSS
|
||||
- `-C`: Save document using custom `charset`
|
||||
- `-d`: Allow retrieving assets only from specified `domain(s)`
|
||||
- `-e`: Ignore network errors
|
||||
- `-E`: Avoid retrieving assets located within specified domains
|
||||
- `-f`: Omit frames
|
||||
- `-F`: Exclude web fonts
|
||||
- `-i`: Remove images
|
||||
|
|
90
src/main.rs
90
src/main.rs
|
@ -65,10 +65,9 @@ pub fn read_stdin() -> Vec<u8> {
|
|||
|
||||
fn main() {
|
||||
let options = Options::from_args();
|
||||
let mut target: String = options.target.clone();
|
||||
|
||||
// Check if target was provided
|
||||
if target.len() == 0 {
|
||||
if options.target.len() == 0 {
|
||||
if !options.silent {
|
||||
eprintln!("No target specified");
|
||||
}
|
||||
|
@ -83,65 +82,62 @@ fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
let target_url: Url;
|
||||
let mut use_stdin: bool = false;
|
||||
|
||||
// Determine exact target URL
|
||||
if target.clone() == "-" {
|
||||
// Read from pipe (stdin)
|
||||
use_stdin = true;
|
||||
// Set default target URL to an empty data URL; the user can set it via --base-url
|
||||
target_url = Url::parse("data:text/html,").unwrap();
|
||||
} else {
|
||||
match Url::parse(&target.clone()) {
|
||||
Ok(parsed_url) => {
|
||||
if parsed_url.scheme() == "data"
|
||||
|| parsed_url.scheme() == "file"
|
||||
|| (parsed_url.scheme() == "http" || parsed_url.scheme() == "https")
|
||||
{
|
||||
target_url = parsed_url;
|
||||
} else {
|
||||
let target_url = match options.target.as_str() {
|
||||
"-" => {
|
||||
// Read from pipe (stdin)
|
||||
use_stdin = true;
|
||||
// Set default target URL to an empty data URL; the user can set it via --base-url
|
||||
Url::parse("data:text/html,").unwrap()
|
||||
}
|
||||
target => match Url::parse(&target) {
|
||||
Ok(url) => match url.scheme() {
|
||||
"data" | "file" | "http" | "https" => url,
|
||||
unsupported_scheme => {
|
||||
if !options.silent {
|
||||
eprintln!("Unsupported target URL type: {}", &parsed_url.scheme());
|
||||
eprintln!("Unsupported target URL type: {}", unsupported_scheme);
|
||||
}
|
||||
process::exit(1);
|
||||
process::exit(1)
|
||||
}
|
||||
}
|
||||
Err(_err) => {
|
||||
},
|
||||
Err(_) => {
|
||||
// Failed to parse given target as a URL (perhaps it's a filesystem path?)
|
||||
let path: &Path = Path::new(&target);
|
||||
|
||||
if path.exists() {
|
||||
if path.is_file() {
|
||||
match Url::from_file_path(fs::canonicalize(&path).unwrap()) {
|
||||
Ok(file_url) => {
|
||||
target_url = file_url;
|
||||
}
|
||||
Err(_err) => {
|
||||
if !options.silent {
|
||||
eprintln!(
|
||||
"Could not generate file URL out of given path: {}",
|
||||
"err"
|
||||
);
|
||||
match path.exists() {
|
||||
true => match path.is_file() {
|
||||
true => {
|
||||
let canonical_path = fs::canonicalize(&path).unwrap();
|
||||
match Url::from_file_path(canonical_path) {
|
||||
Ok(url) => url,
|
||||
Err(_) => {
|
||||
if !options.silent {
|
||||
eprintln!(
|
||||
"Could not generate file URL out of given path: {}",
|
||||
&target
|
||||
);
|
||||
}
|
||||
process::exit(1);
|
||||
}
|
||||
process::exit(1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if !options.silent {
|
||||
eprintln!("Local target is not a file: {}", &options.target);
|
||||
false => {
|
||||
if !options.silent {
|
||||
eprintln!("Local target is not a file: {}", &target);
|
||||
}
|
||||
process::exit(1);
|
||||
}
|
||||
process::exit(1);
|
||||
},
|
||||
false => {
|
||||
// It is not a FS path, now we do what browsers do:
|
||||
// prepend "http://" and hope it points to a website
|
||||
Url::parse(&format!("http://{hopefully_url}", hopefully_url = &target))
|
||||
.unwrap()
|
||||
}
|
||||
} else {
|
||||
// Last chance, now we do what browsers do:
|
||||
// prepend "http://" and hope it points to a website
|
||||
target.insert_str(0, "http://");
|
||||
target_url = Url::parse(&target).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
// Initialize client
|
||||
let mut cache = HashMap::new();
|
||||
|
|
19
src/opts.rs
19
src/opts.rs
|
@ -1,4 +1,4 @@
|
|||
use clap::{App, Arg};
|
||||
use clap::{App, Arg, ArgAction};
|
||||
use std::env;
|
||||
|
||||
#[derive(Default)]
|
||||
|
@ -7,7 +7,9 @@ pub struct Options {
|
|||
pub base_url: Option<String>,
|
||||
pub no_css: bool,
|
||||
pub charset: Option<String>,
|
||||
pub domains: Option<Vec<String>>,
|
||||
pub ignore_errors: bool,
|
||||
pub exclude_domains: bool,
|
||||
pub no_frames: bool,
|
||||
pub no_fonts: bool,
|
||||
pub no_images: bool,
|
||||
|
@ -50,7 +52,17 @@ impl Options {
|
|||
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
|
||||
.args_from_usage("-c, --no-css 'Removes CSS'")
|
||||
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
|
||||
.arg(
|
||||
Arg::with_name("domains")
|
||||
.short('d')
|
||||
.long("domains")
|
||||
.takes_value(true)
|
||||
.value_name("DOMAINS")
|
||||
.action(ArgAction::Append)
|
||||
.help("Whitelist of domains"),
|
||||
)
|
||||
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
|
||||
.args_from_usage("-E, --exclude-domains 'Treat specified domains as blacklist'")
|
||||
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
||||
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
||||
.args_from_usage("-i, --no-images 'Removes images'")
|
||||
|
@ -91,7 +103,12 @@ impl Options {
|
|||
if let Some(charset) = app.value_of("charset") {
|
||||
options.charset = Some(charset.to_string());
|
||||
}
|
||||
if let Some(domains) = app.get_many::<String>("domains") {
|
||||
let list_of_domains: Vec<String> = domains.map(|v| v.clone()).collect::<Vec<_>>();
|
||||
options.domains = Some(list_of_domains);
|
||||
}
|
||||
options.ignore_errors = app.is_present("ignore-errors");
|
||||
options.exclude_domains = app.is_present("exclude-domains");
|
||||
options.no_frames = app.is_present("no-frames");
|
||||
options.no_fonts = app.is_present("no-fonts");
|
||||
options.no_images = app.is_present("no-images");
|
||||
|
|
69
src/utils.rs
69
src/utils.rs
|
@ -92,6 +92,62 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
|
|||
mime.to_string()
|
||||
}
|
||||
|
||||
/// Checks whether `domain` falls within `domain_to_match_against`.
///
/// Matching rules, as implemented below:
///
/// * An empty `domain_to_match_against` never matches.
/// * A `domain_to_match_against` of exactly `"."` matches everything.
/// * Trailing dots are stripped from both arguments, then the dot-separated
///   labels are compared from right to left, ignoring ASCII case.
/// * If `domain_to_match_against` starts with a dot (e.g. `".example.com"`),
///   `domain` may carry additional labels to the left, so both
///   `"example.com"` and `"www.example.com"` match.
/// * Without a leading dot, `domain` must consist of exactly the same labels.
///
/// Returns `true` when `domain` matches according to these rules.
pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
    if domain_to_match_against.is_empty() {
        return false;
    }

    if domain_to_match_against == "." {
        return true;
    }

    // Labels are collected right to left, so index 0 is the top-level label.
    let domain_labels: Vec<&str> = domain.trim_end_matches('.').rsplit('.').collect();
    let pattern_labels: Vec<&str> = domain_to_match_against
        .trim_end_matches('.')
        .rsplit('.')
        .collect();
    let pattern_starts_with_a_dot = domain_to_match_against.starts_with('.');

    let label_count = domain_labels.len().max(pattern_labels.len());

    (0..label_count).all(|i| match pattern_labels.get(i) {
        // The domain has more labels than the pattern: only acceptable when
        // the pattern started with a dot.
        None => pattern_starts_with_a_dot,
        // An empty pattern label (produced by the leading dot) matches any label.
        Some(pattern_label) if pattern_label.is_empty() => true,
        // Otherwise the labels must be equal; a missing domain label counts as "".
        Some(pattern_label) => {
            let domain_label = domain_labels.get(i).copied().unwrap_or("");
            pattern_label.eq_ignore_ascii_case(domain_label)
        }
    })
}
|
||||
|
||||
pub fn indent(level: u32) -> String {
|
||||
let mut result: String = String::new();
|
||||
let mut l: u32 = level;
|
||||
|
@ -148,7 +204,7 @@ pub fn retrieve_asset(
|
|||
let (media_type, charset, data) = parse_data_url(url);
|
||||
Ok((data, url.clone(), media_type, charset))
|
||||
} else if url.scheme() == "file" {
|
||||
// Check if parent_url is also file:/// (if not, then we don't embed the asset)
|
||||
// Check if parent_url is also a file: URL (if not, then we don't embed the asset)
|
||||
if parent_url.scheme() != "file" {
|
||||
if !options.silent {
|
||||
eprintln!(
|
||||
|
@ -236,6 +292,17 @@ pub fn retrieve_asset(
|
|||
"".to_string(),
|
||||
))
|
||||
} else {
|
||||
if let Some(domains) = &options.domains {
|
||||
let domain_matches = domains
|
||||
.iter()
|
||||
.any(|d| domain_is_within_domain(url.host_str().unwrap(), &d.trim()));
|
||||
if (options.exclude_domains && domain_matches)
|
||||
|| (!options.exclude_domains && !domain_matches)
|
||||
{
|
||||
return Err(client.get("").send().unwrap_err());
|
||||
}
|
||||
}
|
||||
|
||||
// URL not in cache, we retrieve the file
|
||||
match client.get(url.as_str()).send() {
|
||||
Ok(response) => {
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
|
||||
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
|
||||
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
|
||||
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
|
||||
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
|
||||
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||
|
||||
#[cfg(test)]
mod passing {
    use monolith::utils::domain_is_within_domain;

    // Each case binds the result first so the asserted pair is easy to read.

    #[test]
    fn sub_domain_is_within_dotted_sub_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", ".news.ycombinator.com");
        assert!(within);
    }

    #[test]
    fn domain_is_within_dotted_domain() {
        let within = domain_is_within_domain("ycombinator.com", ".ycombinator.com");
        assert!(within);
    }

    #[test]
    fn sub_domain_is_within_dotted_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", ".ycombinator.com");
        assert!(within);
    }

    #[test]
    fn sub_domain_is_within_dotted_top_level_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", ".com");
        assert!(within);
    }

    #[test]
    fn domain_is_within_itself() {
        let within = domain_is_within_domain("ycombinator.com", "ycombinator.com");
        assert!(within);
    }

    #[test]
    fn domain_with_trailing_dot_is_within_itself() {
        let within = domain_is_within_domain("ycombinator.com.", "ycombinator.com");
        assert!(within);
    }

    #[test]
    fn domain_with_trailing_dot_is_within_single_dot() {
        let within = domain_is_within_domain("ycombinator.com.", ".");
        assert!(within);
    }

    #[test]
    fn domain_matches_single_dot() {
        let within = domain_is_within_domain("ycombinator.com", ".");
        assert!(within);
    }

    #[test]
    fn dotted_domain_must_be_within_dotted_domain() {
        let within = domain_is_within_domain(".ycombinator.com", ".ycombinator.com");
        assert!(within);
    }

    #[test]
    fn empty_is_within_dot() {
        let within = domain_is_within_domain("", ".");
        assert!(within);
    }

    #[test]
    fn both_dots() {
        let within = domain_is_within_domain(".", ".");
        assert!(within);
    }
}
|
||||
|
||||
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
|
||||
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
|
||||
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
|
||||
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
|
||||
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
|
||||
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||
|
||||
#[cfg(test)]
mod failing {
    use monolith::utils::domain_is_within_domain;

    // Each case binds the result first, then asserts that it is a non-match.

    #[test]
    fn sub_domain_must_not_be_within_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", "ycombinator.com");
        assert!(!within);
    }

    #[test]
    fn domain_must_not_be_within_top_level_domain() {
        let within = domain_is_within_domain("ycombinator.com", "com");
        assert!(!within);
    }

    #[test]
    fn different_domains_must_not_be_within_one_another() {
        let within = domain_is_within_domain("news.ycombinator.com", "kernel.org");
        assert!(!within);
    }

    #[test]
    fn sub_domain_is_not_within_wrong_top_level_domain() {
        let within = domain_is_within_domain("news.ycombinator.com", "org");
        assert!(!within);
    }

    #[test]
    fn dotted_domain_is_not_within_domain() {
        let within = domain_is_within_domain(".ycombinator.com", "ycombinator.com");
        assert!(!within);
    }

    #[test]
    fn different_domain_is_not_within_dotted_domain() {
        let within = domain_is_within_domain("www.doodleoptimize.com", ".ycombinator.com");
        assert!(!within);
    }

    #[test]
    fn no_domain_can_be_within_empty_domain() {
        let within = domain_is_within_domain("ycombinator.com", "");
        assert!(!within);
    }

    #[test]
    fn both_can_not_be_empty() {
        let within = domain_is_within_domain("", "");
        assert!(!within);
    }
}
|
|
@ -1,4 +1,5 @@
|
|||
mod detect_media_type;
|
||||
mod domain_is_within_domain;
|
||||
mod indent;
|
||||
mod parse_content_type;
|
||||
mod retrieve_asset;
|
||||
|
|
Loading…
Reference in New Issue