2020-01-07 05:22:28 +01:00
|
|
|
use reqwest::blocking::Client;
|
2019-12-10 03:13:25 +01:00
|
|
|
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
2019-10-23 00:33:22 +02:00
|
|
|
use std::collections::HashMap;
|
2020-03-08 20:31:42 +01:00
|
|
|
use std::env;
|
|
|
|
use std::fs;
|
2020-12-26 08:23:29 +01:00
|
|
|
use std::io::{self, prelude::*, Error, Write};
|
2020-03-08 20:31:42 +01:00
|
|
|
use std::path::Path;
|
2020-01-15 04:26:04 +01:00
|
|
|
use std::process;
|
2019-12-10 03:13:25 +01:00
|
|
|
use std::time::Duration;
|
2019-08-23 05:17:15 +02:00
|
|
|
|
2020-07-14 08:58:29 +02:00
|
|
|
use monolith::html::{
|
2020-11-23 06:12:26 +01:00
|
|
|
add_favicon, create_metadata_tag, get_base_url, has_favicon, html_to_dom, set_base_url,
|
2020-08-01 07:44:09 +02:00
|
|
|
stringify_document, walk_and_embed_assets,
|
2020-07-14 08:58:29 +02:00
|
|
|
};
|
2020-06-28 07:36:41 +02:00
|
|
|
use monolith::opts::Options;
|
2020-07-14 08:58:29 +02:00
|
|
|
use monolith::url::{
|
2020-11-23 06:12:26 +01:00
|
|
|
data_to_data_url, is_data_url, is_file_url, is_http_url, parse_data_url, resolve_url,
|
2020-07-14 08:58:29 +02:00
|
|
|
};
|
2020-06-24 09:16:40 +02:00
|
|
|
use monolith::utils::retrieve_asset;
|
|
|
|
|
2020-03-29 09:54:20 +02:00
|
|
|
mod macros;
|
|
|
|
|
2020-01-15 04:26:04 +01:00
|
|
|
/// Destination the final HTML document is written to.
enum Output {
    /// Write to the process's standard output.
    Stdout(io::Stdout),
    /// Write to a file created on disk.
    File(fs::File),
}

impl Output {
    /// Opens the output destination named by `file_path`.
    ///
    /// An empty path or `"-"` selects standard output; any other value is
    /// passed to `fs::File::create`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error reported by `fs::File::create`.
    fn new(file_path: &str) -> Result<Output, Error> {
        if file_path.is_empty() || file_path.eq("-") {
            Ok(Output::Stdout(io::stdout()))
        } else {
            Ok(Output::File(fs::File::create(file_path)?))
        }
    }

    /// Writes `s` followed by a newline to the destination, then flushes it.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error raised by the write or by the flush.
    fn writeln_str(&mut self, s: &str) -> Result<(), Error> {
        // Both variants implement `Write`, so one code path serves both.
        let sink: &mut dyn Write = match self {
            Output::Stdout(stdout) => stdout,
            Output::File(file) => file,
        };
        writeln!(sink, "{}", s)?;
        sink.flush()
    }
}
|
|
|
|
|
2020-12-26 08:23:29 +01:00
|
|
|
/// Reads standard input line by line and returns it as one string.
///
/// Every line read is re-terminated with a single `\n`, so `\r\n` endings are
/// normalised and non-empty input always ends with a newline.
pub fn read_stdin() -> String {
    collect_lines(io::stdin().lock())
}

/// Concatenates the lines of `reader`, terminating each one with `\n`.
///
/// A line that is not valid UTF-8 contributes an empty line. Any other read
/// error ends the input: the `lines()` iterator keeps yielding a persistent
/// error indefinitely, so ignoring it would loop forever.
fn collect_lines<R: BufRead>(reader: R) -> String {
    let mut buffer = String::new();
    for line in reader.lines() {
        match line {
            Ok(text) => buffer.push_str(&text),
            // Undecodable bytes: the line has been consumed, keep an empty placeholder.
            Err(ref error) if error.kind() == io::ErrorKind::InvalidData => {}
            // Hard I/O failure: return what was read so far.
            Err(_) => break,
        }
        buffer.push('\n');
    }
    buffer
}
|
|
|
|
|
2019-08-23 05:17:15 +02:00
|
|
|
fn main() {
|
2020-06-28 07:36:41 +02:00
|
|
|
let options = Options::from_args();
|
|
|
|
let original_target: &str = &options.target;
|
2020-03-08 20:31:42 +01:00
|
|
|
let target_url: &str;
|
2020-11-23 06:12:26 +01:00
|
|
|
let mut base_url: String;
|
2020-07-14 08:58:29 +02:00
|
|
|
let mut dom;
|
2020-12-26 08:23:29 +01:00
|
|
|
let mut use_stdin: bool = false;
|
2019-12-26 06:41:03 +01:00
|
|
|
|
2020-03-08 20:31:42 +01:00
|
|
|
// Pre-process the input
|
|
|
|
let cwd_normalized: String =
|
|
|
|
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
|
2020-03-29 09:54:20 +02:00
|
|
|
let path = Path::new(original_target);
|
|
|
|
let mut target: String = str!(original_target.clone()).replace("\\", "/");
|
2020-03-08 20:31:42 +01:00
|
|
|
let path_is_relative: bool = path.is_relative();
|
2020-03-29 09:54:20 +02:00
|
|
|
|
2020-06-26 00:23:56 +02:00
|
|
|
// Determine exact target URL
|
2020-03-29 09:54:20 +02:00
|
|
|
if target.clone().len() == 0 {
|
2020-11-23 06:12:26 +01:00
|
|
|
if !options.silent {
|
|
|
|
eprintln!("No target specified");
|
|
|
|
}
|
2020-01-15 04:26:04 +01:00
|
|
|
process::exit(1);
|
2020-12-26 08:23:29 +01:00
|
|
|
} else if target.clone() == "-" {
|
|
|
|
// Read from pipe (stdin)
|
|
|
|
use_stdin = true;
|
|
|
|
// Default target URL to empty data URL; the user can control it via --base-url
|
|
|
|
target_url = "data:text/html,"
|
2020-03-29 09:54:20 +02:00
|
|
|
} else if is_http_url(target.clone()) || is_data_url(target.clone()) {
|
|
|
|
target_url = target.as_str();
|
|
|
|
} else if is_file_url(target.clone()) {
|
|
|
|
target_url = target.as_str();
|
2020-03-08 20:31:42 +01:00
|
|
|
} else if path.exists() {
|
|
|
|
if !path.is_file() {
|
2020-11-23 06:12:26 +01:00
|
|
|
if !options.silent {
|
|
|
|
eprintln!("Local target is not a file: {}", original_target);
|
|
|
|
}
|
2020-03-08 20:31:42 +01:00
|
|
|
process::exit(1);
|
|
|
|
}
|
2020-03-29 09:54:20 +02:00
|
|
|
target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" });
|
2020-03-08 20:31:42 +01:00
|
|
|
if path_is_relative {
|
2020-03-29 09:54:20 +02:00
|
|
|
target.insert_str(if cfg!(windows) { 8 } else { 7 }, &cwd_normalized);
|
|
|
|
target.insert_str(
|
2020-03-08 20:31:42 +01:00
|
|
|
if cfg!(windows) { 8 } else { 7 } + &cwd_normalized.len(),
|
|
|
|
"/",
|
|
|
|
);
|
|
|
|
}
|
2020-03-29 09:54:20 +02:00
|
|
|
target_url = target.as_str();
|
2020-03-08 20:31:42 +01:00
|
|
|
} else {
|
2020-03-29 09:54:20 +02:00
|
|
|
target.insert_str(0, "http://");
|
|
|
|
target_url = target.as_str();
|
2019-12-26 06:41:03 +01:00
|
|
|
}
|
|
|
|
|
2020-06-26 00:23:56 +02:00
|
|
|
// Define output
|
2020-06-28 07:36:41 +02:00
|
|
|
let mut output = Output::new(&options.output).expect("Could not prepare output");
|
2020-01-21 05:01:22 +01:00
|
|
|
|
|
|
|
// Initialize client
|
2020-01-15 04:26:04 +01:00
|
|
|
let mut cache = HashMap::new();
|
2020-01-21 05:01:22 +01:00
|
|
|
let mut header_map = HeaderMap::new();
|
2020-12-31 23:17:17 +01:00
|
|
|
if let Some(user_agent) = &options.user_agent {
|
|
|
|
header_map.insert(
|
|
|
|
USER_AGENT,
|
|
|
|
HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"),
|
|
|
|
);
|
|
|
|
}
|
2020-06-28 07:36:41 +02:00
|
|
|
let timeout: u64 = if options.timeout > 0 {
|
|
|
|
options.timeout
|
2020-02-03 06:38:21 +01:00
|
|
|
} else {
|
|
|
|
std::u64::MAX / 4
|
|
|
|
};
|
2020-01-21 05:01:22 +01:00
|
|
|
let client = Client::builder()
|
2020-02-03 06:38:21 +01:00
|
|
|
.timeout(Duration::from_secs(timeout))
|
2020-06-28 07:36:41 +02:00
|
|
|
.danger_accept_invalid_certs(options.insecure)
|
2020-01-21 05:01:22 +01:00
|
|
|
.default_headers(header_map)
|
|
|
|
.build()
|
|
|
|
.expect("Failed to initialize HTTP client");
|
2019-09-22 02:06:00 +02:00
|
|
|
|
2020-11-23 06:12:26 +01:00
|
|
|
// At this stage we assume that the base URL is the same as the target URL
|
|
|
|
base_url = str!(target_url);
|
|
|
|
|
2020-06-26 00:23:56 +02:00
|
|
|
// Retrieve target document
|
2020-12-26 08:23:29 +01:00
|
|
|
if use_stdin {
|
|
|
|
dom = html_to_dom(&read_stdin());
|
|
|
|
} else if is_file_url(target_url) || is_http_url(target_url) {
|
2020-11-23 03:49:26 +01:00
|
|
|
match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) {
|
2020-05-02 12:13:28 +02:00
|
|
|
Ok((data, final_url, _media_type)) => {
|
2020-11-23 06:12:26 +01:00
|
|
|
if options.base_url.clone().unwrap_or(str!()).is_empty() {
|
|
|
|
base_url = final_url
|
|
|
|
}
|
2020-05-02 12:13:28 +02:00
|
|
|
dom = html_to_dom(&String::from_utf8_lossy(&data));
|
|
|
|
}
|
|
|
|
Err(_) => {
|
2020-11-23 03:49:26 +01:00
|
|
|
if !options.silent {
|
|
|
|
eprintln!("Could not retrieve target document");
|
|
|
|
}
|
2020-05-02 12:13:28 +02:00
|
|
|
process::exit(1);
|
|
|
|
}
|
|
|
|
}
|
2020-02-13 06:56:30 +01:00
|
|
|
} else if is_data_url(target_url) {
|
2020-11-23 06:12:26 +01:00
|
|
|
let (media_type, data): (String, Vec<u8>) = parse_data_url(target_url);
|
2020-04-10 11:06:07 +02:00
|
|
|
if !media_type.eq_ignore_ascii_case("text/html") {
|
2020-11-23 06:12:26 +01:00
|
|
|
if !options.silent {
|
|
|
|
eprintln!("Unsupported data URL media type");
|
|
|
|
}
|
2020-02-14 05:46:08 +01:00
|
|
|
process::exit(1);
|
|
|
|
}
|
2020-05-02 12:13:28 +02:00
|
|
|
dom = html_to_dom(&String::from_utf8_lossy(&data));
|
2020-02-13 06:56:30 +01:00
|
|
|
} else {
|
|
|
|
process::exit(1);
|
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
|
2020-11-23 06:12:26 +01:00
|
|
|
// Use custom base URL if specified, read and use what's in the DOM otherwise
|
|
|
|
if !options.base_url.clone().unwrap_or(str!()).is_empty() {
|
|
|
|
if is_data_url(options.base_url.clone().unwrap()) {
|
|
|
|
if !options.silent {
|
|
|
|
eprintln!("Data URLs cannot be used as base URL");
|
|
|
|
}
|
|
|
|
process::exit(1);
|
|
|
|
} else {
|
|
|
|
base_url = options.base_url.clone().unwrap();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if let Some(existing_base_url) = get_base_url(&dom.document) {
|
|
|
|
base_url = resolve_url(target_url, existing_base_url).unwrap();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-01 07:44:09 +02:00
|
|
|
// Embed remote assets
|
|
|
|
walk_and_embed_assets(&mut cache, &client, &base_url, &dom.document, &options, 0);
|
|
|
|
|
2020-11-23 06:12:26 +01:00
|
|
|
// Update or add new BASE tag to reroute network requests and hash-links in the final document
|
|
|
|
if let Some(new_base_url) = options.base_url.clone() {
|
|
|
|
dom = set_base_url(&dom.document, new_base_url);
|
2020-08-01 07:44:09 +02:00
|
|
|
}
|
|
|
|
|
2020-07-14 08:58:29 +02:00
|
|
|
// Request and embed /favicon.ico (unless it's already linked in the document)
|
|
|
|
if !options.no_images && is_http_url(target_url) && !has_favicon(&dom.document) {
|
|
|
|
let favicon_ico_url: String = resolve_url(&base_url, "/favicon.ico").unwrap();
|
|
|
|
|
|
|
|
match retrieve_asset(
|
|
|
|
&mut cache,
|
|
|
|
&client,
|
|
|
|
&base_url,
|
|
|
|
&favicon_ico_url,
|
2020-11-23 03:49:26 +01:00
|
|
|
&options,
|
2020-07-14 08:58:29 +02:00
|
|
|
0,
|
|
|
|
) {
|
|
|
|
Ok((data, final_url, media_type)) => {
|
|
|
|
let favicon_data_url: String = data_to_data_url(&media_type, &data, &final_url);
|
|
|
|
dom = add_favicon(&dom.document, favicon_data_url);
|
|
|
|
}
|
|
|
|
Err(_) => {
|
|
|
|
// Failed to retrieve favicon.ico
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-26 00:23:56 +02:00
|
|
|
// Serialize DOM tree
|
2020-06-28 07:36:41 +02:00
|
|
|
let mut result: String = stringify_document(&dom.document, &options);
|
2020-01-15 04:26:04 +01:00
|
|
|
|
2020-06-26 00:23:56 +02:00
|
|
|
// Add metadata tag
|
2020-06-28 07:36:41 +02:00
|
|
|
if !options.no_metadata {
|
2020-12-26 08:23:29 +01:00
|
|
|
let metadata_comment: String = create_metadata_tag(&target_url);
|
2020-06-26 00:23:56 +02:00
|
|
|
result.insert_str(0, &metadata_comment);
|
|
|
|
if metadata_comment.len() > 0 {
|
|
|
|
result.insert_str(metadata_comment.len(), "\n");
|
2020-06-01 11:28:02 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
2020-04-26 02:59:34 +02:00
|
|
|
|
2020-06-26 00:23:56 +02:00
|
|
|
// Write result into stdout or file
|
2020-01-15 04:26:04 +01:00
|
|
|
output
|
2020-06-26 00:23:56 +02:00
|
|
|
.writeln_str(&result)
|
2020-01-15 04:26:04 +01:00
|
|
|
.expect("Could not write HTML output");
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|