/// Verifies `is_data_url` accepts an RFC 2397 data URL and rejects
/// HTTP(S) URLs, protocol-relative URLs, and the empty string.
///
/// Reconstructed from a garbled diff hunk: the patch changed
/// `is_data_url` to return a plain `bool` instead of `Result<bool, _>`,
/// so the `.unwrap_or(false)` calls were dropped from the assertions.
#[test]
fn test_is_data_url() {
    // passing
    assert!(is_data_url(
        "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
    ));
    // failing
    assert!(!is_data_url("https://kernel.org"));
    assert!(!is_data_url("//kernel.org"));
    assert!(!is_data_url(""));
}
/// Exercises `data_url_to_text` against the three payload forms it
/// supports: `;base64,`-encoded, explicit `;utf8,` plaintext, and bare
/// `,`-delimited plaintext. All three decode to the same sentence.
///
/// Reconstructed from a garbled diff hunk (new test added by the patch).
#[test]
fn test_data_url_to_text() {
    // base64-encoded payload
    assert_eq!(
        data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="),
        "Work expands so as to fill the time available for its completion"
    );

    // explicit utf8 plaintext payload
    assert_eq!(
        data_url_to_text(
            "data:text/html;utf8,Work expands so as to fill the time available for its completion"
        ),
        "Work expands so as to fill the time available for its completion"
    );

    // bare plaintext payload (no encoding specifier)
    assert_eq!(
        data_url_to_text(
            "data:text/html,Work expands so as to fill the time available for its completion"
        ),
        "Work expands so as to fill the time available for its completion"
    );
}
{ - static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); - static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap(); } @@ -82,19 +80,25 @@ pub fn detect_mimetype(data: &[u8]) -> String { return String::from_utf8(item[1].to_vec()).unwrap(); } } - "".to_owned() + str!() } pub fn url_has_protocol>(url: T) -> bool { - HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str()) + Url::parse(url.as_ref()) + .and_then(|u| Ok(u.scheme().len() > 0)) + .unwrap_or(false) } -pub fn is_data_url>(url: T) -> Result { - Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data")) +pub fn is_data_url>(url: T) -> bool { + Url::parse(url.as_ref()) + .and_then(|u| Ok(u.scheme() == "data")) + .unwrap_or(false) } -pub fn is_http_url>(path: T) -> bool { - REGEX_URL.is_match(path.as_ref()) +pub fn is_http_url>(url: T) -> bool { + Url::parse(url.as_ref()) + .and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https")) + .unwrap_or(false) } pub fn resolve_url, U: AsRef>(from: T, to: U) -> Result { @@ -205,3 +209,33 @@ pub fn clean_url>(url: T) -> String { } result.to_string() } + +pub fn data_url_to_text>(url: T) -> String { + let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap()); + let mut data: String = parsed_url.path().to_string(); + + if data.to_lowercase().starts_with("text/html") { + data = data.chars().skip(9).collect(); + + if data.starts_with(";") { + // Encoding specified, find out which one + data = data.chars().skip(1).collect(); + + if data.to_lowercase().starts_with("base64,") { + data = data.chars().skip(7).collect(); + String::from_utf8(decode(&data).unwrap_or(vec![])).unwrap_or(str!()) + } else if data.to_lowercase().starts_with("utf8,") { + data.chars().skip(5).collect() + } else { + str!() + } + } else if data.starts_with(",") { + // Plaintext, no encoding specified + data.chars().skip(1).collect() + } else { + str!() + } + } else { 
+ str!() + } +}