add support for data URL targets
This commit is contained in:
parent
b8b6d8cff6
commit
3d2d40e7cd
4 changed files with 110 additions and 34 deletions
|
@ -13,7 +13,7 @@ pub fn retrieve_asset(
|
|||
) -> Result<(String, String), reqwest::Error> {
|
||||
let cache_key = clean_url(&url);
|
||||
|
||||
if is_data_url(&url).unwrap() {
|
||||
if is_data_url(&url) {
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
} else {
|
||||
if cache.contains_key(&cache_key) {
|
||||
|
|
35
src/main.rs
35
src/main.rs
|
@ -7,7 +7,7 @@ mod macros;
|
|||
use crate::args::AppArgs;
|
||||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||
use monolith::http::retrieve_asset;
|
||||
use monolith::utils::is_http_url;
|
||||
use monolith::utils::{data_url_to_text, is_data_url, is_http_url};
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use std::collections::HashMap;
|
||||
|
@ -46,11 +46,14 @@ impl Output {
|
|||
|
||||
fn main() {
|
||||
let app_args = AppArgs::get();
|
||||
let target_url: &str = app_args.url_target.as_str();
|
||||
let base_url;
|
||||
let dom;
|
||||
|
||||
if !is_http_url(app_args.url_target.as_str()) {
|
||||
if !is_http_url(target_url) && !is_data_url(target_url) {
|
||||
eprintln!(
|
||||
"Only HTTP and HTTPS URLs are allowed but got: {}",
|
||||
&app_args.url_target
|
||||
"Only HTTP(S) or data URLs are allowed but got: {}",
|
||||
&target_url
|
||||
);
|
||||
process::exit(1);
|
||||
}
|
||||
|
@ -78,21 +81,23 @@ fn main() {
|
|||
.expect("Failed to initialize HTTP client");
|
||||
|
||||
// Retrieve root document
|
||||
let (data, final_url) = retrieve_asset(
|
||||
&mut cache,
|
||||
&client,
|
||||
app_args.url_target.as_str(),
|
||||
false,
|
||||
"",
|
||||
app_args.silent,
|
||||
)
|
||||
.expect("Could not retrieve assets in HTML");
|
||||
let dom = html_to_dom(&data);
|
||||
if is_http_url(target_url) {
|
||||
let (data, final_url) =
|
||||
retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent)
|
||||
.expect("Could not retrieve assets in HTML");
|
||||
dom = html_to_dom(&data);
|
||||
base_url = final_url;
|
||||
} else if is_data_url(target_url) {
|
||||
base_url = target_url.to_string();
|
||||
dom = html_to_dom(&data_url_to_text(target_url));
|
||||
} else {
|
||||
process::exit(1);
|
||||
}
|
||||
|
||||
walk_and_embed_assets(
|
||||
&mut cache,
|
||||
&client,
|
||||
&final_url,
|
||||
&base_url,
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_js,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
use crate::utils::{
|
||||
clean_url, data_to_data_url, detect_mimetype, is_data_url, is_http_url, resolve_url,
|
||||
url_has_protocol,
|
||||
clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url,
|
||||
resolve_url, url_has_protocol,
|
||||
};
|
||||
use url::ParseError;
|
||||
|
||||
|
@ -144,20 +144,35 @@ fn test_resolve_url() -> Result<(), ParseError> {
|
|||
"https://www.w3schools.com/html/default.asp"
|
||||
);
|
||||
|
||||
let resolved_url = resolve_url(
|
||||
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
|
||||
"https://www.kernel.org/category/signatures.html",
|
||||
)?;
|
||||
assert_eq!(
|
||||
resolved_url.as_str(),
|
||||
"https://www.kernel.org/category/signatures.html"
|
||||
);
|
||||
|
||||
let resolved_url = resolve_url(
|
||||
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
|
||||
"//www.w3schools.com/html/html_iframe.asp",
|
||||
)
|
||||
.unwrap_or(str!());
|
||||
assert_eq!(resolved_url.as_str(), "");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_data_url() {
|
||||
// passing
|
||||
assert!(
|
||||
is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
|
||||
.unwrap_or(false)
|
||||
);
|
||||
assert!(is_data_url(
|
||||
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
|
||||
));
|
||||
// failing
|
||||
assert!(!is_data_url("https://kernel.org").unwrap_or(false));
|
||||
assert!(!is_data_url("//kernel.org").unwrap_or(false));
|
||||
assert!(!is_data_url("").unwrap_or(false));
|
||||
assert!(!is_data_url("https://kernel.org"));
|
||||
assert!(!is_data_url("//kernel.org"));
|
||||
assert!(!is_data_url(""));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -175,3 +190,25 @@ fn test_clean_url() {
|
|||
"https://somewhere.com/font.eot"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_data_url_to_text() {
|
||||
assert_eq!(
|
||||
data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="),
|
||||
"Work expands so as to fill the time available for its completion"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
data_url_to_text(
|
||||
"data:text/html;utf8,Work expands so as to fill the time available for its completion"
|
||||
),
|
||||
"Work expands so as to fill the time available for its completion"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
data_url_to_text(
|
||||
"data:text/html,Work expands so as to fill the time available for its completion"
|
||||
),
|
||||
"Work expands so as to fill the time available for its completion"
|
||||
);
|
||||
}
|
||||
|
|
52
src/utils.rs
52
src/utils.rs
|
@ -1,5 +1,5 @@
|
|||
use crate::http::retrieve_asset;
|
||||
use base64::encode;
|
||||
use base64::{decode, encode};
|
||||
use regex::Regex;
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
|
@ -37,8 +37,6 @@ use url::{ParseError, Url};
|
|||
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
|
||||
|
||||
lazy_static! {
|
||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
|
||||
}
|
||||
|
||||
|
@ -82,19 +80,25 @@ pub fn detect_mimetype(data: &[u8]) -> String {
|
|||
return String::from_utf8(item[1].to_vec()).unwrap();
|
||||
}
|
||||
}
|
||||
"".to_owned()
|
||||
str!()
|
||||
}
|
||||
|
||||
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
|
||||
HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str())
|
||||
Url::parse(url.as_ref())
|
||||
.and_then(|u| Ok(u.scheme().len() > 0))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {
|
||||
Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data"))
|
||||
pub fn is_data_url<T: AsRef<str>>(url: T) -> bool {
|
||||
Url::parse(url.as_ref())
|
||||
.and_then(|u| Ok(u.scheme() == "data"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
pub fn is_http_url<T: AsRef<str>>(path: T) -> bool {
|
||||
REGEX_URL.is_match(path.as_ref())
|
||||
pub fn is_http_url<T: AsRef<str>>(url: T) -> bool {
|
||||
Url::parse(url.as_ref())
|
||||
.and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
|
||||
|
@ -205,3 +209,33 @@ pub fn clean_url<T: AsRef<str>>(url: T) -> String {
|
|||
}
|
||||
result.to_string()
|
||||
}
|
||||
|
||||
pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
|
||||
let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap());
|
||||
let mut data: String = parsed_url.path().to_string();
|
||||
|
||||
if data.to_lowercase().starts_with("text/html") {
|
||||
data = data.chars().skip(9).collect();
|
||||
|
||||
if data.starts_with(";") {
|
||||
// Encoding specified, find out which one
|
||||
data = data.chars().skip(1).collect();
|
||||
|
||||
if data.to_lowercase().starts_with("base64,") {
|
||||
data = data.chars().skip(7).collect();
|
||||
String::from_utf8(decode(&data).unwrap_or(vec![])).unwrap_or(str!())
|
||||
} else if data.to_lowercase().starts_with("utf8,") {
|
||||
data.chars().skip(5).collect()
|
||||
} else {
|
||||
str!()
|
||||
}
|
||||
} else if data.starts_with(",") {
|
||||
// Plaintext, no encoding specified
|
||||
data.chars().skip(1).collect()
|
||||
} else {
|
||||
str!()
|
||||
}
|
||||
} else {
|
||||
str!()
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue