add support for data URL targets
This commit is contained in:
parent
b8b6d8cff6
commit
3d2d40e7cd
4 changed files with 110 additions and 34 deletions
|
@ -13,7 +13,7 @@ pub fn retrieve_asset(
|
||||||
) -> Result<(String, String), reqwest::Error> {
|
) -> Result<(String, String), reqwest::Error> {
|
||||||
let cache_key = clean_url(&url);
|
let cache_key = clean_url(&url);
|
||||||
|
|
||||||
if is_data_url(&url).unwrap() {
|
if is_data_url(&url) {
|
||||||
Ok((url.to_string(), url.to_string()))
|
Ok((url.to_string(), url.to_string()))
|
||||||
} else {
|
} else {
|
||||||
if cache.contains_key(&cache_key) {
|
if cache.contains_key(&cache_key) {
|
||||||
|
|
35
src/main.rs
35
src/main.rs
|
@ -7,7 +7,7 @@ mod macros;
|
||||||
use crate::args::AppArgs;
|
use crate::args::AppArgs;
|
||||||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||||
use monolith::http::retrieve_asset;
|
use monolith::http::retrieve_asset;
|
||||||
use monolith::utils::is_http_url;
|
use monolith::utils::{data_url_to_text, is_data_url, is_http_url};
|
||||||
use reqwest::blocking::Client;
|
use reqwest::blocking::Client;
|
||||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
@ -46,11 +46,14 @@ impl Output {
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let app_args = AppArgs::get();
|
let app_args = AppArgs::get();
|
||||||
|
let target_url: &str = app_args.url_target.as_str();
|
||||||
|
let base_url;
|
||||||
|
let dom;
|
||||||
|
|
||||||
if !is_http_url(app_args.url_target.as_str()) {
|
if !is_http_url(target_url) && !is_data_url(target_url) {
|
||||||
eprintln!(
|
eprintln!(
|
||||||
"Only HTTP and HTTPS URLs are allowed but got: {}",
|
"Only HTTP(S) or data URLs are allowed but got: {}",
|
||||||
&app_args.url_target
|
&target_url
|
||||||
);
|
);
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
@ -78,21 +81,23 @@ fn main() {
|
||||||
.expect("Failed to initialize HTTP client");
|
.expect("Failed to initialize HTTP client");
|
||||||
|
|
||||||
// Retrieve root document
|
// Retrieve root document
|
||||||
let (data, final_url) = retrieve_asset(
|
if is_http_url(target_url) {
|
||||||
&mut cache,
|
let (data, final_url) =
|
||||||
&client,
|
retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent)
|
||||||
app_args.url_target.as_str(),
|
.expect("Could not retrieve assets in HTML");
|
||||||
false,
|
dom = html_to_dom(&data);
|
||||||
"",
|
base_url = final_url;
|
||||||
app_args.silent,
|
} else if is_data_url(target_url) {
|
||||||
)
|
base_url = target_url.to_string();
|
||||||
.expect("Could not retrieve assets in HTML");
|
dom = html_to_dom(&data_url_to_text(target_url));
|
||||||
let dom = html_to_dom(&data);
|
} else {
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
walk_and_embed_assets(
|
walk_and_embed_assets(
|
||||||
&mut cache,
|
&mut cache,
|
||||||
&client,
|
&client,
|
||||||
&final_url,
|
&base_url,
|
||||||
&dom.document,
|
&dom.document,
|
||||||
app_args.no_css,
|
app_args.no_css,
|
||||||
app_args.no_js,
|
app_args.no_js,
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
use crate::utils::{
|
use crate::utils::{
|
||||||
clean_url, data_to_data_url, detect_mimetype, is_data_url, is_http_url, resolve_url,
|
clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url,
|
||||||
url_has_protocol,
|
resolve_url, url_has_protocol,
|
||||||
};
|
};
|
||||||
use url::ParseError;
|
use url::ParseError;
|
||||||
|
|
||||||
|
@ -144,20 +144,35 @@ fn test_resolve_url() -> Result<(), ParseError> {
|
||||||
"https://www.w3schools.com/html/default.asp"
|
"https://www.w3schools.com/html/default.asp"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let resolved_url = resolve_url(
|
||||||
|
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
|
||||||
|
"https://www.kernel.org/category/signatures.html",
|
||||||
|
)?;
|
||||||
|
assert_eq!(
|
||||||
|
resolved_url.as_str(),
|
||||||
|
"https://www.kernel.org/category/signatures.html"
|
||||||
|
);
|
||||||
|
|
||||||
|
let resolved_url = resolve_url(
|
||||||
|
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
|
||||||
|
"//www.w3schools.com/html/html_iframe.asp",
|
||||||
|
)
|
||||||
|
.unwrap_or(str!());
|
||||||
|
assert_eq!(resolved_url.as_str(), "");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_is_data_url() {
|
fn test_is_data_url() {
|
||||||
// passing
|
// passing
|
||||||
assert!(
|
assert!(is_data_url(
|
||||||
is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
|
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
|
||||||
.unwrap_or(false)
|
));
|
||||||
);
|
|
||||||
// failing
|
// failing
|
||||||
assert!(!is_data_url("https://kernel.org").unwrap_or(false));
|
assert!(!is_data_url("https://kernel.org"));
|
||||||
assert!(!is_data_url("//kernel.org").unwrap_or(false));
|
assert!(!is_data_url("//kernel.org"));
|
||||||
assert!(!is_data_url("").unwrap_or(false));
|
assert!(!is_data_url(""));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -175,3 +190,25 @@ fn test_clean_url() {
|
||||||
"https://somewhere.com/font.eot"
|
"https://somewhere.com/font.eot"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_data_url_to_text() {
    // Every input below carries the same sentence; only the way the payload
    // is encoded inside the data URL differs (base64, explicit "utf8" marker,
    // or no encoding marker at all).
    let expected = "Work expands so as to fill the time available for its completion";
    let inputs = [
        "data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==",
        "data:text/html;utf8,Work expands so as to fill the time available for its completion",
        "data:text/html,Work expands so as to fill the time available for its completion",
    ];

    for &input in &inputs {
        assert_eq!(data_url_to_text(input), expected);
    }
}
|
||||||
|
|
52
src/utils.rs
52
src/utils.rs
|
@ -1,5 +1,5 @@
|
||||||
use crate::http::retrieve_asset;
|
use crate::http::retrieve_asset;
|
||||||
use base64::encode;
|
use base64::{decode, encode};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use reqwest::blocking::Client;
|
use reqwest::blocking::Client;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
@ -37,8 +37,6 @@ use url::{ParseError, Url};
|
||||||
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
|
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
|
||||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
|
||||||
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
|
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -82,19 +80,25 @@ pub fn detect_mimetype(data: &[u8]) -> String {
|
||||||
return String::from_utf8(item[1].to_vec()).unwrap();
|
return String::from_utf8(item[1].to_vec()).unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"".to_owned()
|
str!()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
|
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
|
||||||
HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str())
|
Url::parse(url.as_ref())
|
||||||
|
.and_then(|u| Ok(u.scheme().len() > 0))
|
||||||
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {
|
pub fn is_data_url<T: AsRef<str>>(url: T) -> bool {
|
||||||
Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data"))
|
Url::parse(url.as_ref())
|
||||||
|
.and_then(|u| Ok(u.scheme() == "data"))
|
||||||
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_http_url<T: AsRef<str>>(path: T) -> bool {
|
pub fn is_http_url<T: AsRef<str>>(url: T) -> bool {
|
||||||
REGEX_URL.is_match(path.as_ref())
|
Url::parse(url.as_ref())
|
||||||
|
.and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https"))
|
||||||
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
|
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
|
||||||
|
@ -205,3 +209,33 @@ pub fn clean_url<T: AsRef<str>>(url: T) -> String {
|
||||||
}
|
}
|
||||||
result.to_string()
|
result.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
|
||||||
|
let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap());
|
||||||
|
let mut data: String = parsed_url.path().to_string();
|
||||||
|
|
||||||
|
if data.to_lowercase().starts_with("text/html") {
|
||||||
|
data = data.chars().skip(9).collect();
|
||||||
|
|
||||||
|
if data.starts_with(";") {
|
||||||
|
// Encoding specified, find out which one
|
||||||
|
data = data.chars().skip(1).collect();
|
||||||
|
|
||||||
|
if data.to_lowercase().starts_with("base64,") {
|
||||||
|
data = data.chars().skip(7).collect();
|
||||||
|
String::from_utf8(decode(&data).unwrap_or(vec![])).unwrap_or(str!())
|
||||||
|
} else if data.to_lowercase().starts_with("utf8,") {
|
||||||
|
data.chars().skip(5).collect()
|
||||||
|
} else {
|
||||||
|
str!()
|
||||||
|
}
|
||||||
|
} else if data.starts_with(",") {
|
||||||
|
// Plaintext, no encoding specified
|
||||||
|
data.chars().skip(1).collect()
|
||||||
|
} else {
|
||||||
|
str!()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
str!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue