add support for data URL targets

This commit is contained in:
Sunshine 2020-02-13 00:56:30 -05:00
parent b8b6d8cff6
commit 3d2d40e7cd
No known key found for this signature in database
GPG key ID: B80CA68703CD8AB1
4 changed files with 110 additions and 34 deletions

View file

@ -13,7 +13,7 @@ pub fn retrieve_asset(
) -> Result<(String, String), reqwest::Error> { ) -> Result<(String, String), reqwest::Error> {
let cache_key = clean_url(&url); let cache_key = clean_url(&url);
if is_data_url(&url).unwrap() { if is_data_url(&url) {
Ok((url.to_string(), url.to_string())) Ok((url.to_string(), url.to_string()))
} else { } else {
if cache.contains_key(&cache_key) { if cache.contains_key(&cache_key) {

View file

@ -7,7 +7,7 @@ mod macros;
use crate::args::AppArgs; use crate::args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset; use monolith::http::retrieve_asset;
use monolith::utils::is_http_url; use monolith::utils::{data_url_to_text, is_data_url, is_http_url};
use reqwest::blocking::Client; use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap; use std::collections::HashMap;
@ -46,11 +46,14 @@ impl Output {
fn main() { fn main() {
let app_args = AppArgs::get(); let app_args = AppArgs::get();
let target_url: &str = app_args.url_target.as_str();
let base_url;
let dom;
if !is_http_url(app_args.url_target.as_str()) { if !is_http_url(target_url) && !is_data_url(target_url) {
eprintln!( eprintln!(
"Only HTTP and HTTPS URLs are allowed but got: {}", "Only HTTP(S) or data URLs are allowed but got: {}",
&app_args.url_target &target_url
); );
process::exit(1); process::exit(1);
} }
@ -78,21 +81,23 @@ fn main() {
.expect("Failed to initialize HTTP client"); .expect("Failed to initialize HTTP client");
// Retrieve root document // Retrieve root document
let (data, final_url) = retrieve_asset( if is_http_url(target_url) {
&mut cache, let (data, final_url) =
&client, retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent)
app_args.url_target.as_str(),
false,
"",
app_args.silent,
)
.expect("Could not retrieve assets in HTML"); .expect("Could not retrieve assets in HTML");
let dom = html_to_dom(&data); dom = html_to_dom(&data);
base_url = final_url;
} else if is_data_url(target_url) {
base_url = target_url.to_string();
dom = html_to_dom(&data_url_to_text(target_url));
} else {
process::exit(1);
}
walk_and_embed_assets( walk_and_embed_assets(
&mut cache, &mut cache,
&client, &client,
&final_url, &base_url,
&dom.document, &dom.document,
app_args.no_css, app_args.no_css,
app_args.no_js, app_args.no_js,

View file

@ -1,6 +1,6 @@
use crate::utils::{ use crate::utils::{
clean_url, data_to_data_url, detect_mimetype, is_data_url, is_http_url, resolve_url, clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url,
url_has_protocol, resolve_url, url_has_protocol,
}; };
use url::ParseError; use url::ParseError;
@ -144,20 +144,35 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.w3schools.com/html/default.asp" "https://www.w3schools.com/html/default.asp"
); );
let resolved_url = resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"https://www.kernel.org/category/signatures.html",
)?;
assert_eq!(
resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html"
);
let resolved_url = resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"//www.w3schools.com/html/html_iframe.asp",
)
.unwrap_or(str!());
assert_eq!(resolved_url.as_str(), "");
Ok(()) Ok(())
} }
/// Checks that `is_data_url` accepts a base64 HTML data URL and rejects
/// HTTP(S), protocol-relative and empty inputs.
#[test]
fn test_is_data_url() {
    // passing
    let data_url = "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h";
    assert!(is_data_url(data_url));

    // failing
    let rejected = ["https://kernel.org", "//kernel.org", ""];
    for candidate in rejected.iter() {
        assert!(!is_data_url(*candidate));
    }
}
#[test] #[test]
@ -175,3 +190,25 @@ fn test_clean_url() {
"https://somewhere.com/font.eot" "https://somewhere.com/font.eot"
); );
} }
/// Checks that `data_url_to_text` yields the same text whether the payload is
/// base64-encoded, tagged `utf8`, or given as plain text with no encoding.
#[test]
fn test_data_url_to_text() {
    let expected = "Work expands so as to fill the time available for its completion";

    let inputs = [
        // base64-encoded payload
        "data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==",
        // explicit utf8 marker
        "data:text/html;utf8,Work expands so as to fill the time available for its completion",
        // no encoding specified
        "data:text/html,Work expands so as to fill the time available for its completion",
    ];

    for input in inputs.iter() {
        assert_eq!(data_url_to_text(*input), expected);
    }
}

View file

@ -1,5 +1,5 @@
use crate::http::retrieve_asset; use crate::http::retrieve_asset;
use base64::encode; use base64::{decode, encode};
use regex::Regex; use regex::Regex;
use reqwest::blocking::Client; use reqwest::blocking::Client;
use std::collections::HashMap; use std::collections::HashMap;
@ -37,8 +37,6 @@ use url::{ParseError, Url};
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###; const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
lazy_static! { lazy_static! {
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap(); static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
} }
@ -82,19 +80,25 @@ pub fn detect_mimetype(data: &[u8]) -> String {
return String::from_utf8(item[1].to_vec()).unwrap(); return String::from_utf8(item[1].to_vec()).unwrap();
} }
} }
"".to_owned() str!()
} }
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool { pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str()) Url::parse(url.as_ref())
.and_then(|u| Ok(u.scheme().len() > 0))
.unwrap_or(false)
} }
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> { pub fn is_data_url<T: AsRef<str>>(url: T) -> bool {
Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data")) Url::parse(url.as_ref())
.and_then(|u| Ok(u.scheme() == "data"))
.unwrap_or(false)
} }
pub fn is_http_url<T: AsRef<str>>(path: T) -> bool { pub fn is_http_url<T: AsRef<str>>(url: T) -> bool {
REGEX_URL.is_match(path.as_ref()) Url::parse(url.as_ref())
.and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https"))
.unwrap_or(false)
} }
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> { pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
@ -205,3 +209,33 @@ pub fn clean_url<T: AsRef<str>>(url: T) -> String {
} }
result.to_string() result.to_string()
} }
/// Extracts the HTML text carried by a `data:` URL.
///
/// Accepted shape: `data:text/html[;param]...[;base64],<payload>`. The media type
/// and the `base64` marker are compared ASCII case-insensitively. Other parameters
/// (for example `utf8` or `charset=utf-8`) are accepted and ignored.
///
/// When the last parameter is `base64`, the payload is base64-decoded and must be
/// valid UTF-8. Otherwise the payload is returned as-is.
/// NOTE(review): percent-escapes in a plain-text payload are not decoded here.
///
/// Returns an empty string when:
/// - `url` cannot be parsed or its scheme is not `data`;
/// - the media type is not `text/html`;
/// - there is no `,` separating the header from the payload;
/// - the base64 payload fails to decode or is not valid UTF-8.
pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
    let parsed_url = match Url::parse(url.as_ref()) {
        Ok(parsed) if parsed.scheme() == "data" => parsed,
        _ => return str!(),
    };

    // `Url` files everything after the first `?` under the query component, but in
    // a data URL that text still belongs to the payload, so stitch it back on.
    let mut content = parsed_url.path().to_string();
    if let Some(query) = parsed_url.query() {
        content.push('?');
        content.push_str(query);
    }

    // The header (media type plus parameters) ends at the first comma.
    let comma = match content.find(',') {
        Some(index) => index,
        None => return str!(),
    };
    let header = &content[..comma];
    let payload = &content[comma + 1..];

    let mut params = header.split(';');
    let media_type = params.next().unwrap_or("");
    if !media_type.eq_ignore_ascii_case("text/html") {
        return str!();
    }

    // Only a trailing `base64` parameter marks the payload as base64-encoded.
    let is_base64 = params
        .last()
        .map_or(false, |param| param.eq_ignore_ascii_case("base64"));

    if is_base64 {
        decode(payload)
            .ok()
            .and_then(|bytes| String::from_utf8(bytes).ok())
            .unwrap_or_else(|| str!())
    } else {
        payload.to_string()
    }
}