Merge pull request #159 from snshn/implement-data-url-media-type-detection

Improve data URL media type detection
This commit is contained in:
Sunshine 2020-04-10 06:04:49 -04:00 committed by GitHub
commit 53160f01c7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 68 additions and 40 deletions

View file

@ -122,9 +122,9 @@ fn main() {
base_url = final_url;
dom = html_to_dom(&data);
} else if is_data_url(target_url) {
let text: String = data_url_to_text(target_url);
if text.len() == 0 {
eprintln!("Unsupported data URL input");
let (media_type, text): (String, String) = data_url_to_text(target_url);
if !media_type.eq_ignore_ascii_case("text/html") {
eprintln!("Unsupported data URL media type");
process::exit(1);
}
base_url = str!(target_url);

View file

@ -62,7 +62,7 @@ fn passing_bad_input_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDERR should contain error description
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
"Unsupported data URL input\n"
"Unsupported data URL media type\n"
);
// The exit code should be 1

View file

@ -9,48 +9,74 @@ use crate::utils;
#[test]
fn passing_parse_text_html_base64() {
// Parses a `text/html` data URL whose payload is flagged `base64` and checks both
// the reported media type and the decoded sentence.
let (media_type, text) = utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==");
assert_eq!(media_type, "text/html");
// NOTE(review): the direct `utils::data_url_to_text(...)` argument and the `text`
// argument below look like the old and new sides of one diff line shown without
// +/- markers — confirm against the commit which one is current.
assert_eq!(
utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="),
text,
"Work expands so as to fill the time available for its completion"
);
}
#[test]
fn passing_parse_text_html_utf8() {
// Parses a `text/html` data URL carrying a `utf8` parameter and an unencoded payload;
// expects the media type back plus the payload text unchanged.
let (media_type, text) = utils::data_url_to_text(
"data:text/html;utf8,Work expands so as to fill the time available for its completion",
);
assert_eq!(media_type, "text/html");
// NOTE(review): the nested `utils::data_url_to_text(...)` call and the `text`
// argument below look like the old and new sides of one diff line shown without
// +/- markers — confirm against the commit which one is current.
assert_eq!(
utils::data_url_to_text(
"data:text/html;utf8,Work expands so as to fill the time available for its completion"
),
text,
"Work expands so as to fill the time available for its completion"
);
}
#[test]
fn passing_parse_text_html_plaintext() {
// Parses a `text/html` data URL with no encoding parameter at all; expects the
// media type back plus the payload text unchanged.
let (media_type, text) = utils::data_url_to_text(
"data:text/html,Work expands so as to fill the time available for its completion",
);
assert_eq!(media_type, "text/html");
// NOTE(review): the nested `utils::data_url_to_text(...)` call and the `text`
// argument below look like the old and new sides of one diff line shown without
// +/- markers — confirm against the commit which one is current.
assert_eq!(
utils::data_url_to_text(
"data:text/html,Work expands so as to fill the time available for its completion"
),
text,
"Work expands so as to fill the time available for its completion"
);
}
#[test]
fn passing_parse_text_html_charset_utf_8_between_two_whitespaces() {
// The input URL has one leading and one trailing space and a `charset=utf-8`
// parameter; the expected text carries neither surrounding space.
let (media_type, text) = utils::data_url_to_text(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion ");
assert_eq!(media_type, "text/html");
// NOTE(review): the nested `utils::data_url_to_text(...)` call and the `text`
// argument below look like the old and new sides of one diff line shown without
// +/- markers — confirm against the commit which one is current.
assert_eq!(
utils::data_url_to_text(
" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "
),
text,
"Work expands so as to fill the time available for its completion"
);
}
#[test]
fn passing_parse_text_css_url_encoded() {
// Checks that a percent-encoded `%23` in a `text/css` data URL comes back as `#`.
// NOTE(review): the first `assert_eq!` compares the return value directly with a
// string, while the lines after it destructure a `(media_type, text)` tuple; these
// look like the pre- and post-change forms of the same test shown without +/-
// markers — confirm against the commit which one is current.
assert_eq!(
utils::data_url_to_text("data:text/css,div{background-color:%23000}"),
"div{background-color:#000}"
);
let (media_type, text) = utils::data_url_to_text("data:text/css,div{background-color:%23000}");
assert_eq!(media_type, "text/css");
assert_eq!(text, "div{background-color:#000}");
}
#[test]
fn passing_parse_no_media_type_base64() {
    // A data URL with an empty media type and a `base64` flag: the media type is
    // expected to come back empty and the payload to decode to "test".
    let parsed = utils::data_url_to_text("data:;base64,dGVzdA==");

    assert_eq!(parsed.0, "");
    assert_eq!(parsed.1, "test");
}
#[test]
fn passing_parse_no_media_type_no_encoding() {
    // A data URL with neither media type nor encoding: the media type is expected to
    // come back empty and the `%20` in the payload to come back as a space.
    let parsed = utils::data_url_to_text("data:;,test%20test");

    assert_eq!(parsed.0, "");
    assert_eq!(parsed.1, "test test");
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
@ -62,5 +88,8 @@ fn passing_parse_text_css_url_encoded() {
#[test]
fn failing_just_word_data() {
// The bare word "data" is not a usable data URL; the test expects empty output.
// NOTE(review): the first `assert_eq!` compares the return value directly with a
// string, while the lines after it destructure a `(media_type, text)` tuple; these
// look like the pre- and post-change forms of the same test shown without +/-
// markers — confirm against the commit which one is current.
assert_eq!(utils::data_url_to_text("data"), "");
let (media_type, text) = utils::data_url_to_text("data");
assert_eq!(media_type, "");
assert_eq!(text, "");
}

View file

@ -133,50 +133,47 @@ pub fn clean_url<T: AsRef<str>>(url: T) -> String {
result.to_string()
}
// Extracts the textual payload (and, in the tuple-returning form, the media type)
// from a data URL.
// NOTE(review): this span is a rendered diff without +/- markers. Two signatures
// appear (one returning `String`, one returning `(String, String)`), and several
// body lines exist in both an old and a new form. Confirm against the commit
// before treating any single line as the current implementation.
pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
// If the input does not parse as a URL, a fixed placeholder URL is parsed instead
// (two different placeholders appear below, one per signature).
let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap());
pub fn data_url_to_text<T: AsRef<str>>(url: T) -> (String, String) {
let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap());
let path: String = parsed_url.path().to_string();
// Position of the first comma in the path; falls back to the path length when
// there is no comma.
let comma_loc: usize = path.find(',').unwrap_or(path.len());
if comma_loc == path.len() {
return str!();
}
// Everything before the comma is metadata, everything after it is the payload.
// NOTE(review): `find` yields a byte offset while `take`/`skip` count chars; the
// two only agree when the text before the comma is ASCII — presumably the case
// for a parsed URL path, confirm.
let meta_data: String = path.chars().take(comma_loc).collect();
let raw_data: String = path.chars().skip(comma_loc + 1).collect();
// `decode_url` is defined elsewhere in this file; presumably percent-decoding — confirm.
let data: String = decode_url(raw_data);
// Metadata items are separated by ';'.
let meta_data_items: Vec<&str> = meta_data.split(';').collect();
let mut media_type: &str = "";
let mut encoding: &str = "";
// Detect media type and encoding
let mut media_type: String = str!();
let mut text: String = str!();
let mut i: i8 = 0;
for item in &meta_data_items {
// Only the first metadata item is tested as a media type.
if i == 0 {
if is_plaintext_media_type(item) {
media_type = item;
continue;
media_type = str!(item);
} else {
if item.eq_ignore_ascii_case("base64")
|| item.eq_ignore_ascii_case("utf8")
|| item.eq_ignore_ascii_case("charset=UTF-8")
{
encoding = item;
}
}
if item.eq_ignore_ascii_case("base64") || item.eq_ignore_ascii_case("utf8") {
encoding = item;
}
i = i + 1;
}
// Text is produced only when the media type passes `is_plaintext_media_type`
// (the second condition line below additionally accepts an empty media type).
if is_plaintext_media_type(media_type) {
if is_plaintext_media_type(&media_type) || media_type.is_empty() {
if encoding.eq_ignore_ascii_case("base64") {
// A failed base64 decode yields an empty byte vector, and bytes that are not
// valid UTF-8 yield an empty string, so both failures are silent.
String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!())
text = String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!())
} else {
data
text = data
}
} else {
str!()
}
(media_type, text)
}
pub fn decode_url(input: String) -> String {
@ -238,7 +235,9 @@ pub fn retrieve_asset(
if as_data_url {
Ok((url.to_string(), url.to_string()))
} else {
Ok((data_url_to_text(url), url.to_string()))
let (_media_type, text) = data_url_to_text(url);
Ok((text, url.to_string()))
}
} else if is_file_url(&url) {
// Check if parent_url is also file:///