Merge pull request #159 from snshn/implement-data-url-media-type-detection

Improve data URL media type detection
This commit is contained in:
Sunshine 2020-04-10 06:04:49 -04:00 committed by GitHub
commit 53160f01c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 68 additions and 40 deletions

View File

@ -122,9 +122,9 @@ fn main() {
base_url = final_url; base_url = final_url;
dom = html_to_dom(&data); dom = html_to_dom(&data);
} else if is_data_url(target_url) { } else if is_data_url(target_url) {
let text: String = data_url_to_text(target_url); let (media_type, text): (String, String) = data_url_to_text(target_url);
if text.len() == 0 { if !media_type.eq_ignore_ascii_case("text/html") {
eprintln!("Unsupported data URL input"); eprintln!("Unsupported data URL media type");
process::exit(1); process::exit(1);
} }
base_url = str!(target_url); base_url = str!(target_url);

View File

@ -62,7 +62,7 @@ fn passing_bad_input_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDERR should contain error description // STDERR should contain error description
assert_eq!( assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(), std::str::from_utf8(&out.stderr).unwrap(),
"Unsupported data URL input\n" "Unsupported data URL media type\n"
); );
// The exit code should be 1 // The exit code should be 1

View File

@ -9,48 +9,74 @@ use crate::utils;
#[test] #[test]
fn passing_parse_text_html_base64() { fn passing_parse_text_html_base64() {
let (media_type, text) = utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==");
assert_eq!(media_type, "text/html");
assert_eq!( assert_eq!(
utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="), text,
"Work expands so as to fill the time available for its completion" "Work expands so as to fill the time available for its completion"
); );
} }
#[test] #[test]
fn passing_parse_text_html_utf8() { fn passing_parse_text_html_utf8() {
let (media_type, text) = utils::data_url_to_text(
"data:text/html;utf8,Work expands so as to fill the time available for its completion",
);
assert_eq!(media_type, "text/html");
assert_eq!( assert_eq!(
utils::data_url_to_text( text,
"data:text/html;utf8,Work expands so as to fill the time available for its completion"
),
"Work expands so as to fill the time available for its completion" "Work expands so as to fill the time available for its completion"
); );
} }
#[test] #[test]
fn passing_parse_text_html_plaintext() { fn passing_parse_text_html_plaintext() {
let (media_type, text) = utils::data_url_to_text(
"data:text/html,Work expands so as to fill the time available for its completion",
);
assert_eq!(media_type, "text/html");
assert_eq!( assert_eq!(
utils::data_url_to_text( text,
"data:text/html,Work expands so as to fill the time available for its completion"
),
"Work expands so as to fill the time available for its completion" "Work expands so as to fill the time available for its completion"
); );
} }
#[test] #[test]
fn passing_parse_text_html_charset_utf_8_between_two_whitespaces() { fn passing_parse_text_html_charset_utf_8_between_two_whitespaces() {
let (media_type, text) = utils::data_url_to_text(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion ");
assert_eq!(media_type, "text/html");
assert_eq!( assert_eq!(
utils::data_url_to_text( text,
" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "
),
"Work expands so as to fill the time available for its completion" "Work expands so as to fill the time available for its completion"
); );
} }
#[test] #[test]
fn passing_parse_text_css_url_encoded() { fn passing_parse_text_css_url_encoded() {
assert_eq!( let (media_type, text) = utils::data_url_to_text("data:text/css,div{background-color:%23000}");
utils::data_url_to_text("data:text/css,div{background-color:%23000}"),
"div{background-color:#000}" assert_eq!(media_type, "text/css");
); assert_eq!(text, "div{background-color:#000}");
}
#[test]
fn passing_parse_no_media_type_base64() {
let (media_type, text) = utils::data_url_to_text("data:;base64,dGVzdA==");
assert_eq!(media_type, "");
assert_eq!(text, "test");
}
#[test]
fn passing_parse_no_media_type_no_encoding() {
let (media_type, text) = utils::data_url_to_text("data:;,test%20test");
assert_eq!(media_type, "");
assert_eq!(text, "test test");
} }
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ // ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
@ -62,5 +88,8 @@ fn passing_parse_text_css_url_encoded() {
#[test] #[test]
fn failing_just_word_data() { fn failing_just_word_data() {
assert_eq!(utils::data_url_to_text("data"), ""); let (media_type, text) = utils::data_url_to_text("data");
assert_eq!(media_type, "");
assert_eq!(text, "");
} }

View File

@ -133,50 +133,47 @@ pub fn clean_url<T: AsRef<str>>(url: T) -> String {
result.to_string() result.to_string()
} }
pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String { pub fn data_url_to_text<T: AsRef<str>>(url: T) -> (String, String) {
let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap()); let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap());
let path: String = parsed_url.path().to_string(); let path: String = parsed_url.path().to_string();
let comma_loc: usize = path.find(',').unwrap_or(path.len()); let comma_loc: usize = path.find(',').unwrap_or(path.len());
if comma_loc == path.len() {
return str!();
}
let meta_data: String = path.chars().take(comma_loc).collect(); let meta_data: String = path.chars().take(comma_loc).collect();
let raw_data: String = path.chars().skip(comma_loc + 1).collect(); let raw_data: String = path.chars().skip(comma_loc + 1).collect();
let data: String = decode_url(raw_data); let data: String = decode_url(raw_data);
let meta_data_items: Vec<&str> = meta_data.split(';').collect(); let meta_data_items: Vec<&str> = meta_data.split(';').collect();
let mut media_type: &str = "";
let mut encoding: &str = ""; let mut encoding: &str = "";
// Detect media type and encoding let mut media_type: String = str!();
let mut text: String = str!();
let mut i: i8 = 0; let mut i: i8 = 0;
for item in &meta_data_items { for item in &meta_data_items {
if i == 0 { if i == 0 {
if is_plaintext_media_type(item) { media_type = str!(item);
media_type = item; } else {
continue; if item.eq_ignore_ascii_case("base64")
|| item.eq_ignore_ascii_case("utf8")
|| item.eq_ignore_ascii_case("charset=UTF-8")
{
encoding = item;
} }
} }
if item.eq_ignore_ascii_case("base64") || item.eq_ignore_ascii_case("utf8") {
encoding = item;
}
i = i + 1; i = i + 1;
} }
if is_plaintext_media_type(media_type) { if is_plaintext_media_type(&media_type) || media_type.is_empty() {
if encoding.eq_ignore_ascii_case("base64") { if encoding.eq_ignore_ascii_case("base64") {
String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!()) text = String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!())
} else { } else {
data text = data
} }
} else {
str!()
} }
(media_type, text)
} }
pub fn decode_url(input: String) -> String { pub fn decode_url(input: String) -> String {
@ -238,7 +235,9 @@ pub fn retrieve_asset(
if as_data_url { if as_data_url {
Ok((url.to_string(), url.to_string())) Ok((url.to_string(), url.to_string()))
} else { } else {
Ok((data_url_to_text(url), url.to_string())) let (_media_type, text) = data_url_to_text(url);
Ok((text, url.to_string()))
} }
} else if is_file_url(&url) { } else if is_file_url(&url) {
// Check if parent_url is also file:/// // Check if parent_url is also file:///