From 4bc8043f0fab2d65b00378637b15d9b6766a56a1 Mon Sep 17 00:00:00 2001
From: Sunshine <sunshine@uberspace.net>
Date: Tue, 8 Jun 2021 12:54:16 -1000
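Subject: [PATCH] account for charset when creating data URLs

create_data_url() now takes the asset's charset and appends
";charset=<value>" after the media type, unless the charset is empty
or US-ASCII (compared case-insensitively after trimming whitespace).

Note: detect_media_type_by_file_name() now returns an empty string
instead of "application/octet-stream" when it cannot map the file
name to a known media type; the affected tests are updated to match.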
---
 src/css.rs                           | 15 +++---
 src/html.rs                          | 24 ++++-----
 src/main.rs                          |  6 +--
 src/tests/url/create_data_url.rs     | 73 +++++++++++++++++++++++++++-
 src/tests/utils/detect_media_type.rs |  2 +-
 src/tests/utils/retrieve_asset.rs    |  4 +-
 src/url.rs                           | 12 ++++-
 src/utils.rs                         |  4 +-
 8 files changed, 111 insertions(+), 29 deletions(-)

diff --git a/src/css.rs b/src/css.rs
index f19ac0f..40f5810 100644
--- a/src/css.rs
+++ b/src/css.rs
@@ -202,10 +202,11 @@ pub fn process_css<'a>(
                             import_contents,
                             import_final_url,
                             import_media_type,
-                            _import_charset,
+                            import_charset,
                         )) => {
                             let mut import_data_url = create_data_url(
                                 &import_media_type,
+                                &import_charset,
                                 embed_css(
                                     cache,
                                     client,
@@ -252,9 +253,9 @@ pub fn process_css<'a>(
                                 options,
                                 depth + 1,
                             ) {
-                                Ok((data, final_url, media_type, _charset)) => {
+                                Ok((data, final_url, media_type, charset)) => {
                                     let mut data_url =
-                                        create_data_url(&media_type, &data, &final_url);
+                                        create_data_url(&media_type, &charset, &data, &final_url);
                                     data_url.set_fragment(resolved_url.fragment());
                                     result.push_str(
                                         format_quoted_string(&data_url.to_string()).as_str(),
@@ -348,9 +349,10 @@ pub fn process_css<'a>(
                         options,
                         depth + 1,
                     ) {
-                        Ok((css, final_url, media_type, _charset)) => {
+                        Ok((css, final_url, media_type, charset)) => {
                             let mut data_url = create_data_url(
                                 &media_type,
+                                &charset,
                                 embed_css(
                                     cache,
                                     client,
@@ -386,8 +388,9 @@ pub fn process_css<'a>(
                             options,
                             depth + 1,
                         ) {
-                            Ok((data, final_url, media_type, _charset)) => {
-                                let mut data_url = create_data_url(&media_type, &data, &final_url);
+                            Ok((data, final_url, media_type, charset)) => {
+                                let mut data_url =
+                                    create_data_url(&media_type, &charset, &data, &final_url);
                                 data_url.set_fragment(full_url.fragment());
                                 result
                                     .push_str(format_quoted_string(&data_url.to_string()).as_str());
diff --git a/src/html.rs b/src/html.rs
index d9a64f9..25abcf3 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -188,10 +188,13 @@ pub fn embed_srcset(
                 options,
                 depth + 1,
             ) {
-                Ok((image_data, image_final_url, image_media_type, _image_charset)) => {
-                    // TODO: use image_charset
-                    let mut image_data_url =
-                        create_data_url(&image_media_type, &image_data, &image_final_url);
+                Ok((image_data, image_final_url, image_media_type, image_charset)) => {
+                    let mut image_data_url = create_data_url(
+                        &image_media_type,
+                        &image_charset,
+                        &image_data,
+                        &image_final_url,
+                    );
                     // Append retreved asset as a data URL
                     image_data_url.set_fragment(image_full_url.fragment());
                     result.push_str(image_data_url.as_ref());
@@ -637,12 +640,12 @@ pub fn retrieve_and_embed_asset(
                     let css: String = embed_css(cache, client, &final_url, &s, options, depth + 1);
 
                     // Create and embed data URL
-                    // TODO: use charset
-                    let css_data_url = create_data_url(&media_type, css.as_bytes(), &final_url);
+                    let css_data_url =
+                        create_data_url(&media_type, &charset, css.as_bytes(), &final_url);
                     set_node_attr(&node, attr_name, Some(css_data_url.to_string()));
                 } else if node_name == "frame" || node_name == "iframe" {
                     // (I)FRAMEs are also quite different from conventional resources
-                    let frame_dom = html_to_dom(&data, charset);
+                    let frame_dom = html_to_dom(&data, charset.clone());
                     walk_and_embed_assets(
                         cache,
                         client,
@@ -661,8 +664,8 @@ pub fn retrieve_and_embed_asset(
                     .unwrap();
 
                     // Create and embed data URL
-                    // TODO: use charset
-                    let mut frame_data_url = create_data_url(&media_type, &frame_data, &final_url);
+                    let mut frame_data_url =
+                        create_data_url(&media_type, &charset, &frame_data, &final_url);
                     frame_data_url.set_fragment(resolved_url.fragment());
                     set_node_attr(node, attr_name, Some(frame_data_url.to_string()));
                 } else {
@@ -681,8 +684,7 @@ pub fn retrieve_and_embed_asset(
                     }
 
                     // Create and embed data URL
-                    // TODO: use charset
-                    let mut data_url = create_data_url(&media_type, &data, &final_url);
+                    let mut data_url = create_data_url(&media_type, &charset, &data, &final_url);
                     data_url.set_fragment(resolved_url.fragment());
                     set_node_attr(node, attr_name, Some(data_url.to_string()));
                 }
diff --git a/src/main.rs b/src/main.rs
index 6b5e8e4..7798759 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -299,9 +299,9 @@ fn main() {
             &options,
             0,
         ) {
-            Ok((data, final_url, media_type, _charset)) => {
-                // TODO: use charset
-                let favicon_data_url: Url = create_data_url(&media_type, &data, &final_url);
+            Ok((data, final_url, media_type, charset)) => {
+                let favicon_data_url: Url =
+                    create_data_url(&media_type, &charset, &data, &final_url);
                 dom = add_favicon(&dom.document, favicon_data_url.to_string());
             }
             Err(_) => {
diff --git a/src/tests/url/create_data_url.rs b/src/tests/url/create_data_url.rs
index 873dbda..f6c3cd5 100644
--- a/src/tests/url/create_data_url.rs
+++ b/src/tests/url/create_data_url.rs
@@ -13,9 +13,14 @@ mod passing {
 
     #[test]
     fn encode_string_with_specific_media_type() {
-        let mime = "application/javascript";
+        let media_type = "application/javascript";
         let data = "var word = 'hello';\nalert(word);\n";
-        let data_url = url::create_data_url(mime, data.as_bytes(), &Url::parse("data:,").unwrap());
+        let data_url = url::create_data_url(
+            media_type,
+            "",
+            data.as_bytes(),
+            &Url::parse("data:,").unwrap(),
+        );
 
         assert_eq!(
             data_url.as_str(),
@@ -28,6 +33,7 @@ mod passing {
         let data = "<svg></svg>\n";
         let data_url = url::create_data_url(
             "image/svg+xml",
+            "",
             data.as_bytes(),
             &Url::parse("data:,").unwrap(),
         );
@@ -37,4 +43,67 @@ mod passing {
             ""
         );
     }
+
+    #[test]
+    fn encode_string_with_specific_media_type_and_charset() {
+        let media_type = "application/javascript";
+        let charset = "utf8";
+        let data = "var word = 'hello';\nalert(word);\n";
+        let data_url = url::create_data_url(
+            media_type,
+            charset,
+            data.as_bytes(),
+            &Url::parse("data:,").unwrap(),
+        );
+
+        assert_eq!(
+            data_url.as_str(),
+            "data:application/javascript;charset=utf8;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
+        );
+    }
+
+    #[test]
+    fn create_data_url_with_us_ascii_charset() {
+        let media_type = "";
+        let charset = "us-ascii";
+        let data = "";
+        let data_url = url::create_data_url(
+            media_type,
+            charset,
+            data.as_bytes(),
+            &Url::parse("data:,").unwrap(),
+        );
+
+        assert_eq!(data_url.as_str(), "data:;base64,");
+    }
+
+    #[test]
+    fn create_data_url_with_utf8_charset() {
+        let media_type = "";
+        let charset = "utf8";
+        let data = "";
+        let data_url = url::create_data_url(
+            media_type,
+            charset,
+            data.as_bytes(),
+            &Url::parse("data:,").unwrap(),
+        );
+
+        assert_eq!(data_url.as_str(), "data:;charset=utf8;base64,");
+    }
+
+    #[test]
+    fn create_data_url_with_media_type_text_plain_and_utf8_charset() {
+        let media_type = "text/plain";
+        let charset = "utf8";
+        let data = "";
+        let data_url = url::create_data_url(
+            media_type,
+            charset,
+            data.as_bytes(),
+            &Url::parse("data:,").unwrap(),
+        );
+
+        assert_eq!(data_url.as_str(), "data:text/plain;charset=utf8;base64,");
+    }
 }
diff --git a/src/tests/utils/detect_media_type.rs b/src/tests/utils/detect_media_type.rs
index 707fc44..970af13 100644
--- a/src/tests/utils/detect_media_type.rs
+++ b/src/tests/utils/detect_media_type.rs
@@ -195,7 +195,7 @@ mod failing {
         let dummy_url: Url = Url::parse("data:,").unwrap();
         assert_eq!(
             utils::detect_media_type(b"abcdef0123456789", &dummy_url),
-            "application/octet-stream"
+            ""
         );
     }
 }
diff --git a/src/tests/utils/retrieve_asset.rs b/src/tests/utils/retrieve_asset.rs
index eee881d..1d12559 100644
--- a/src/tests/utils/retrieve_asset.rs
+++ b/src/tests/utils/retrieve_asset.rs
@@ -38,7 +38,7 @@ mod passing {
         assert_eq!(&media_type, "text/html");
         assert_eq!(&charset, "US-ASCII");
         assert_eq!(
-            url::create_data_url(&media_type, &data, &final_url),
+            url::create_data_url(&media_type, &charset, &data, &final_url),
             Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(),
         );
         assert_eq!(
@@ -80,7 +80,7 @@ mod passing {
         .unwrap();
         assert_eq!(&media_type, "application/javascript");
         assert_eq!(&charset, "");
-        assert_eq!(url::create_data_url(&media_type, &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap());
+        assert_eq!(url::create_data_url(&media_type, &charset, &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap());
         assert_eq!(
             final_url,
             Url::parse(&format!(
diff --git a/src/url.rs b/src/url.rs
index eea0bb3..8f0221b 100644
--- a/src/url.rs
+++ b/src/url.rs
@@ -12,7 +12,8 @@ pub fn clean_url(url: Url) -> Url {
     url
 }
 
-pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) -> Url {
+pub fn create_data_url(media_type: &str, charset: &str, data: &[u8], final_asset_url: &Url) -> Url {
+    // TODO: move this media type detection fallback out of this function
     let media_type: String = if media_type.is_empty() {
         detect_media_type(data, &final_asset_url)
     } else {
@@ -21,7 +22,14 @@ pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) ->
 
     let mut data_url: Url = Url::parse("data:,").unwrap();
 
-    data_url.set_path(format!("{};base64,{}", media_type, base64::encode(data)).as_str());
+    let c: String =
+        if !charset.trim().is_empty() && !charset.trim().eq_ignore_ascii_case("US-ASCII") {
+            format!(";charset={}", charset.trim())
+        } else {
+            str!()
+        };
+
+    data_url.set_path(format!("{}{};base64,{}", media_type, c, base64::encode(data)).as_str());
 
     data_url
 }
diff --git a/src/utils.rs b/src/utils.rs
index 012b419..1bd2b65 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -84,9 +84,9 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
             "woff" => "font/woff",
             "woff2" => "font/woff2",
             "xml" => "text/xml",
-            &_ => "application/octet-stream",
+            &_ => "",
         },
-        None => "application/octet-stream",
+        None => "",
     };
 
     mime.to_string()