From 5443c0cc3ff0b3a5d9d9d761e13ec90e086e6ca4 Mon Sep 17 00:00:00 2001 From: knidarkness Date: Sat, 12 Oct 2019 12:32:59 +0300 Subject: [PATCH 01/23] Added loading of the links given as url(...) in css files --- src/html.rs | 16 +++++++++++++--- src/http.rs | 4 ++-- src/utils.rs | 30 ++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/html.rs b/src/html.rs index fa10175..1c0f325 100644 --- a/src/html.rs +++ b/src/html.rs @@ -8,7 +8,7 @@ use html5ever::{local_name, namespace_url, ns}; use http::retrieve_asset; use js::attr_is_event_handler; use std::default::Default; -use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol}; +use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol, resolve_css_imports}; lazy_static! { static ref EMPTY_STRING: String = String::new(); @@ -127,7 +127,7 @@ pub fn walk_and_embed_assets( .unwrap_or(EMPTY_STRING.clone()); let (css_dataurl, _) = retrieve_asset( &href_full_url, - true, + false, "text/css", opt_user_agent, opt_silent, @@ -135,7 +135,17 @@ pub fn walk_and_embed_assets( ) .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); attr.value.clear(); - attr.value.push_slice(css_dataurl.as_str()); + + let css_resolved = resolve_css_imports( + &css_dataurl, + &href_full_url, + opt_user_agent, + opt_silent, + opt_insecure, + ) + .unwrap_or(css_dataurl); + + attr.value.push_slice(css_resolved.as_str()); } } } diff --git a/src/http.rs b/src/http.rs index 19bf902..a5a9475 100644 --- a/src/http.rs +++ b/src/http.rs @@ -45,11 +45,11 @@ pub fn retrieve_asset( }; Ok(( - data_to_dataurl(&mimetype, &data), + if response.status() != 200 { "".to_string() } else { data_to_dataurl(&mimetype, &data) }, response.url().to_string(), )) } else { - Ok((response.text().unwrap(), response.url().to_string())) + Ok((if response.status() != 200 { "".to_string() } else { response.text().unwrap() }, response.url().to_string())) } } } diff --git a/src/utils.rs b/src/utils.rs index 61a5d83..f93800c 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -2,11 +2,13 @@ extern crate base64; use self::base64::encode; use regex::Regex; +use http::retrieve_asset; use url::{ParseError, Url}; lazy_static! { static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); + static ref EMPTY_STRING: String = String::new(); } static MAGIC: [[&[u8]; 2]; 19] = [ @@ -80,3 +82,31 @@ pub fn resolve_url(from: &str, to: &str) -> Result { Ok(result) } + +pub fn resolve_css_imports(css_string: &str, href: &str, opt_user_agent: &str, opt_silent: bool, opt_insecure: bool) -> Result { + let mut resolved_css = String::from(css_string); + let re = Regex::new(r###"url\((?:(?:https?|ftp)://)?"?[\w/\-?=%.]+\.[\w/\-?=%.]+"?\)"###).unwrap(); + + for link in re.captures_iter(&css_string) { + let target_link = if link[0].chars().nth(4) == Some('"') { &link[0][5..link[0].len()-2] } else {&link[0][4..link[0].len()-1]}; + let embedded_url = String::from([href, "/../", &target_link.to_string()].concat()); + + let (css_dataurl, _) = retrieve_asset( + &embedded_url, + true, // true + "", + opt_user_agent, + opt_silent, + opt_insecure, + ) + .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); + + let replacement = &["\"", &css_dataurl.replace("\"",&["\\", "\""].concat()).to_string(), "\""].concat(); + let t = resolved_css.replace(&link[0][4..link[0].len() - 1], &replacement).to_string(); + resolved_css = t.clone(); + } + + let encoded_css = data_to_dataurl("text/css", resolved_css.as_bytes()); + + Ok(encoded_css.to_string()) +} \ No newline at end of file From 550e4cc83f80dd54964f1cb66e80af99cfd98c06 Mon Sep 17 00:00:00 2001 From: knidarkness Date: Sat, 12 Oct 2019 14:05:07 +0300 Subject: [PATCH 02/23] Fixed formatting --- src/html.rs | 2 +- src/http.rs | 15 +++++++++++++-- src/utils.rs | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/html.rs b/src/html.rs index 1c0f325..9373a69 100644 --- a/src/html.rs +++ b/src/html.rs @@ -8,7 +8,7 @@ use html5ever::{local_name, namespace_url, ns}; use http::retrieve_asset; use js::attr_is_event_handler; use std::default::Default; -use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol, resolve_css_imports}; +use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol}; lazy_static! { static ref EMPTY_STRING: String = String::new(); diff --git a/src/http.rs b/src/http.rs index a5a9475..7fa60f0 100644 --- a/src/http.rs +++ b/src/http.rs @@ -45,11 +45,22 @@ pub fn retrieve_asset( }; Ok(( - if response.status() != 200 { "".to_string() } else { data_to_dataurl(&mimetype, &data) }, + if response.status() != 200 { + "".to_string() + } else { + data_to_dataurl(&mimetype, &data) + }, response.url().to_string(), )) } else { - Ok((if response.status() != 200 { "".to_string() } else { response.text().unwrap() }, response.url().to_string())) + Ok(( + if response.status() != 200 { + "".to_string() + } else { + response.text().unwrap() + }, + response.url().to_string(), + )) } } } diff --git a/src/utils.rs b/src/utils.rs index f93800c..069e095 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,8 +1,8 @@ extern crate base64; use self::base64::encode; -use regex::Regex; use http::retrieve_asset; +use regex::Regex; use url::{ParseError, Url}; lazy_static! { @@ -83,14 +83,25 @@ pub fn resolve_url(from: &str, to: &str) -> Result { Ok(result) } -pub fn resolve_css_imports(css_string: &str, href: &str, opt_user_agent: &str, opt_silent: bool, opt_insecure: bool) -> Result { +pub fn resolve_css_imports( + css_string: &str, + href: &str, + opt_user_agent: &str, + opt_silent: bool, + opt_insecure: bool, +) -> Result { let mut resolved_css = String::from(css_string); - let re = Regex::new(r###"url\((?:(?:https?|ftp)://)?"?[\w/\-?=%.]+\.[\w/\-?=%.]+"?\)"###).unwrap(); - + let re = + Regex::new(r###"url\((?:(?:https?|ftp)://)?"?[\w/\-?=%.]+\.[\w/\-?=%.]+"?\)"###).unwrap(); + for link in re.captures_iter(&css_string) { - let target_link = if link[0].chars().nth(4) == Some('"') { &link[0][5..link[0].len()-2] } else {&link[0][4..link[0].len()-1]}; + let target_link = if link[0].chars().nth(4) == Some('"') { + &link[0][5..link[0].len() - 2] + } else { + &link[0][4..link[0].len() - 1] + }; let embedded_url = String::from([href, "/../", &target_link.to_string()].concat()); - + let (css_dataurl, _) = retrieve_asset( &embedded_url, true, // true @@ -100,13 +111,22 @@ pub fn resolve_css_imports(css_string: &str, href: &str, opt_user_agent: &str, o opt_insecure, ) .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); - - let replacement = &["\"", &css_dataurl.replace("\"",&["\\", "\""].concat()).to_string(), "\""].concat(); - let t = resolved_css.replace(&link[0][4..link[0].len() - 1], &replacement).to_string(); + + let replacement = &[ + "\"", + &css_dataurl + .replace("\"", &["\\", "\""].concat()) + .to_string(), + "\"", + ] + .concat(); + let t = resolved_css + .replace(&link[0][4..link[0].len() - 1], &replacement) + .to_string(); resolved_css = t.clone(); } let encoded_css = data_to_dataurl("text/css", resolved_css.as_bytes()); Ok(encoded_css.to_string()) -} \ No newline at end of file +} From a2bf7e334568f92f4e203d01eb79f000068f4581 Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Thu, 5 Dec 2019 17:42:07 -0500 Subject: [PATCH 03/23] Fixed some errors detecting, parsing, and transforming urls in `resolve_css_imports` --- src/utils.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index 069e095..58de125 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -91,16 +91,16 @@ pub fn resolve_css_imports( opt_insecure: bool, ) -> Result { let mut resolved_css = String::from(css_string); - let re = - Regex::new(r###"url\((?:(?:https?|ftp)://)?"?[\w/\-?=%.]+\.[\w/\-?=%.]+"?\)"###).unwrap(); + let re = Regex::new(r###"url\("?([^"]+)"?\)"###).unwrap(); for link in re.captures_iter(&css_string) { - let target_link = if link[0].chars().nth(4) == Some('"') { - &link[0][5..link[0].len() - 2] - } else { - &link[0][4..link[0].len() - 1] + let target_link = dbg!(link.get(1).unwrap().as_str()); + + // Generate absolute URL for content + let embedded_url = match resolve_url(href, target_link) { + Ok(url) => url, + Err(_) => continue, // Malformed URL }; - let embedded_url = String::from([href, "/../", &target_link.to_string()].concat()); let (css_dataurl, _) = retrieve_asset( &embedded_url, From 11bbfc0851d5b42a5f90d1cc71ea8499e4b59ab2 Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Thu, 5 Dec 2019 18:15:06 -0500 Subject: [PATCH 04/23] Added support for recursively nested css @imports --- src/html.rs | 3 +-- src/utils.rs | 60 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/html.rs b/src/html.rs index 9373a69..d798f89 100644 --- a/src/html.rs +++ b/src/html.rs @@ -142,8 +142,7 @@ pub fn walk_and_embed_assets( opt_user_agent, opt_silent, opt_insecure, - ) - .unwrap_or(css_dataurl); + ); attr.value.push_slice(css_resolved.as_str()); } diff --git a/src/utils.rs b/src/utils.rs index 58de125..c3ea1e5 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -89,12 +89,12 @@ pub fn resolve_css_imports( opt_user_agent: &str, opt_silent: bool, opt_insecure: bool, -) -> Result { +) -> String { let mut resolved_css = String::from(css_string); - let re = Regex::new(r###"url\("?([^"]+)"?\)"###).unwrap(); + let re = Regex::new(r###"(?P@import )?url\((?P"?(?P[^"]+)"?)\)"###).unwrap(); for link in re.captures_iter(&css_string) { - let target_link = dbg!(link.get(1).unwrap().as_str()); + let target_link = link.name("url").unwrap().as_str(); // Generate absolute URL for content let embedded_url = match resolve_url(href, target_link) { @@ -102,31 +102,47 @@ pub fn resolve_css_imports( Err(_) => continue, // Malformed URL }; - let (css_dataurl, _) = retrieve_asset( - &embedded_url, - true, // true - "", - opt_user_agent, - opt_silent, - opt_insecure, - ) - .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); + // Download the asset. If it's more CSS, resolve that too + let content = match link.name("import") { + + // The link is an @import link + Some(_) => retrieve_asset( + &embedded_url, + false, // Formating as data URL will be done later + "text/css", // Expect CSS + opt_user_agent, + opt_silent, + opt_insecure, + ) + .map(|(content, _)| resolve_css_imports( + &content, + &embedded_url, + opt_user_agent, + opt_silent, + opt_insecure, + )), + + // The link is some other, non-@import link + None => retrieve_asset( + &embedded_url, + true, // Format as data URL + "", // Unknown MIME type + opt_user_agent, + opt_silent, + opt_insecure, + ).map(|(a, _)| a), + + }.unwrap_or_else(|_| EMPTY_STRING.clone()); + + let replacement = format!("\"{}\"", &content); - let replacement = &[ - "\"", - &css_dataurl - .replace("\"", &["\\", "\""].concat()) - .to_string(), - "\"", - ] - .concat(); let t = resolved_css - .replace(&link[0][4..link[0].len() - 1], &replacement) + .replace(link.name("to_repl").unwrap().as_str(), &replacement) .to_string(); resolved_css = t.clone(); } let encoded_css = data_to_dataurl("text/css", resolved_css.as_bytes()); - Ok(encoded_css.to_string()) + encoded_css.to_string() } From ef7ddcd4347231b26226553373b87294950751c6 Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Thu, 5 Dec 2019 18:37:37 -0500 Subject: [PATCH 05/23] Added fallback to absolute URL on failure to resolve CSS stylesheet @imports --- src/utils.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index c3ea1e5..19a146e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,6 +4,7 @@ use self::base64::encode; use http::retrieve_asset; use regex::Regex; use url::{ParseError, Url}; +use std::io::{stderr, Write}; lazy_static! { static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); @@ -132,7 +133,16 @@ pub fn resolve_css_imports( opt_insecure, ).map(|(a, _)| a), - }.unwrap_or_else(|_| EMPTY_STRING.clone()); + }.unwrap_or_else(|e| { + writeln!( + stderr(), + "Warning: {}", + e, + ).unwrap(); + + //If failed to resolve, replace with absolute URL + embedded_url + }); let replacement = format!("\"{}\"", &content); From ebbf755e096da53c7bd2767fba7e70e7aacffbec Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Thu, 5 Dec 2019 19:02:11 -0500 Subject: [PATCH 06/23] Fixed misleading variable name --- src/html.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/html.rs b/src/html.rs index 093b90e..c1dae22 100644 --- a/src/html.rs +++ b/src/html.rs @@ -129,7 +129,7 @@ pub fn walk_and_embed_assets( let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); - let (css_dataurl, _) = retrieve_asset( + let (css, _) = retrieve_asset( cache, &href_full_url, false, @@ -143,7 +143,7 @@ pub fn walk_and_embed_assets( let css_resolved = resolve_css_imports( cache, - &css_dataurl, + &css, &href_full_url, opt_user_agent, opt_silent, From 1de0fc0961559fcc1eb317da5b44d82a8b839d9d Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Thu, 5 Dec 2019 19:10:47 -0500 Subject: [PATCH 07/23] Add warning and fallback when parsing a rel=stylesheet link --- src/html.rs | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/html.rs b/src/html.rs index c1dae22..ceb5cb5 100644 --- a/src/html.rs +++ b/src/html.rs @@ -7,6 +7,7 @@ use html5ever::tree_builder::{Attribute, TreeSink}; use html5ever::{local_name, namespace_url, ns}; use http::retrieve_asset; use js::attr_is_event_handler; +use std::io::{stderr, Write}; use std::collections::HashMap; use std::default::Default; use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol}; @@ -129,7 +130,7 @@ pub fn walk_and_embed_assets( let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); - let (css, _) = retrieve_asset( + let replacement_text = match retrieve_asset( cache, &href_full_url, false, @@ -137,20 +138,33 @@ pub fn walk_and_embed_assets( opt_user_agent, opt_silent, opt_insecure, - ) - .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); + ) { + + // On successful retrieval, traverse CSS + Ok((css_data, _)) => resolve_css_imports( + cache, + &css_data, + &href_full_url, + opt_user_agent, + opt_silent, + opt_insecure, + ), + + // If a network error occured, warn + Err(e) => { + writeln!( + stderr(), + "Warning: {}", + e, + ).unwrap(); + + //If failed to resolve, replace with absolute URL + href_full_url + }, + }; + attr.value.clear(); - - let css_resolved = resolve_css_imports( - cache, - &css, - &href_full_url, - opt_user_agent, - opt_silent, - opt_insecure, - ); - - attr.value.push_slice(css_resolved.as_str()); + attr.value.push_slice(&replacement_text); } } } From d574e9a5dac858d611743b8aba2f139bba4771ec Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Thu, 5 Dec 2019 20:05:52 -0500 Subject: [PATCH 08/23] Added support for