use base64; use regex::Regex; use reqwest::blocking::Client; use reqwest::header::CONTENT_TYPE; use std::collections::HashMap; use std::fs; use std::path::Path; use url::{form_urlencoded, ParseError, Url}; /// This monster of a regex is used to match any kind of URL found in CSS. /// /// There are roughly three different categories that a found URL could fit /// into: /// - Font [found after a src: property in an @font-family rule] /// - Stylesheet [denoted by an @import before the url /// - Image [covers all other uses of the url() function] /// /// This regex aims to extract the following information: /// - What type of URL is it (font/image/css) /// - Where is the part that needs to be replaced (incl any wrapping quotes) /// - What is the URL (excl any wrapping quotes) /// /// Essentially, the regex can be broken down into two parts: /// /// `(?:(?P@import)|(?Psrc\s*:)\s+)?` /// This matches the precursor to a font or CSS URL, and fills in a match under /// either `` (if it's a CSS URL) or `` (if it's a font). /// Determining whether or not it's an image can be done by the negation of both /// of these. Either zero or one of these can match. /// /// `url\((?P['"]?(?P[^"'\)]+)['"]?)\)` /// This matches the actual URL part of the url(), and must always match. It also /// sets `` and `` which correspond to everything within /// `url(...)` and a usable URL, respectively. /// /// Note, however, that this does not perform any validation of the found URL. /// Malformed CSS could lead to an invalid URL being present. It is therefore /// recomended that the URL gets manually validated. const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P@import)|(?Psrc\s*:))\s+)?url\((?P['"]?(?P[^"'\)]+)['"]?)\)"###; lazy_static! { static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap(); } const MAGIC: [[&[u8]; 2]; 19] = [ // Image [b"GIF87a", b"image/gif"], [b"GIF89a", b"image/gif"], [b"\xFF\xD8\xFF", b"image/jpeg"], [b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"], [b" String { let media_type = if mime.is_empty() { detect_media_type(data) } else { mime.to_string() }; format!("data:{};base64,{}", media_type, base64::encode(data)) } pub fn detect_media_type(data: &[u8]) -> String { for item in MAGIC.iter() { if data.starts_with(item[0]) { return String::from_utf8(item[1].to_vec()).unwrap(); } } str!() } pub fn url_has_protocol>(url: T) -> bool { Url::parse(url.as_ref()) .and_then(|u| Ok(u.scheme().len() > 0)) .unwrap_or(false) } pub fn is_data_url>(url: T) -> bool { Url::parse(url.as_ref()) .and_then(|u| Ok(u.scheme() == "data")) .unwrap_or(false) } pub fn is_file_url>(url: T) -> bool { Url::parse(url.as_ref()) .and_then(|u| Ok(u.scheme() == "file")) .unwrap_or(false) } pub fn is_http_url>(url: T) -> bool { Url::parse(url.as_ref()) .and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https")) .unwrap_or(false) } pub fn resolve_url, U: AsRef>(from: T, to: U) -> Result { let result = if is_http_url(to.as_ref()) { to.as_ref().to_string() } else { Url::parse(from.as_ref())? .join(to.as_ref())? .as_ref() .to_string() }; Ok(result) } pub fn resolve_css_imports( cache: &mut HashMap, client: &Client, css_string: &str, as_data_url: bool, parent_url: &str, href: &str, opt_no_images: bool, opt_silent: bool, ) -> String { let mut resolved_css = String::from(css_string); for link in REGEX_CSS_URL.captures_iter(&css_string) { let target_link = link.name("url").unwrap().as_str(); // Determine linked asset type let is_stylesheet = link.name("stylesheet").is_some(); let is_font = link.name("font").is_some(); let is_image = !is_stylesheet && !is_font; // Generate absolute URL for the content let embedded_url = match resolve_url(href, target_link) { Ok(url) => url, Err(_) => continue, // Malformed URL }; // Download the asset. If it's more CSS, resolve that too let content = if is_stylesheet { // The link is an @import link retrieve_asset( cache, client, &parent_url, &embedded_url, false, // Formatting as data URL will be done later "text/css", // Expect CSS opt_silent, ) .map(|(content, _)| { resolve_css_imports( cache, client, &content, true, // Finally, convert to a data URL &parent_url, &embedded_url, opt_no_images, opt_silent, ) }) } else if (is_image && !opt_no_images) || is_font { // The link is some other, non-@import link retrieve_asset( cache, client, &parent_url, &embedded_url, true, // Format as data URL "", // Unknown MIME type opt_silent, ) .map(|(a, _)| a) } else { // If it's a datatype that has been opt_no'd out of, replace with // absolute URL Ok(embedded_url.clone()) } .unwrap_or_else(|e| { eprintln!("Warning: {}", e); // If failed to resolve, replace with absolute URL embedded_url }); let replacement = format!("\"{}\"", &content); let dest = link.name("to_repl").unwrap(); if resolved_css.len() > css_string.len() { let offset = resolved_css.len() - css_string.len(); let target_range = (dest.start() + offset)..(dest.end() + offset); resolved_css.replace_range(target_range, &replacement); } } if as_data_url { data_to_data_url("text/css", resolved_css.as_bytes()) } else { resolved_css } } pub fn clean_url>(url: T) -> String { let mut result = Url::parse(url.as_ref()).unwrap(); // Clear fragment result.set_fragment(None); // Get rid of stray question mark if result.query() == Some("") { result.set_query(None); } result.to_string() } pub fn data_url_to_text>(url: T) -> String { let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap()); let path: String = parsed_url.path().to_string(); let comma_loc: usize = path.find(',').unwrap_or(path.len()); if comma_loc == path.len() { return str!(); } let meta_data: String = path.chars().take(comma_loc).collect(); let raw_data: String = path.chars().skip(comma_loc + 1).collect(); let data: String = decode_url(raw_data); let meta_data_items: Vec<&str> = meta_data.split(';').collect(); let mut mime_type: &str = ""; let mut encoding: &str = ""; let mut i: i8 = 0; for item in &meta_data_items { if i == 0 { if item.eq_ignore_ascii_case("text/html") { mime_type = item; continue; } } if item.eq_ignore_ascii_case("base64") || item.eq_ignore_ascii_case("utf8") { encoding = item; } i = i + 1; } if mime_type.eq_ignore_ascii_case("text/html") { if encoding.eq_ignore_ascii_case("base64") { String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!()) } else { data } } else { str!() } } pub fn decode_url(input: String) -> String { form_urlencoded::parse(input.as_bytes()) .map(|(key, val)| { [ key.to_string(), if val.to_string().len() == 0 { str!() } else { str!('=') }, val.to_string(), ] .concat() }) .collect() } pub fn retrieve_asset( cache: &mut HashMap, client: &Client, parent_url: &str, url: &str, as_data_url: bool, mime: &str, opt_silent: bool, ) -> Result<(String, String), reqwest::Error> { if url.len() == 0 { return Ok((str!(), str!())); } let cache_key = clean_url(&url); if is_data_url(&url) { Ok((url.to_string(), url.to_string())) } else if is_file_url(&url) { // Check if parent_url is also file:/// // (if not then we don't download/embed the asset) if !is_file_url(&parent_url) { return Ok((str!(), str!())); } let cutoff = if cfg!(windows) { 8 } else { 7 }; let fs_file_path: String = decode_url(url.to_string()[cutoff..].to_string()); let path = Path::new(&fs_file_path); if path.exists() { if !opt_silent { eprintln!("{}", &url); } if as_data_url { let data_url: String = data_to_data_url(&mime, &fs::read(&fs_file_path).unwrap()); Ok((data_url, url.to_string())) } else { let data: String = fs::read_to_string(&fs_file_path).expect(url); Ok((data, url.to_string())) } } else { Ok((str!(), url.to_string())) } } else { if cache.contains_key(&cache_key) { // URL is in cache if !opt_silent { eprintln!("{} (from cache)", &url); } let data = cache.get(&cache_key).unwrap(); Ok((data.to_string(), url.to_string())) } else { // URL not in cache, we request it let mut response = client.get(url).send()?; let res_url = response.url().to_string(); if !opt_silent { if url == res_url { eprintln!("{}", &url); } else { eprintln!("{} -> {}", &url, &res_url); } } let new_cache_key = clean_url(&res_url); if as_data_url { // Convert response into a byte array let mut data: Vec = vec![]; response.copy_to(&mut data)?; // Attempt to obtain MIME type by reading the Content-Type header let media_type = if mime == "" { response .headers() .get(CONTENT_TYPE) .and_then(|header| header.to_str().ok()) .unwrap_or(&mime) } else { mime }; let data_url = data_to_data_url(&media_type, &data); // Add to cache cache.insert(new_cache_key, data_url.clone()); Ok((data_url, res_url)) } else { let content = response.text().unwrap(); // Add to cache cache.insert(new_cache_key, content.clone()); Ok((content, res_url)) } } } }