monolith/src/utils.rs

use crate::http::retrieve_asset;
use base64::encode;
use regex::Regex;
use reqwest::blocking::Client;
use std::collections::HashMap;
use url::{ParseError, Url};

/// This monster of a regex is used to match any kind of URL found in CSS.
///
/// There are roughly three different categories that a found URL could fit
/// into:
///    - Font       [found after a `src:` property, as in an `@font-face` rule]
///    - Stylesheet [denoted by an `@import` before the `url(`]
///    - Image      [covers all other uses of the `url()` function]
///
/// This regex aims to extract the following information:
///    - What type of URL is it (font/image/css)
///    - Where is the part that needs to be replaced (incl. any wrapping quotes)
///    - What is the URL (excl. any wrapping quotes)
///
/// Essentially, the regex can be broken down into two parts:
///
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
/// This matches the precursor to a font or CSS URL, and fills in a match under
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
/// Determining whether or not it's an image can be done by the negation of both
/// of these. Either zero or one of these can match. Note that at least one
/// whitespace character is required between the precursor and `url(`; without
/// it the precursor is not captured and the URL is classified as an image.
///
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
/// This matches the actual URL part of the `url()`, and must always match. It
/// also sets `<to_repl>` and `<url>`, which correspond to everything within
/// `url(...)` and a usable URL, respectively. An `@import "file.css";` written
/// without the `url()` function is therefore not matched at all.
///
/// Note, however, that this does not perform any validation of the found URL.
/// Malformed CSS could lead to an invalid URL being present. It is therefore
/// recommended that the URL gets manually validated.
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;

// Regexes shared by the helpers in this module. `lazy_static!` compiles each
// one lazily and keeps it for the lifetime of the process; the `unwrap()`
// calls can only fail if one of the hard-coded patterns is invalid.
lazy_static! {
    // Matches a leading run of lowercase ASCII letters/digits followed by a
    // colon (e.g. `http:`, `data:`). The pattern itself is case-sensitive;
    // `url_has_protocol` lowercases its input before matching.
    static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
    // Matches strings that start with `http://` or `https://` (lowercase only).
    static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
    // Finds `url(...)` references in CSS; see `CSS_URL_REGEX_STR` for the
    // capture groups it exposes.
    static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
}

// Table of file signatures used by `detect_mimetype`.
//
// Each entry is `[signature, mime_type]`: the byte sequence expected at the
// very start of a file, and the MIME type to report for it. `detect_mimetype`
// walks the table top to bottom and returns the first hit, so order matters
// when one signature is a prefix of another.
//
// NOTE(review): the `....` runs (RIFF, `ftyp`, `moov` entries) presumably stand
// for four bytes that vary from file to file (e.g. the RIFF chunk size). How
// they are interpreted is decided by the matching code in `detect_mimetype`,
// not by this table -- confirm that it treats them as placeholders.
const MAGIC: [[&[u8]; 2]; 19] = [
    // Image
    [b"GIF87a", b"image/gif"],
    [b"GIF89a", b"image/gif"],
    [b"\xFF\xD8\xFF", b"image/jpeg"],
    [b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"],
    // NOTE(review): any document starting with an XML declaration is reported
    // as SVG, not only actual SVG images.
    [b"<?xml ", b"image/svg+xml"],
    [b"<svg ", b"image/svg+xml"],
    [b"RIFF....WEBPVP8 ", b"image/webp"],
    [b"\x00\x00\x01\x00", b"image/x-icon"],
    // Audio
    [b"ID3", b"audio/mpeg"],
    // NOTE(review): MPEG audio frames normally begin with 0xFF followed by a
    // byte whose top bits are set (0xE0..=0xFF); 0x0E / 0x0F look like swapped
    // nibbles -- verify against the MPEG audio frame header layout.
    [b"\xFF\x0E", b"audio/mpeg"],
    [b"\xFF\x0F", b"audio/mpeg"],
    [b"OggS", b"audio/ogg"],
    [b"RIFF....WAVEfmt ", b"audio/wav"],
    [b"fLaC", b"audio/x-flac"],
    // Video
    [b"RIFF....AVI LIST", b"video/avi"],
    [b"....ftyp", b"video/mp4"],
    [b"\x00\x00\x01\x0B", b"video/mpeg"],
    [b"....moov", b"video/quicktime"],
    [b"\x1A\x45\xDF\xA3", b"video/webm"],
];

/// Encodes `data` as a base64 `data:` URL.
///
/// `mime` is used verbatim as the media type. When it is empty, the media
/// type is sniffed from the payload with `detect_mimetype`, which may itself
/// yield an empty string if nothing matches.
pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
    let payload = encode(data);
    if mime.is_empty() {
        // No media type supplied by the caller: fall back to sniffing.
        let sniffed = detect_mimetype(data);
        format!("data:{};base64,{}", sniffed, payload)
    } else {
        format!("data:{};base64,{}", mime, payload)
    }
}

/// Guesses the MIME type of `data` from its leading bytes.
///
/// The signatures in `MAGIC` are tried in table order and the MIME type of
/// the first one that matches is returned. A `.` byte inside a signature is
/// treated as a wildcard for exactly one arbitrary byte, so entries such as
/// `RIFF....WEBPVP8 ` match real files whose size field differs from file to
/// file (a plain prefix comparison would only match literal dots).
///
/// Returns an empty string when no signature matches, including when `data`
/// is shorter than every signature.
pub fn detect_mimetype(data: &[u8]) -> String {
    MAGIC
        .iter()
        .find(|entry| signature_matches(data, entry[0]))
        // The MIME strings are ASCII literals; the lossy conversion merely
        // avoids a panic path should a non-UTF-8 entry ever be added.
        .map(|entry| String::from_utf8_lossy(entry[1]).into_owned())
        .unwrap_or_default()
}

/// Returns `true` when `data` starts with `signature`, where every `.` byte
/// in `signature` matches any single byte of `data`.
fn signature_matches(data: &[u8], signature: &[u8]) -> bool {
    // `zip` stops at the shorter input, so the length must be checked first
    // to keep truncated data from matching a longer signature.
    data.len() >= signature.len()
        && signature
            .iter()
            .zip(data)
            .all(|(&expected, &actual)| expected == b'.' || expected == actual)
}

/// Reports whether `url` begins with a scheme-like prefix.
///
/// The input is lowercased first and then tested against `HAS_PROTOCOL`, so
/// the check is effectively case-insensitive.
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
    let lowered = url.as_ref().to_lowercase();
    HAS_PROTOCOL.is_match(&lowered)
}

/// Parses `url` and reports whether its scheme is `data`.
///
/// # Errors
///
/// Returns the `ParseError` from `Url::parse` when `url` cannot be parsed.
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {
    let parsed = Url::parse(url.as_ref())?;
    Ok(parsed.scheme() == "data")
}

/// Reports whether `path` matches `REGEX_URL`, i.e. starts with `http://` or
/// `https://`. No further validation of the URL is performed.
pub fn is_valid_url<T: AsRef<str>>(path: T) -> bool {
    let candidate: &str = path.as_ref();
    REGEX_URL.is_match(candidate)
}

/// Resolves `to` against the base URL `from`.
///
/// If `to` already passes `is_valid_url` it is returned unchanged and `from`
/// is never parsed. Otherwise `from` is parsed as a URL and `to` is joined
/// onto it.
///
/// # Errors
///
/// Returns a `ParseError` when `from` cannot be parsed or the join fails.
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
    let target = to.as_ref();

    // Already absolute http(s): nothing to resolve.
    if is_valid_url(target) {
        return Ok(target.to_string());
    }

    let base = Url::parse(from.as_ref())?;
    let joined = base.join(target)?;
    Ok(joined.as_str().to_string())
}

/// Rewrites every `url(...)` reference in `css_string`.
///
/// Each reference found by `REGEX_CSS_URL` is resolved against `href` and
/// then replaced, wrapped in double quotes, according to its kind:
///
/// - `@import` stylesheets are fetched with `retrieve_asset`, processed
///   recursively with their own URL as the new base, and inlined as a
///   `text/css` data URL.
/// - Fonts, and images unless `opt_no_images` is set, are fetched with
///   `retrieve_asset` asked to return a data URL.
/// - Images skipped because of `opt_no_images` are replaced by their
///   absolute URL.
///
/// References whose URL cannot be resolved are left exactly as written. When
/// a fetch fails, a warning is printed to stderr and the absolute URL is used
/// instead.
///
/// # Parameters
///
/// - `cache`, `client`, `opt_silent`: passed through to `retrieve_asset`.
/// - `css_string`: the CSS text to process.
/// - `as_dataurl`: when `true`, the processed CSS is returned as a base64
///   `text/css` data URL instead of plain text.
/// - `href`: base URL used to resolve relative references.
/// - `opt_no_images`: when `true`, images are not downloaded.
///
/// NOTE(review): nothing in this function stops a cyclic `@import` chain
/// from recursing indefinitely -- confirm whether callers guard against it.
pub fn resolve_css_imports(
    cache: &mut HashMap<String, String>,
    client: &Client,
    css_string: &str,
    as_dataurl: bool,
    href: &str,
    opt_no_images: bool,
    opt_silent: bool,
) -> String {
    // The output is assembled by appending, in order, the untouched text
    // between matches and each replacement. This avoids patching a copy in
    // place, which needs a running byte offset that underflows `usize` as
    // soon as a replacement is shorter than the text it replaces.
    let mut resolved_css = String::with_capacity(css_string.len());
    let mut copied_up_to = 0;

    for link in REGEX_CSS_URL.captures_iter(css_string) {
        // Both groups are mandatory parts of the pattern, so they are present
        // in every match; skipping is purely defensive.
        let (target_link, dest) = match (link.name("url"), link.name("to_repl")) {
            (Some(url), Some(dest)) => (url.as_str(), dest),
            _ => continue,
        };

        // Determine the type of link
        let is_stylesheet = link.name("stylesheet").is_some();
        let is_font = link.name("font").is_some();
        let is_image = !is_stylesheet && !is_font;

        // Generate absolute URL for content
        let embedded_url = match resolve_url(href, target_link) {
            Ok(url) => url,
            // Malformed URL: the original text is kept, because it is copied
            // verbatim together with the next untouched segment.
            Err(_) => continue,
        };

        // Download the asset. If it's more CSS, resolve that too
        let fetched = if is_stylesheet {
            // The link is an @import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                false,      // Formatting as data URL will be done later
                "text/css", // Expect CSS
                opt_silent,
            )
            .map(|(content, _)| {
                resolve_css_imports(
                    cache,
                    client,
                    &content,
                    true, // Finally, convert to a data URL
                    &embedded_url,
                    opt_no_images,
                    opt_silent,
                )
            })
        } else if is_font || (is_image && !opt_no_images) {
            // The link is some other, non-@import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                true, // Format as data URL
                "",   // Unknown MIME type
                opt_silent,
            )
            .map(|(data_url, _)| data_url)
        } else {
            // An asset type that has been opted out of: replace with the
            // absolute URL
            Ok(embedded_url.clone())
        };

        let content = fetched.unwrap_or_else(|e| {
            eprintln!("Warning: {}", e);

            // If failed to resolve, replace with absolute URL
            embedded_url
        });

        // Matches are non-overlapping and arrive in ascending order, so
        // `dest.start()` is never before `copied_up_to`.
        resolved_css.push_str(&css_string[copied_up_to..dest.start()]);
        resolved_css.push('"');
        resolved_css.push_str(&content);
        resolved_css.push('"');
        copied_up_to = dest.end();
    }

    // Whatever follows the last replaced reference (or the whole input when
    // nothing was replaced).
    resolved_css.push_str(&css_string[copied_up_to..]);

    if as_dataurl {
        data_to_dataurl("text/css", resolved_css.as_bytes())
    } else {
        resolved_css
    }
}

/// Normalizes `url` by dropping its fragment and any empty query string.
///
/// The fragment (`#...`) is always removed. A query is removed only when it
/// is empty, i.e. the URL ends in a stray `?`.
///
/// If `url` cannot be parsed as an absolute URL it is returned unchanged
/// rather than panicking.
pub fn clean_url<T: AsRef<str>>(url: T) -> String {
    let raw = url.as_ref();
    let mut result = match Url::parse(raw) {
        Ok(parsed) => parsed,
        // Unparseable input: hand it back as-is so callers still get a
        // usable string.
        Err(_) => return raw.to_string(),
    };

    // Clear fragment
    result.set_fragment(None);

    // Get rid of stray question mark
    if result.query() == Some("") {
        result.set_query(None);
    }

    result.to_string()
}
migrate to Rust 2018 2020-01-02 16:31:55 +01:00			`use crate::http::retrieve_asset;`
			`use base64::encode;`
Improve code structure 2019-09-29 23:15:49 +02:00			`use regex::Regex;`
upgrade reqwest to v0.10.0 This will improve build time and binary size as follows: * Before - Compile targets: 220 - Build time: `cargo build --release 1264.95s user 39.72s system 335% cpu 6:29.14 total` - Binary size: 6578568 bytes * After - Compile targets: 170 - Build time: `cargo build --release 1130.64s user 32.15s system 359% cpu 5:23.69 total` - Binary size: 6107088 bytes * Differences - Compile targets: 1.29x smaller - Build time: 1.23x faster - Binary size: 1.07x smaller 2020-01-07 05:22:28 +01:00			`use reqwest::blocking::Client;`
Cleaned up some overcomplicated code 2019-12-06 02:20:09 +01:00			`use std::collections::HashMap;`
Improve code structure 2019-09-29 23:15:49 +02:00			`use url::{ParseError, Url};`

Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`/// This monster of a regex is used to match any kind of URL found in CSS.`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`///`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`/// There are roughly three different categories that a found URL could fit`
			`/// into:`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`/// - Font [found after a src: property in an @font-family rule]`
			`/// - Stylesheet [denoted by an @import before the url`
			`/// - Image [covers all other uses of the url() function]`
			`///`
			`/// This regex aims to extract the following information:`
			`/// - What type of URL is it (font/image/css)`
			`/// - Where is the part that needs to be replaced (incl any wrapping quotes)`
			`/// - What is the URL (excl any wrapping quotes)`
			`///`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`/// Essentially, the regex can be broken down into two parts:`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`///`
			/// `(?:(?P<import>@import)\|(?P<font>src\s*:)\s+)?`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`/// This matches the precursor to a font or CSS URL, and fills in a match under`
			/// either `<import>` (if it's a CSS URL) or `<font>` (if it's a font).
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`/// Determining whether or not it's an image can be done by the negation of both`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`/// of these. Either zero or one of these can match.`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`///`
			/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`/// This matches the actual URL part of the url(), and must always match. It also`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			/// sets `<to_repl>` and `<url>` which correspond to everything within
			/// `url(...)` and a usable URL, respectively.
			`///`
			`/// Note, however, that this does not perform any validation of the found URL.`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`/// Malformed CSS could lead to an invalid URL being present. It is therefore`
			`/// recomended that the URL gets manually validated.`
Rustfmt update for nightly formatter 2019-12-06 22:46:52 +01:00			`const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)\|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;`
Moved regex compilation to lazy_static 2019-12-06 19:53:44 +01:00
Improve code structure 2019-09-29 23:15:49 +02:00			`lazy_static! {`
			`static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();`
			`static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();`
Moved regex compilation to lazy_static 2019-12-06 19:53:44 +01:00			`static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();`
Improve code structure 2019-09-29 23:15:49 +02:00			`}`
Get rid of mime-sniffer dependency 2019-08-24 00:48:08 +02:00
refactor utils functions 2019-10-10 15:23:00 +02:00			`const MAGIC: [[&[u8]; 2]; 19] = [`
Get rid of mime-sniffer dependency 2019-08-24 00:48:08 +02:00			`// Image`
			`[b"GIF87a", b"image/gif"],`
			`[b"GIF89a", b"image/gif"],`
			`[b"\xFF\xD8\xFF", b"image/jpeg"],`
			`[b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"],`
			`[b"<?xml ", b"image/svg+xml"],`
			`[b"<svg ", b"image/svg+xml"],`
			`[b"RIFF....WEBPVP8 ", b"image/webp"],`
			`[b"\x00\x00\x01\x00", b"image/x-icon"],`
			`// Audio`
			`[b"ID3", b"audio/mpeg"],`
			`[b"\xFF\x0E", b"audio/mpeg"],`
			`[b"\xFF\x0F", b"audio/mpeg"],`
			`[b"OggS", b"audio/ogg"],`
			`[b"RIFF....WAVEfmt ", b"audio/wav"],`
			`[b"fLaC", b"audio/x-flac"],`
			`// Video`
			`[b"RIFF....AVI LIST", b"video/avi"],`
			`[b"....ftyp", b"video/mp4"],`
			`[b"\x00\x00\x01\x0B", b"video/mpeg"],`
			`[b"....moov", b"video/quicktime"],`
			`[b"\x1A\x45\xDF\xA3", b"video/webm"],`
			`];`
Rewrite program in Rust 2019-08-23 05:17:15 +02:00
			`pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {`
refactor utils functions 2019-10-10 15:23:00 +02:00			`let mimetype = if mime.is_empty() {`
cargo clippy 2019-08-23 20:24:45 +02:00			`detect_mimetype(data)`
Rewrite program in Rust 2019-08-23 05:17:15 +02:00			`} else {`
cargo clippy 2019-08-23 20:24:45 +02:00			`mime.to_string()`
			`};`
Rewrite program in Rust 2019-08-23 05:17:15 +02:00			`format!("data:{};base64,{}", mimetype, encode(data))`
			`}`

Improve code structure 2019-09-29 23:15:49 +02:00			`pub fn detect_mimetype(data: &[u8]) -> String {`
Add CSP isolation, no CSS, and no iframe options 2019-09-22 02:06:00 +02:00			`for item in MAGIC.iter() {`
Get rid of mime-sniffer dependency 2019-08-24 00:48:08 +02:00			`if data.starts_with(item[0]) {`
refactor utils functions 2019-10-10 15:23:00 +02:00			`return String::from_utf8(item[1].to_vec()).unwrap();`
Get rid of mime-sniffer dependency 2019-08-24 00:48:08 +02:00			`}`
			`}`
refactor utils functions 2019-10-10 15:23:00 +02:00			`"".to_owned()`
Rewrite program in Rust 2019-08-23 05:17:15 +02:00			`}`

refactor utils functions 2019-10-10 15:23:00 +02:00			`pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {`
			`HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str())`
Improve code structure 2019-09-29 23:15:49 +02:00			`}`
Rewrite program in Rust 2019-08-23 05:17:15 +02:00
refactor utils functions 2019-10-10 15:23:00 +02:00			`pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {`
			`Url::parse(url.as_ref()).and_then(\|u\| Ok(u.scheme() == "data"))`
Improve code structure 2019-09-29 23:15:49 +02:00			`}`
Get rid of mime-sniffer dependency 2019-08-24 00:48:08 +02:00
refactor utils functions 2019-10-10 15:23:00 +02:00			`pub fn is_valid_url<T: AsRef<str>>(path: T) -> bool {`
			`REGEX_URL.is_match(path.as_ref())`
Improve code structure 2019-09-29 23:15:49 +02:00			`}`

refactor utils functions 2019-10-10 15:23:00 +02:00			`pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {`
			`let result = if is_valid_url(to.as_ref()) {`
			`to.as_ref().to_string()`
Improve code structure 2019-09-29 23:15:49 +02:00			`} else {`
refactor utils functions 2019-10-10 15:23:00 +02:00			`Url::parse(from.as_ref())?`
			`.join(to.as_ref())?`
			`.as_ref()`
			`.to_string()`
Improve code structure 2019-09-29 23:15:49 +02:00			`};`
			`Ok(result)`
Rewrite program in Rust 2019-08-23 05:17:15 +02:00			`}`
Added loading of the links given as url(...) in css files 2019-10-12 11:32:59 +02:00
Fixed formatting 2019-10-12 13:05:07 +02:00			`pub fn resolve_css_imports(`
Made merge compatible with Y2Z/master 2019-12-06 01:01:03 +01:00			`cache: &mut HashMap<String, String>,`
Merge Y2Z/master, fix conflicts between shared-client & resolve-css 2019-12-13 01:29:21 +01:00			`client: &Client,`
Fixed formatting 2019-10-12 13:05:07 +02:00			`css_string: &str,`
Added support for <style> tags 2019-12-06 02:05:52 +01:00			`as_dataurl: bool,`
Fixed formatting 2019-10-12 13:05:07 +02:00			`href: &str,`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`opt_no_images: bool,`
Fixed formatting 2019-10-12 13:05:07 +02:00			`opt_silent: bool,`
Added support for recursively nested css @imports 2019-12-06 00:15:06 +01:00			`) -> String {`
Added loading of the links given as url(...) in css files 2019-10-12 11:32:59 +02:00			`let mut resolved_css = String::from(css_string);`
Fixed formatting 2019-10-12 13:05:07 +02:00
Moved regex compilation to lazy_static 2019-12-06 19:53:44 +01:00			`for link in REGEX_CSS_URL.captures_iter(&css_string) {`
Added support for recursively nested css @imports 2019-12-06 00:15:06 +01:00			`let target_link = link.name("url").unwrap().as_str();`
Fixed some errors detecting, parsing, and transforming urls in `resolve_css_imports` 2019-12-05 23:42:07 +01:00
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`// Determine the type of link`
			`let is_stylesheet = link.name("stylesheet").is_some();`
			`let is_font = link.name("font").is_some();`
			`let is_image = !is_stylesheet && !is_font;`

Fixed some errors detecting, parsing, and transforming urls in `resolve_css_imports` 2019-12-05 23:42:07 +01:00			`// Generate absolute URL for content`
			`let embedded_url = match resolve_url(href, target_link) {`
			`Ok(url) => url,`
			`Err(_) => continue, // Malformed URL`
Fixed formatting 2019-10-12 13:05:07 +02:00			`};`

Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`// Download the asset. If it's more CSS, resolve that too`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`let content = if is_stylesheet {`
Added support for recursively nested css @imports 2019-12-06 00:15:06 +01:00			`// The link is an @import link`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`retrieve_asset(`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`cache,`
Merge Y2Z/master, fix conflicts between shared-client & resolve-css 2019-12-13 01:29:21 +01:00			`client,`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`&embedded_url,`
			`false, // Formating as data URL will be done later`
			`"text/css", // Expect CSS`
			`opt_silent,`
			`)`
			`.map(\|(content, _)\| {`
			`resolve_css_imports(`
			`cache,`
Merge Y2Z/master, fix conflicts between shared-client & resolve-css 2019-12-13 01:29:21 +01:00			`client,`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`&content,`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`true, // Finally, convert to a dataurl`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`&embedded_url,`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`opt_no_images,`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`opt_silent,`
			`)`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`})`
			`} else if (is_image && !opt_no_images) \|\| is_font {`
Added support for recursively nested css @imports 2019-12-06 00:15:06 +01:00			`// The link is some other, non-@import link`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`retrieve_asset(`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`cache,`
Merge Y2Z/master, fix conflicts between shared-client & resolve-css 2019-12-13 01:29:21 +01:00			`client,`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`&embedded_url,`
			`true, // Format as data URL`
			`"", // Unknown MIME type`
			`opt_silent,`
			`)`
Respect the --no-images flag while parsing CSS 2019-12-06 20:59:13 +01:00			`.map(\|(a, _)\| a)`
			`} else {`
			`// If it's a datatype that has been opt_no'd out of, replace with`
			`// absolute URL`

			`Ok(embedded_url.clone())`
Applied rustfmt 2019-12-06 02:41:43 +01:00			`}`
			`.unwrap_or_else(\|e\| {`
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`eprintln!("Warning: {}", e);`
Added fallback to absolute URL on failure to resolve CSS stylesheet @imports 2019-12-06 00:37:37 +01:00
Gramatical and stylistic fixes 2019-12-09 18:41:21 +01:00			`// If failed to resolve, replace with absolute URL`
Added fallback to absolute URL on failure to resolve CSS stylesheet @imports 2019-12-06 00:37:37 +01:00			`embedded_url`
			`});`
Added support for recursively nested css @imports 2019-12-06 00:15:06 +01:00
			`let replacement = format!("\"{}\"", &content);`
Use a slightly more efficient .replace_range() instead of cloning the string twice 2019-12-06 17:37:05 +01:00			`let dest = link.name("to_repl").unwrap();`
Fixed css replacement with more than one linked asset 2019-12-06 19:52:51 +01:00			`let offset = resolved_css.len() - css_string.len();`
			`let target_range = (dest.start() + offset)..(dest.end() + offset);`
Added support for recursively nested css @imports 2019-12-06 00:15:06 +01:00
Fixed css replacement with more than one linked asset 2019-12-06 19:52:51 +01:00			`resolved_css.replace_range(target_range, &replacement);`
Added loading of the links given as url(...) in css files 2019-10-12 11:32:59 +02:00			`}`

Added support for <style> tags 2019-12-06 02:05:52 +01:00			`if as_dataurl {`
			`data_to_dataurl("text/css", resolved_css.as_bytes())`
			`} else {`
			`resolved_css`
			`}`
Fixed formatting 2019-10-12 13:05:07 +02:00			`}`
use clean URLs as hashmap keys 2019-12-12 03:13:11 +01:00
			`pub fn clean_url<T: AsRef<str>>(url: T) -> String {`
			`let mut result = Url::parse(url.as_ref()).unwrap();`
			`// Clear fragment`
			`result.set_fragment(None);`
			`// Get rid of stray question mark`
			`if result.query() == Some("") {`
			`result.set_query(None);`
			`}`
			`result.to_string()`
			`}`