From a4743ca92f4db510940231b20b1a40c705161c9c Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Fri, 6 Dec 2019 14:59:13 -0500 Subject: [PATCH] Respect the --no-images flag while parsing CSS --- src/html.rs | 2 ++ src/utils.rs | 55 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/src/html.rs b/src/html.rs index 1450dc6..128a303 100644 --- a/src/html.rs +++ b/src/html.rs @@ -145,6 +145,7 @@ pub fn walk_and_embed_assets( &css_data, true, &href_full_url, + opt_no_images, opt_user_agent, opt_silent, opt_insecure, @@ -302,6 +303,7 @@ pub fn walk_and_embed_assets( tendril.as_ref(), false, &url, + opt_no_images, opt_user_agent, opt_silent, opt_insecure, diff --git a/src/utils.rs b/src/utils.rs index dd8e4bd..12b0d14 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -6,8 +6,37 @@ use regex::Regex; use std::collections::HashMap; use url::{ParseError, Url}; +/// This monster of a regex is used to match any kind of URL found in a CSS +/// stylesheet. +/// +/// There's roughly three different categories that a found url could fit into: +/// - Font [found after a src: property in an @font-family rule] +/// - Stylesheet [denoted by an @import before the url +/// - Image [covers all other uses of the url() function] +/// +/// This regex aims to extract the following information: +/// - What type of URL is it (font/image/css) +/// - Where is the part that needs to be replaced (incl any wrapping quotes) +/// - What is the URL (excl any wrapping quotes) +/// +/// For understandablility, the regex can be broken down into two parts: +/// +/// `(?:(?P@import)|(?Psrc\s*:)\s+)?` +/// This matches the precursor to a font or css url, and fills in a match under +/// either `` (if it's a css url) or `` (if it's a font). +/// Determining whether or not it's an image can be done by the negation of both +/// of these. Either zero or one of these can match. +/// +/// `url\((?P['"]?(?P[^"'\)]+)['"]?)\)` +/// This matches the actual URL part of the url, and must always match. It also +/// sets `` and `` which correspond to everything within +/// `url(...)` and a usable URL, respectively. +/// +/// Note, however, that this does not perform any validation of the found URL. +/// Malformed CSS could lead to an invalid URL being present. It is therefor +/// recomended that the URL is manually validated. const CSS_URL_REGEX_STR: &str = - r###"(?P@import )?url\((?P['"]?(?P[^"'\)]+)['"]?)\)"###; + r###"(?:(?P@import)|(?Psrc\s*:)\s+)?url\((?P['"]?(?P[^"'\)]+)['"]?)\)"###; lazy_static! { static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); @@ -87,6 +116,7 @@ pub fn resolve_css_imports( css_string: &str, as_dataurl: bool, href: &str, + opt_no_images: bool, opt_user_agent: &str, opt_silent: bool, opt_insecure: bool, @@ -96,6 +126,11 @@ pub fn resolve_css_imports( for link in REGEX_CSS_URL.captures_iter(&css_string) { let target_link = link.name("url").unwrap().as_str(); + // Determine the type of link + let is_stylesheet = link.name("stylesheet").is_some(); + let is_font = link.name("font").is_some(); + let is_image = !is_stylesheet && !is_font; + // Generate absolute URL for content let embedded_url = match resolve_url(href, target_link) { Ok(url) => url, @@ -103,9 +138,9 @@ pub fn resolve_css_imports( }; // Download the asset. If it's more CSS, resolve that too - let content = match link.name("import") { + let content = if is_stylesheet { // The link is an @import link - Some(_) => retrieve_asset( + retrieve_asset( cache, &embedded_url, false, // Formating as data URL will be done later @@ -120,14 +155,15 @@ pub fn resolve_css_imports( &content, true, //NOW, convert to data URL &embedded_url, + opt_no_images, opt_user_agent, opt_silent, opt_insecure, ) - }), - + }) + } else if (is_image && !opt_no_images) || is_font { // The link is some other, non-@import link - None => retrieve_asset( + retrieve_asset( cache, &embedded_url, true, // Format as data URL @@ -136,7 +172,12 @@ pub fn resolve_css_imports( opt_silent, opt_insecure, ) - .map(|(a, _)| a), + .map(|(a, _)| a) + } else { + // If it's a datatype that has been opt_no'd out of, replace with + // absolute URL + + Ok(embedded_url.clone()) } .unwrap_or_else(|e| { eprintln!("Warning: {}", e,);