diff --git a/Cargo.lock b/Cargo.lock index 089b71e..1ef3522 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -625,7 +625,7 @@ dependencies = [ [[package]] name = "monolith" -version = "2.0.23" +version = "2.1.0" dependencies = [ "base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index 7bb4abd..901fe35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,11 @@ [package] name = "monolith" -version = "2.0.23" +version = "2.1.0" authors = [ "Sunshine ", "Mahdi Robatipoor ", "Emmanuel Delaborde ", + "Emi Simpson ", ] description = "CLI tool for saving web pages as a single HTML file" diff --git a/src/html.rs b/src/html.rs index 3c7057f..ea1b620 100644 --- a/src/html.rs +++ b/src/html.rs @@ -9,7 +9,7 @@ use http::retrieve_asset; use js::attr_is_event_handler; use std::collections::HashMap; use std::default::Default; -use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol}; +use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol}; lazy_static! 
{ static ref EMPTY_STRING: String = String::new(); @@ -129,18 +129,38 @@ pub fn walk_and_embed_assets( let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); - let (css_dataurl, _) = retrieve_asset( + let replacement_text = match retrieve_asset( cache, &href_full_url, - true, + false, "text/css", opt_user_agent, opt_silent, opt_insecure, - ) - .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); + ) { + // On successful retrieval, traverse CSS + Ok((css_data, _)) => resolve_css_imports( + cache, + &css_data, + true, + &href_full_url, + opt_no_images, + opt_user_agent, + opt_silent, + opt_insecure, + ), + + // If a network error occured, warn + Err(e) => { + eprintln!("Warning: {}", e,); + + // If failed to resolve, replace with absolute URL + href_full_url + } + }; + attr.value.clear(); - attr.value.push_slice(css_dataurl.as_str()); + attr.value.push_slice(&replacement_text); } } } @@ -273,6 +293,24 @@ pub fn walk_and_embed_assets( if opt_no_css { // Empty inner content of STYLE tags node.children.borrow_mut().clear(); + } else { + for node in node.children.borrow_mut().iter_mut() { + if let NodeData::Text { ref contents } = node.data { + let mut tendril = contents.borrow_mut(); + let replacement = resolve_css_imports( + cache, + tendril.as_ref(), + false, + &url, + opt_no_images, + opt_user_agent, + opt_silent, + opt_insecure, + ); + tendril.clear(); + tendril.push_slice(&replacement); + } + } } } "form" => { @@ -372,6 +410,7 @@ pub fn walk_and_embed_assets( _ => {} } + // Process style attributes if opt_no_css { // Get rid of style attributes let mut style_attr_indexes = Vec::new(); @@ -384,6 +423,25 @@ pub fn walk_and_embed_assets( for attr_index in style_attr_indexes { attrs_mut.remove(attr_index); } + } else { + // Otherwise, parse any links found in the attributes + for attribute in attrs_mut + .iter_mut() + .filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style")) + { + let replacement = 
resolve_css_imports( + cache, + attribute.value.as_ref(), + false, + &url, + opt_no_images, + opt_user_agent, + opt_silent, + opt_insecure, + ); + attribute.value.clear(); + attribute.value.push_slice(&replacement); + } } if opt_no_js { diff --git a/src/utils.rs b/src/utils.rs index 069e974..6637a98 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,12 +1,46 @@ extern crate base64; use self::base64::encode; +use http::retrieve_asset; use regex::Regex; +use std::collections::HashMap; use url::{ParseError, Url}; +/// This monster of a regex is used to match any kind of URL found in CSS. +/// +/// There are roughly three different categories that a found URL could fit +/// into: +/// - Font [found after a src: descriptor in an @font-face rule] +/// - Stylesheet [denoted by an @import before the url] +/// - Image [covers all other uses of the url() function] +/// +/// This regex aims to extract the following information: +/// - What type of URL is it (font/image/css) +/// - Where is the part that needs to be replaced (incl any wrapping quotes) +/// - What is the URL (excl any wrapping quotes) +/// +/// Essentially, the regex can be broken down into two parts: +/// +/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?` +/// This matches the precursor to a font or CSS URL, and fills in a match under +/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font). +/// Determining whether or not it's an image can be done by the negation of both +/// of these. Either zero or one of these can match. +/// +/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)` +/// This matches the actual URL part of the url(), and must always match. It also +/// sets `<to_repl>` and `<url>` which correspond to everything within +/// `url(...)` and a usable URL, respectively. +/// +/// Note, however, that this does not perform any validation of the found URL. +/// Malformed CSS could lead to an invalid URL being present. It is therefore +/// recommended that the URL gets manually validated.
+const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P@import)|(?Psrc\s*:))\s+)?url\((?P['"]?(?P[^"'\)]+)['"]?)\)"###; + lazy_static! { static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); + static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap(); } const MAGIC: [[&[u8]; 2]; 19] = [ @@ -75,3 +109,93 @@ pub fn resolve_url, U: AsRef>(from: T, to: U) -> Result, + css_string: &str, + as_dataurl: bool, + href: &str, + opt_no_images: bool, + opt_user_agent: &str, + opt_silent: bool, + opt_insecure: bool, +) -> String { + let mut resolved_css = String::from(css_string); + + for link in REGEX_CSS_URL.captures_iter(&css_string) { + let target_link = link.name("url").unwrap().as_str(); + + // Determine the type of link + let is_stylesheet = link.name("stylesheet").is_some(); + let is_font = link.name("font").is_some(); + let is_image = !is_stylesheet && !is_font; + + // Generate absolute URL for content + let embedded_url = match resolve_url(href, target_link) { + Ok(url) => url, + Err(_) => continue, // Malformed URL + }; + + // Download the asset. 
If it's more CSS, resolve that too + let content = if is_stylesheet { + // The link is an @import link + retrieve_asset( + cache, + &embedded_url, + false, // Formating as data URL will be done later + "text/css", // Expect CSS + opt_user_agent, + opt_silent, + opt_insecure, + ) + .map(|(content, _)| { + resolve_css_imports( + cache, + &content, + true, // Finally, convert to a dataurl + &embedded_url, + opt_no_images, + opt_user_agent, + opt_silent, + opt_insecure, + ) + }) + } else if (is_image && !opt_no_images) || is_font { + // The link is some other, non-@import link + retrieve_asset( + cache, + &embedded_url, + true, // Format as data URL + "", // Unknown MIME type + opt_user_agent, + opt_silent, + opt_insecure, + ) + .map(|(a, _)| a) + } else { + // If it's a datatype that has been opt_no'd out of, replace with + // absolute URL + + Ok(embedded_url.clone()) + } + .unwrap_or_else(|e| { + eprintln!("Warning: {}", e); + + // If failed to resolve, replace with absolute URL + embedded_url + }); + + let replacement = format!("\"{}\"", &content); + let dest = link.name("to_repl").unwrap(); + let offset = resolved_css.len() - css_string.len(); + let target_range = (dest.start() + offset)..(dest.end() + offset); + + resolved_css.replace_range(target_range, &replacement); + } + + if as_dataurl { + data_to_dataurl("text/css", resolved_css.as_bytes()) + } else { + resolved_css + } +}