Merge pull request #78 from Alch-Emi/load-css-imports
Load URLs in CSS and style attributes
This commit is contained in:
commit
919e626b5e
4 changed files with 191 additions and 8 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -625,7 +625,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "monolith"
|
name = "monolith"
|
||||||
version = "2.0.23"
|
version = "2.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
[package]
|
[package]
|
||||||
name = "monolith"
|
name = "monolith"
|
||||||
version = "2.0.23"
|
version = "2.1.0"
|
||||||
authors = [
|
authors = [
|
||||||
"Sunshine <sunshine@uberspace.net>",
|
"Sunshine <sunshine@uberspace.net>",
|
||||||
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
||||||
"Emmanuel Delaborde <th3rac25@gmail.com>",
|
"Emmanuel Delaborde <th3rac25@gmail.com>",
|
||||||
|
"Emi Simpson <emi@alchemi.dev>",
|
||||||
]
|
]
|
||||||
description = "CLI tool for saving web pages as a single HTML file"
|
description = "CLI tool for saving web pages as a single HTML file"
|
||||||
|
|
||||||
|
|
70
src/html.rs
70
src/html.rs
|
@ -9,7 +9,7 @@ use http::retrieve_asset;
|
||||||
use js::attr_is_event_handler;
|
use js::attr_is_event_handler;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::default::Default;
|
use std::default::Default;
|
||||||
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
|
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref EMPTY_STRING: String = String::new();
|
static ref EMPTY_STRING: String = String::new();
|
||||||
|
@ -129,18 +129,38 @@ pub fn walk_and_embed_assets(
|
||||||
let href_full_url: String =
|
let href_full_url: String =
|
||||||
resolve_url(&url, &attr.value.to_string())
|
resolve_url(&url, &attr.value.to_string())
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let (css_dataurl, _) = retrieve_asset(
|
let replacement_text = match retrieve_asset(
|
||||||
cache,
|
cache,
|
||||||
&href_full_url,
|
&href_full_url,
|
||||||
true,
|
false,
|
||||||
"text/css",
|
"text/css",
|
||||||
opt_user_agent,
|
opt_user_agent,
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
) {
|
||||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
// On successful retrieval, traverse CSS
|
||||||
|
Ok((css_data, _)) => resolve_css_imports(
|
||||||
|
cache,
|
||||||
|
&css_data,
|
||||||
|
true,
|
||||||
|
&href_full_url,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
opt_silent,
|
||||||
|
opt_insecure,
|
||||||
|
),
|
||||||
|
|
||||||
|
// If a network error occurred, warn
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Warning: {}", e,);
|
||||||
|
|
||||||
|
// If failed to resolve, replace with absolute URL
|
||||||
|
href_full_url
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(css_dataurl.as_str());
|
attr.value.push_slice(&replacement_text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -273,6 +293,24 @@ pub fn walk_and_embed_assets(
|
||||||
if opt_no_css {
|
if opt_no_css {
|
||||||
// Empty inner content of STYLE tags
|
// Empty inner content of STYLE tags
|
||||||
node.children.borrow_mut().clear();
|
node.children.borrow_mut().clear();
|
||||||
|
} else {
|
||||||
|
for node in node.children.borrow_mut().iter_mut() {
|
||||||
|
if let NodeData::Text { ref contents } = node.data {
|
||||||
|
let mut tendril = contents.borrow_mut();
|
||||||
|
let replacement = resolve_css_imports(
|
||||||
|
cache,
|
||||||
|
tendril.as_ref(),
|
||||||
|
false,
|
||||||
|
&url,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
opt_silent,
|
||||||
|
opt_insecure,
|
||||||
|
);
|
||||||
|
tendril.clear();
|
||||||
|
tendril.push_slice(&replacement);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"form" => {
|
"form" => {
|
||||||
|
@ -372,6 +410,7 @@ pub fn walk_and_embed_assets(
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Process style attributes
|
||||||
if opt_no_css {
|
if opt_no_css {
|
||||||
// Get rid of style attributes
|
// Get rid of style attributes
|
||||||
let mut style_attr_indexes = Vec::new();
|
let mut style_attr_indexes = Vec::new();
|
||||||
|
@ -384,6 +423,25 @@ pub fn walk_and_embed_assets(
|
||||||
for attr_index in style_attr_indexes {
|
for attr_index in style_attr_indexes {
|
||||||
attrs_mut.remove(attr_index);
|
attrs_mut.remove(attr_index);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Otherwise, parse any links found in the attributes
|
||||||
|
for attribute in attrs_mut
|
||||||
|
.iter_mut()
|
||||||
|
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
|
||||||
|
{
|
||||||
|
let replacement = resolve_css_imports(
|
||||||
|
cache,
|
||||||
|
attribute.value.as_ref(),
|
||||||
|
false,
|
||||||
|
&url,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
opt_silent,
|
||||||
|
opt_insecure,
|
||||||
|
);
|
||||||
|
attribute.value.clear();
|
||||||
|
attribute.value.push_slice(&replacement);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if opt_no_js {
|
if opt_no_js {
|
||||||
|
|
124
src/utils.rs
124
src/utils.rs
|
@ -1,12 +1,46 @@
|
||||||
extern crate base64;
|
extern crate base64;
|
||||||
|
|
||||||
use self::base64::encode;
|
use self::base64::encode;
|
||||||
|
use http::retrieve_asset;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
use std::collections::HashMap;
|
||||||
use url::{ParseError, Url};
|
use url::{ParseError, Url};
|
||||||
|
|
||||||
|
/// This monster of a regex is used to match any kind of URL found in CSS.
|
||||||
|
///
|
||||||
|
/// There are roughly three different categories that a found URL could fit
|
||||||
|
/// into:
|
||||||
|
/// - Font [found after a src: descriptor in an @font-face rule]
|
||||||
|
/// - Stylesheet [denoted by an @import before the url]
|
||||||
|
/// - Image [covers all other uses of the url() function]
|
||||||
|
///
|
||||||
|
/// This regex aims to extract the following information:
|
||||||
|
/// - What type of URL is it (font/image/css)
|
||||||
|
/// - Where is the part that needs to be replaced (incl any wrapping quotes)
|
||||||
|
/// - What is the URL (excl any wrapping quotes)
|
||||||
|
///
|
||||||
|
/// Essentially, the regex can be broken down into two parts:
|
||||||
|
///
|
||||||
|
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
|
||||||
|
/// This matches the precursor to a font or CSS URL, and fills in a match under
|
||||||
|
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
|
||||||
|
/// Determining whether or not it's an image can be done by the negation of both
|
||||||
|
/// of these. Either zero or one of these can match.
|
||||||
|
///
|
||||||
|
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
|
||||||
|
/// This matches the actual URL part of the url(), and must always match. It also
|
||||||
|
/// sets `<to_repl>` and `<url>` which correspond to everything within
|
||||||
|
/// `url(...)` and a usable URL, respectively.
|
||||||
|
///
|
||||||
|
/// Note, however, that this does not perform any validation of the found URL.
|
||||||
|
/// Malformed CSS could lead to an invalid URL being present. It is therefore
|
||||||
|
/// recommended that the URL gets manually validated.
|
||||||
|
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||||
|
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
const MAGIC: [[&[u8]; 2]; 19] = [
|
const MAGIC: [[&[u8]; 2]; 19] = [
|
||||||
|
@ -75,3 +109,93 @@ pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<Strin
|
||||||
};
|
};
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn resolve_css_imports(
|
||||||
|
cache: &mut HashMap<String, String>,
|
||||||
|
css_string: &str,
|
||||||
|
as_dataurl: bool,
|
||||||
|
href: &str,
|
||||||
|
opt_no_images: bool,
|
||||||
|
opt_user_agent: &str,
|
||||||
|
opt_silent: bool,
|
||||||
|
opt_insecure: bool,
|
||||||
|
) -> String {
|
||||||
|
let mut resolved_css = String::from(css_string);
|
||||||
|
|
||||||
|
for link in REGEX_CSS_URL.captures_iter(&css_string) {
|
||||||
|
let target_link = link.name("url").unwrap().as_str();
|
||||||
|
|
||||||
|
// Determine the type of link
|
||||||
|
let is_stylesheet = link.name("stylesheet").is_some();
|
||||||
|
let is_font = link.name("font").is_some();
|
||||||
|
let is_image = !is_stylesheet && !is_font;
|
||||||
|
|
||||||
|
// Generate absolute URL for content
|
||||||
|
let embedded_url = match resolve_url(href, target_link) {
|
||||||
|
Ok(url) => url,
|
||||||
|
Err(_) => continue, // Malformed URL
|
||||||
|
};
|
||||||
|
|
||||||
|
// Download the asset. If it's more CSS, resolve that too
|
||||||
|
let content = if is_stylesheet {
|
||||||
|
// The link is an @import link
|
||||||
|
retrieve_asset(
|
||||||
|
cache,
|
||||||
|
&embedded_url,
|
||||||
|
false, // Formatting as data URL will be done later
|
||||||
|
"text/css", // Expect CSS
|
||||||
|
opt_user_agent,
|
||||||
|
opt_silent,
|
||||||
|
opt_insecure,
|
||||||
|
)
|
||||||
|
.map(|(content, _)| {
|
||||||
|
resolve_css_imports(
|
||||||
|
cache,
|
||||||
|
&content,
|
||||||
|
true, // Finally, convert to a dataurl
|
||||||
|
&embedded_url,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
opt_silent,
|
||||||
|
opt_insecure,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
} else if (is_image && !opt_no_images) || is_font {
|
||||||
|
// The link is some other, non-@import link
|
||||||
|
retrieve_asset(
|
||||||
|
cache,
|
||||||
|
&embedded_url,
|
||||||
|
true, // Format as data URL
|
||||||
|
"", // Unknown MIME type
|
||||||
|
opt_user_agent,
|
||||||
|
opt_silent,
|
||||||
|
opt_insecure,
|
||||||
|
)
|
||||||
|
.map(|(a, _)| a)
|
||||||
|
} else {
|
||||||
|
// If it's a datatype that has been opt_no'd out of, replace with
|
||||||
|
// absolute URL
|
||||||
|
|
||||||
|
Ok(embedded_url.clone())
|
||||||
|
}
|
||||||
|
.unwrap_or_else(|e| {
|
||||||
|
eprintln!("Warning: {}", e);
|
||||||
|
|
||||||
|
// If failed to resolve, replace with absolute URL
|
||||||
|
embedded_url
|
||||||
|
});
|
||||||
|
|
||||||
|
let replacement = format!("\"{}\"", &content);
|
||||||
|
let dest = link.name("to_repl").unwrap();
|
||||||
|
let offset = resolved_css.len() - css_string.len();
|
||||||
|
let target_range = (dest.start() + offset)..(dest.end() + offset);
|
||||||
|
|
||||||
|
resolved_css.replace_range(target_range, &replacement);
|
||||||
|
}
|
||||||
|
|
||||||
|
if as_dataurl {
|
||||||
|
data_to_dataurl("text/css", resolved_css.as_bytes())
|
||||||
|
} else {
|
||||||
|
resolved_css
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue