Merge Y2Z/master, fix conflicts between shared-client & resolve-css

This commit is contained in:
Emi Simpson 2019-12-12 19:29:21 -05:00
commit 3d4a932ac1
No known key found for this signature in database
GPG key ID: 68FAB2E2E6DFC98B
5 changed files with 188 additions and 11 deletions

2
Cargo.lock generated
View file

@ -625,7 +625,7 @@ dependencies = [
[[package]]
name = "monolith"
version = "2.0.23"
version = "2.1.0"
dependencies = [
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",

View file

@ -1,10 +1,11 @@
[package]
name = "monolith"
version = "2.0.23"
version = "2.1.0"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
]
description = "CLI tool for saving web pages as a single HTML file"

View file

@ -10,7 +10,7 @@ use js::attr_is_event_handler;
use reqwest::Client;
use std::collections::HashMap;
use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
lazy_static! {
static ref EMPTY_STRING: String = String::new();
@ -127,17 +127,36 @@ pub fn walk_and_embed_assets(
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let (css_dataurl, _) = retrieve_asset(
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
true,
false,
"text/css",
opt_silent,
)
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_silent,
),
// If a network error occurred, warn
Err(e) => {
eprintln!("Warning: {}", e,);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(css_dataurl.as_str());
attr.value.push_slice(&replacement_text);
}
}
}
@ -267,6 +286,23 @@ pub fn walk_and_embed_assets(
if opt_no_css {
// Empty inner content of STYLE tags
node.children.borrow_mut().clear();
} else {
for node in node.children.borrow_mut().iter_mut() {
if let NodeData::Text { ref contents } = node.data {
let mut tendril = contents.borrow_mut();
let replacement = resolve_css_imports(
cache,
client,
tendril.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
tendril.clear();
tendril.push_slice(&replacement);
}
}
}
}
"form" => {
@ -363,6 +399,7 @@ pub fn walk_and_embed_assets(
_ => {}
}
// Process style attributes
if opt_no_css {
// Get rid of style attributes
let mut style_attr_indexes = Vec::new();
@ -375,6 +412,24 @@ pub fn walk_and_embed_assets(
for attr_index in style_attr_indexes {
attrs_mut.remove(attr_index);
}
} else {
// Otherwise, parse any links found in the attributes
for attribute in attrs_mut
.iter_mut()
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
{
let replacement = resolve_css_imports(
cache,
client,
attribute.value.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
attribute.value.clear();
attribute.value.push_slice(&replacement);
}
}
if opt_no_js {

View file

@ -17,7 +17,7 @@ pub fn retrieve_asset(
if cache.contains_key(&url.to_string()) {
// url is in cache
if !opt_silent {
eprintln!("[ {} ] (from cache)", &url);
eprintln!("{} (from cache)", &url);
}
let data = cache.get(&url.to_string()).unwrap();
Ok((data.to_string(), url.to_string()))
@ -27,9 +27,9 @@ pub fn retrieve_asset(
if !opt_silent {
if url == response.url().as_str() {
eprintln!("[ {} ]", &url);
eprintln!("{}", &url);
} else {
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
eprintln!("{} -> {}", &url, &response.url().as_str());
}
}

View file

@ -1,12 +1,47 @@
extern crate base64;
use self::base64::encode;
use http::retrieve_asset;
use regex::Regex;
use reqwest::Client;
use std::collections::HashMap;
use url::{ParseError, Url};
/// This monster of a regex is used to match any kind of URL found in CSS.
///
/// There are roughly three different categories that a found URL could fit
/// into:
/// - Font [found after a `src:` descriptor, as used in an `@font-face` rule]
/// - Stylesheet [denoted by an `@import` before the url]
/// - Image [covers all other uses of the `url()` function]
///
/// This regex aims to extract the following information:
/// - What type of URL it is (font/image/css)
/// - Where the part that needs to be replaced is (incl. any wrapping quotes)
/// - What the URL is (excl. any wrapping quotes)
///
/// Essentially, the regex can be broken down into two parts:
///
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
/// This matches the precursor to a font or CSS URL, and fills in a match under
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
/// Whether or not it's an image can be determined by the absence of both of
/// these. Either zero or one of these can match, and when one does it must be
/// followed by at least one whitespace character.
///
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
/// This matches the actual URL part of the `url()`, and must always match. It
/// also sets `<to_repl>` and `<url>`, which correspond to everything within
/// `url(...)` (including any wrapping quotes) and a usable URL (without the
/// quotes), respectively.
///
/// Note, however, that this does not perform any validation of the found URL.
/// Malformed CSS could lead to an invalid URL being present. It is therefore
/// recommended that the URL be validated by the caller.
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
// Regexes compiled once, on first use, and shared by the helpers in this file.
lazy_static! {
// Matches text that begins with one or more lowercase ASCII letters or
// digits followed by a colon (e.g. `http:`, `data:`).
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
// Matches text that begins with `http://` or `https://`.
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
// Compiled form of `CSS_URL_REGEX_STR`; exposes the named groups
// `stylesheet`, `font`, `to_repl` and `url`.
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
}
const MAGIC: [[&[u8]; 2]; 19] = [
@ -75,3 +110,89 @@ pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<Strin
};
Ok(result)
}
/// Rewrites every `url(...)` reference found in a piece of CSS so that it
/// points at embedded content instead of an external resource.
///
/// Each match of `REGEX_CSS_URL` is resolved against `href` and then handled
/// according to its kind:
/// - `@import` stylesheets are downloaded and processed recursively (with the
///   imported sheet's own URL as the new base) and requested as a data URL;
/// - fonts are always downloaded and embedded;
/// - anything else is treated as an image and is downloaded and embedded
///   unless `opt_no_images` is set, in which case it is replaced by its
///   absolute URL.
///
/// If a download fails, a warning is printed to stderr and the absolute URL is
/// used in place of the content. References whose URL cannot be resolved are
/// left untouched. Every replacement is wrapped in double quotes.
///
/// # Arguments
/// * `cache` - asset cache handed through to `retrieve_asset`
/// * `client` - HTTP client handed through to `retrieve_asset`
/// * `css_string` - the CSS text to process
/// * `as_dataurl` - when `true`, the processed CSS is passed through
///   `data_to_dataurl` with the MIME type `text/css` before being returned;
///   when `false`, the processed CSS text itself is returned
/// * `href` - base URL against which the URLs in `css_string` are resolved
/// * `opt_no_images` - when `true`, images are not downloaded
/// * `opt_silent` - handed through to `retrieve_asset`
///
/// NOTE(review): nothing here guards against stylesheets that `@import` each
/// other in a cycle; such input would keep recursing for as long as the
/// downloads succeed.
pub fn resolve_css_imports(
    cache: &mut HashMap<String, String>,
    client: &Client,
    css_string: &str,
    as_dataurl: bool,
    href: &str,
    opt_no_images: bool,
    opt_silent: bool,
) -> String {
    // The output is assembled left to right: untouched text between matches is
    // copied verbatim and each `to_repl` span is swapped for its replacement.
    // This avoids patching a copy in place, which needs a running offset that
    // goes negative (and underflows `usize`) as soon as the replacements are
    // shorter overall than the text they replace.
    let mut resolved_css = String::with_capacity(css_string.len());
    let mut copied_up_to = 0;

    for link in REGEX_CSS_URL.captures_iter(css_string) {
        let target_link = link
            .name("url")
            .expect("`url` is a mandatory group of REGEX_CSS_URL")
            .as_str();
        let dest = link
            .name("to_repl")
            .expect("`to_repl` is a mandatory group of REGEX_CSS_URL");

        // Determine the type of link
        let is_stylesheet = link.name("stylesheet").is_some();
        let is_font = link.name("font").is_some();
        let is_image = !is_stylesheet && !is_font;

        // Generate absolute URL for content
        let embedded_url = match resolve_url(href, target_link) {
            Ok(url) => url,
            // Malformed URL: leave this reference exactly as it was written
            Err(_) => continue,
        };

        // Download the asset. If it's more CSS, resolve that too
        let fetched = if is_stylesheet {
            // The link is an @import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                false,      // Formatting as data URL will be done later
                "text/css", // Expect CSS
                opt_silent,
            )
            .map(|(content, _)| {
                resolve_css_imports(
                    cache,
                    client,
                    &content,
                    true, // Finally, convert to a dataurl
                    &embedded_url,
                    opt_no_images,
                    opt_silent,
                )
            })
        } else if (is_image && !opt_no_images) || is_font {
            // The link is some other, non-@import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                true, // Format as data URL
                "",   // Unknown MIME type
                opt_silent,
            )
            .map(|(data, _)| data)
        } else {
            // If it's a datatype that has been opt_no'd out of, replace with
            // absolute URL
            Ok(embedded_url.clone())
        };

        let content = fetched.unwrap_or_else(|e| {
            eprintln!("Warning: {}", e);
            // If failed to resolve, replace with absolute URL
            embedded_url
        });

        // Copy everything since the previous replacement, then the quoted
        // replacement itself. Regex match offsets always fall on character
        // boundaries, so the slice cannot split a character.
        resolved_css.push_str(&css_string[copied_up_to..dest.start()]);
        resolved_css.push('"');
        resolved_css.push_str(&content);
        resolved_css.push('"');
        copied_up_to = dest.end();
    }

    // Whatever follows the last replaced span (or the whole input when there
    // was nothing to replace)
    resolved_css.push_str(&css_string[copied_up_to..]);

    if as_dataurl {
        data_to_dataurl("text/css", resolved_css.as_bytes())
    } else {
        resolved_css
    }
}