Merged Y2Z/master with Alch-Emi/lazyload

This commit is contained in:
Emi Simpson 2019-12-24 10:07:56 -05:00
commit dab4ae6965
No known key found for this signature in database
GPG Key ID: 68FAB2E2E6DFC98B
12 changed files with 295 additions and 114 deletions

View File

@ -16,6 +16,11 @@ before_script:
- rustup component add rustfmt - rustup component add rustfmt
script: script:
- cargo build --locked --verbose - cargo build --all --locked --verbose
- cargo test --locked --verbose - cargo test --all --locked --verbose
- cargo fmt --all -- --check - cargo fmt --all -- --check
jobs:
allow_failures:
- rust: nightly
fast_finish: true

2
Cargo.lock generated
View File

@ -625,7 +625,7 @@ dependencies = [
[[package]] [[package]]
name = "monolith" name = "monolith"
version = "2.0.23" version = "2.1.0"
dependencies = [ dependencies = [
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", "base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",

View File

@ -1,10 +1,11 @@
[package] [package]
name = "monolith" name = "monolith"
version = "2.0.23" version = "2.1.0"
authors = [ authors = [
"Sunshine <sunshine@uberspace.net>", "Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>", "Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>", "Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
] ]
description = "CLI tool for saving web pages as a single HTML file" description = "CLI tool for saving web pages as a single HTML file"

View File

@ -18,17 +18,20 @@ Unlike the conventional "Save page as", `monolith` not only saves the target doc
If compared to saving websites with `wget -mpk`, this tool embeds all assets as data URLs and therefore lets browsers render the saved page exactly the way it was on the Internet, even when no network connection is available. If compared to saving websites with `wget -mpk`, this tool embeds all assets as data URLs and therefore lets browsers render the saved page exactly the way it was on the Internet, even when no network connection is available.
<!-- `This program works both on remote and local targets. --> ## Installation
### Installation ### From source
$ git clone https://github.com/Y2Z/monolith.git $ git clone https://github.com/Y2Z/monolith.git
$ cd monolith $ cd monolith
$ cargo install --path . $ cargo install --path .
### Usage ### On macOS (via Homebrew)
$ brew install monolith
## Usage
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html $ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
### Options ## Options
- `-c`: Ignore styles - `-c`: Ignore styles
- `-f`: Exclude iframes - `-f`: Exclude iframes
- `-i`: Remove images - `-i`: Remove images
@ -38,11 +41,11 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
- `-s`: Silent mode - `-s`: Silent mode
- `-u`: Specify custom User-Agent - `-u`: Specify custom User-Agent
### Related Projects ## Related projects
- `Pagesaver`: https://github.com/distributed-mind/pagesaver - `Pagesaver`: https://github.com/distributed-mind/pagesaver
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile - `SingleFile`: https://github.com/gildas-lormeau/SingleFile
### License ## License
The Unlicense The Unlicense
<!-- Microtext --> <!-- Microtext -->

View File

@ -7,13 +7,10 @@ use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns}; use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset; use http::retrieve_asset;
use js::attr_is_event_handler; use js::attr_is_event_handler;
use reqwest::Client;
use std::collections::HashMap; use std::collections::HashMap;
use std::default::Default; use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol}; use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
lazy_static! {
static ref EMPTY_STRING: String = String::new();
}
const ICON_VALUES: [&str; 5] = [ const ICON_VALUES: [&str; 5] = [
"icon", "icon",
@ -35,7 +32,7 @@ pub fn get_parent_node(node: &Handle) -> Handle {
pub fn get_node_name(node: &Handle) -> String { pub fn get_node_name(node: &Handle) -> String {
match &node.data { match &node.data {
NodeData::Element { ref name, .. } => name.local.as_ref().to_string(), NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
_ => EMPTY_STRING.clone(), _ => str!(),
} }
} }
@ -45,14 +42,13 @@ pub fn is_icon(attr_value: &str) -> bool {
pub fn walk_and_embed_assets( pub fn walk_and_embed_assets(
cache: &mut HashMap<String, String>, cache: &mut HashMap<String, String>,
client: &Client,
url: &str, url: &str,
node: &Handle, node: &Handle,
opt_no_css: bool, opt_no_css: bool,
opt_no_js: bool, opt_no_js: bool,
opt_no_images: bool, opt_no_images: bool,
opt_user_agent: &str,
opt_silent: bool, opt_silent: bool,
opt_insecure: bool,
opt_no_frames: bool, opt_no_frames: bool,
) { ) {
match node.data { match node.data {
@ -61,14 +57,13 @@ pub fn walk_and_embed_assets(
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client,
&url, &url,
child, child,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
} }
@ -104,17 +99,16 @@ pub fn walk_and_embed_assets(
} else { } else {
let href_full_url: String = let href_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(str!());
let (favicon_dataurl, _) = retrieve_asset( let (favicon_dataurl, _) = retrieve_asset(
cache, cache,
client,
&href_full_url, &href_full_url,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); .unwrap_or((str!(), str!()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(favicon_dataurl.as_str()); attr.value.push_slice(favicon_dataurl.as_str());
} }
@ -128,19 +122,37 @@ pub fn walk_and_embed_assets(
} else { } else {
let href_full_url: String = let href_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(str!());
let (css_dataurl, _) = retrieve_asset( let replacement_text = match retrieve_asset(
cache, cache,
client,
&href_full_url, &href_full_url,
true, false,
"text/css", "text/css",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure, ) {
) // On successful retrieval, traverse CSS
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); Ok((css_data, _)) => resolve_css_imports(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_silent,
),
// If a network error occurred, warn
Err(e) => {
eprintln!("Warning: {}", e,);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear(); attr.value.clear();
attr.value.push_slice(css_dataurl.as_str()); attr.value.push_slice(&replacement_text);
} }
} }
} }
@ -148,8 +160,7 @@ pub fn walk_and_embed_assets(
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" { if &attr.name.local == "href" {
let href_full_url: String = let href_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(&href_full_url.as_str()); attr.value.push_slice(&href_full_url.as_str());
} }
@ -184,16 +195,15 @@ pub fn walk_and_embed_assets(
.map(|attr| &attr.value) .map(|attr| &attr.value)
.filter(|src| !src.is_empty()) // Ignore empty srcs .filter(|src| !src.is_empty()) // Ignore empty srcs
.next() .next()
.and_then(|src| resolve_url(&url, src).ok()) //Make absolute .and_then(|src| resolve_url(&url, src).ok()) // Make absolute
.and_then(|abs_src| // Download and convert to dataurl .and_then(|abs_src| // Download and convert to dataurl
retrieve_asset( retrieve_asset(
cache, cache,
client,
&abs_src, &abs_src,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
).ok()) ).ok())
{ {
// Add the new dataurl src attribute // Add the new dataurl src attribute
@ -220,17 +230,16 @@ pub fn walk_and_embed_assets(
} else { } else {
let srcset_full_url: String = let srcset_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(str!());
let (source_dataurl, _) = retrieve_asset( let (source_dataurl, _) = retrieve_asset(
cache, cache,
client,
&srcset_full_url, &srcset_full_url,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); .unwrap_or((str!(), str!()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(source_dataurl.as_str()); attr.value.push_slice(source_dataurl.as_str());
} }
@ -246,8 +255,8 @@ pub fn walk_and_embed_assets(
continue; continue;
} }
let href_full_url: String = resolve_url(&url, &attr.value.to_string()) let href_full_url: String =
.unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(href_full_url.as_str()); attr.value.push_slice(href_full_url.as_str());
} }
@ -266,18 +275,16 @@ pub fn walk_and_embed_assets(
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" { if &attr.name.local == "src" {
let src_full_url: String = let src_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
.unwrap_or(EMPTY_STRING.clone());
let (js_dataurl, _) = retrieve_asset( let (js_dataurl, _) = retrieve_asset(
cache, cache,
client,
&src_full_url, &src_full_url,
true, true,
"application/javascript", "application/javascript",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); .unwrap_or((str!(), str!()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(js_dataurl.as_str()); attr.value.push_slice(js_dataurl.as_str());
} }
@ -288,6 +295,23 @@ pub fn walk_and_embed_assets(
if opt_no_css { if opt_no_css {
// Empty inner content of STYLE tags // Empty inner content of STYLE tags
node.children.borrow_mut().clear(); node.children.borrow_mut().clear();
} else {
for node in node.children.borrow_mut().iter_mut() {
if let NodeData::Text { ref contents } = node.data {
let mut tendril = contents.borrow_mut();
let replacement = resolve_css_imports(
cache,
client,
tendril.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
tendril.clear();
tendril.push_slice(&replacement);
}
}
} }
} }
"form" => { "form" => {
@ -296,8 +320,7 @@ pub fn walk_and_embed_assets(
// Modify action to be a full URL // Modify action to be a full URL
if !is_valid_url(&attr.value) { if !is_valid_url(&attr.value) {
let href_full_url: String = let href_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(href_full_url.as_str()); attr.value.push_slice(href_full_url.as_str());
} }
@ -316,33 +339,31 @@ pub fn walk_and_embed_assets(
let iframe_src: String = attr.value.to_string(); let iframe_src: String = attr.value.to_string();
// Ignore iframes with empty source (they cause infinite loops) // Ignore iframes with empty source (they cause infinite loops)
if iframe_src == EMPTY_STRING.clone() { if iframe_src == str!() {
continue; continue;
} }
let src_full_url: String = let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &iframe_src).unwrap_or(str!());
let (iframe_data, iframe_final_url) = retrieve_asset( let (iframe_data, iframe_final_url) = retrieve_asset(
cache, cache,
client,
&src_full_url, &src_full_url,
false, false,
"text/html", "text/html",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((EMPTY_STRING.clone(), src_full_url)); .unwrap_or((str!(), src_full_url));
let dom = html_to_dom(&iframe_data); let dom = html_to_dom(&iframe_data);
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client,
&iframe_final_url, &iframe_final_url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
@ -359,25 +380,24 @@ pub fn walk_and_embed_assets(
let video_poster = attr.value.to_string(); let video_poster = attr.value.to_string();
// Skip posters with empty source // Skip posters with empty source
if video_poster == EMPTY_STRING.clone() { if video_poster == str!() {
continue; continue;
} }
if opt_no_images { if opt_no_images {
attr.value.clear(); attr.value.clear();
} else { } else {
let poster_full_url: String = resolve_url(&url, &video_poster) let poster_full_url: String =
.unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &video_poster).unwrap_or(str!());
let (poster_dataurl, _) = retrieve_asset( let (poster_dataurl, _) = retrieve_asset(
cache, cache,
client,
&poster_full_url, &poster_full_url,
true, true,
"", "",
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
) )
.unwrap_or((poster_full_url, EMPTY_STRING.clone())); .unwrap_or((poster_full_url, str!()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(poster_dataurl.as_str()); attr.value.push_slice(poster_dataurl.as_str());
} }
@ -387,6 +407,7 @@ pub fn walk_and_embed_assets(
_ => {} _ => {}
} }
// Process style attributes
if opt_no_css { if opt_no_css {
// Get rid of style attributes // Get rid of style attributes
let mut style_attr_indexes = Vec::new(); let mut style_attr_indexes = Vec::new();
@ -399,6 +420,24 @@ pub fn walk_and_embed_assets(
for attr_index in style_attr_indexes { for attr_index in style_attr_indexes {
attrs_mut.remove(attr_index); attrs_mut.remove(attr_index);
} }
} else {
// Otherwise, parse any links found in the attributes
for attribute in attrs_mut
.iter_mut()
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
{
let replacement = resolve_css_imports(
cache,
client,
attribute.value.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
attribute.value.clear();
attribute.value.push_slice(&replacement);
}
} }
if opt_no_js { if opt_no_js {
@ -419,14 +458,13 @@ pub fn walk_and_embed_assets(
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client,
&url, &url,
child, child,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent,
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
} }
@ -481,7 +519,7 @@ pub fn stringify_document(
let doc = dom.get_document(); let doc = dom.get_document();
let html = get_child_node_by_name(&doc, "html"); let html = get_child_node_by_name(&doc, "html");
let head = get_child_node_by_name(&html, "head"); let head = get_child_node_by_name(&html, "head");
let mut content_attr = EMPTY_STRING.clone(); let mut content_attr = str!();
if opt_isolate { if opt_isolate {
content_attr += " default-src 'unsafe-inline' data:;"; content_attr += " default-src 'unsafe-inline' data:;";
} }

View File

@ -1,17 +1,15 @@
use reqwest::header::{CONTENT_TYPE, USER_AGENT}; use reqwest::header::CONTENT_TYPE;
use reqwest::Client; use reqwest::Client;
use std::collections::HashMap; use std::collections::HashMap;
use std::time::Duration;
use utils::{data_to_dataurl, is_data_url}; use utils::{data_to_dataurl, is_data_url};
pub fn retrieve_asset( pub fn retrieve_asset(
cache: &mut HashMap<String, String>, cache: &mut HashMap<String, String>,
client: &Client,
url: &str, url: &str,
as_dataurl: bool, as_dataurl: bool,
mime: &str, mime: &str,
opt_user_agent: &str,
opt_silent: bool, opt_silent: bool,
opt_insecure: bool,
) -> Result<(String, String), reqwest::Error> { ) -> Result<(String, String), reqwest::Error> {
if is_data_url(&url).unwrap() { if is_data_url(&url).unwrap() {
Ok((url.to_string(), url.to_string())) Ok((url.to_string(), url.to_string()))
@ -19,23 +17,19 @@ pub fn retrieve_asset(
if cache.contains_key(&url.to_string()) { if cache.contains_key(&url.to_string()) {
// url is in cache // url is in cache
if !opt_silent { if !opt_silent {
eprintln!("[ {} ] (from cache)", &url); eprintln!("{} (from cache)", &url);
} }
let data = cache.get(&url.to_string()).unwrap(); let data = cache.get(&url.to_string()).unwrap();
Ok((data.to_string(), url.to_string())) Ok((data.to_string(), url.to_string()))
} else { } else {
// url not in cache, we request it // url not in cache, we request it
let client = Client::builder() let mut response = client.get(url).send()?;
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(opt_insecure)
.build()?;
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
if !opt_silent { if !opt_silent {
if url == response.url().as_str() { if url == response.url().as_str() {
eprintln!("[ {} ]", &url); eprintln!("{}", &url);
} else { } else {
eprintln!("[ {} -> {} ]", &url, &response.url().as_str()); eprintln!("{} -> {}", &url, &response.url().as_str());
} }
} }

View File

@ -1,10 +1,13 @@
extern crate html5ever;
#[macro_use] #[macro_use]
extern crate lazy_static; extern crate lazy_static;
extern crate html5ever;
extern crate regex; extern crate regex;
extern crate reqwest; extern crate reqwest;
extern crate url; extern crate url;
#[macro_use]
mod macros;
pub mod html; pub mod html;
pub mod http; pub mod http;
pub mod js; pub mod js;

9
src/macros.rs Normal file
View File

@ -0,0 +1,9 @@
/// Shorthand for building an owned `String`.
///
/// - `str!()` expands to an empty `String`.
/// - `str!(value)` expands to `ToString::to_string(&value)`, so it accepts
///   anything that implements `ToString`.
///
/// The expansion uses absolute `::std::string::` paths: paths inside a
/// `macro_rules!` body are resolved at the call site, so the unqualified names
/// would pick up any `String` or `ToString` the caller happens to have in scope.
#[macro_export]
macro_rules! str {
    () => {
        ::std::string::String::new()
    };
    ($val: expr) => {
        ::std::string::ToString::to_string(&$val)
    };
}

View File

@ -1,6 +1,7 @@
#[macro_use] #[macro_use]
extern crate clap; extern crate clap;
extern crate monolith; extern crate monolith;
extern crate reqwest;
mod args; mod args;
@ -8,34 +9,50 @@ use args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset; use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url; use monolith::utils::is_valid_url;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap; use std::collections::HashMap;
use std::time::Duration;
fn main() { fn main() {
let app_args = AppArgs::get(); let app_args = AppArgs::get();
let cache = &mut HashMap::new(); let cache = &mut HashMap::new();
if is_valid_url(app_args.url_target.as_str()) { if is_valid_url(app_args.url_target.as_str()) {
// Initialize client
let mut header_map = HeaderMap::new();
match HeaderValue::from_str(&app_args.user_agent) {
Ok(header) => header_map.insert(USER_AGENT, header),
Err(err) => {
eprintln!("Invalid user agent! {}", err);
return;
}
};
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
let (data, final_url) = retrieve_asset( let (data, final_url) = retrieve_asset(
cache, cache,
&client,
app_args.url_target.as_str(), app_args.url_target.as_str(),
false, false,
"", "",
app_args.user_agent.as_str(),
app_args.silent, app_args.silent,
app_args.insecure,
) )
.unwrap(); .unwrap();
let dom = html_to_dom(&data); let dom = html_to_dom(&data);
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&final_url, &final_url,
&dom.document, &dom.document,
app_args.no_css, app_args.no_css,
app_args.no_js, app_args.no_js,
app_args.no_images, app_args.no_images,
app_args.user_agent.as_str(),
app_args.silent, app_args.silent,
app_args.insecure,
app_args.no_frames, app_args.no_frames,
); );

View File

@ -70,18 +70,18 @@ fn test_walk_and_embed_assets() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -106,18 +106,18 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -144,18 +144,17 @@ fn test_walk_and_embed_assets_no_css() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false; let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -189,18 +188,18 @@ fn test_walk_and_embed_assets_no_images() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = true; let opt_no_images: bool = true;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -236,18 +235,17 @@ fn test_walk_and_embed_assets_no_frames() {
let opt_no_js: bool = false; let opt_no_js: bool = false;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false; let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );
@ -275,18 +273,18 @@ fn test_walk_and_embed_assets_no_js() {
let opt_no_js: bool = true; let opt_no_js: bool = true;
let opt_no_images: bool = false; let opt_no_images: bool = false;
let opt_silent = true; let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
&client,
&url, &url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
"",
opt_silent, opt_silent,
opt_insecure,
opt_no_frames, opt_no_frames,
); );

View File

@ -3,26 +3,18 @@ use std::collections::HashMap;
#[test] #[test]
fn test_retrieve_asset() { fn test_retrieve_asset() {
let cache = &mut HashMap::new(); let cache = &mut HashMap::new();
let (data, final_url) = retrieve_asset( let client = reqwest::Client::new();
cache, let (data, final_url) =
"data:text/html;base64,...", retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap();
true,
"",
"",
true,
false,
)
.unwrap();
assert_eq!(&data, "data:text/html;base64,..."); assert_eq!(&data, "data:text/html;base64,...");
assert_eq!(&final_url, "data:text/html;base64,..."); assert_eq!(&final_url, "data:text/html;base64,...");
let (data, final_url) = retrieve_asset( let (data, final_url) = retrieve_asset(
cache, cache,
&client,
"data:text/html;base64,...", "data:text/html;base64,...",
true, true,
"image/png", "image/png",
"",
true,
false, false,
) )
.unwrap(); .unwrap();

View File

@ -1,12 +1,47 @@
extern crate base64; extern crate base64;
use self::base64::encode; use self::base64::encode;
use http::retrieve_asset;
use regex::Regex; use regex::Regex;
use reqwest::Client;
use std::collections::HashMap;
use url::{ParseError, Url}; use url::{ParseError, Url};
/// This monster of a regex is used to match any kind of URL found in CSS.
///
/// There are roughly three different categories that a found URL could fit
/// into:
/// - Font [found after a `src:` property, as in an @font-face rule]
/// - Stylesheet [denoted by an @import before the url]
/// - Image [covers all other uses of the url() function]
///
/// This regex aims to extract the following information:
/// - What type of URL is it (font/image/css)
/// - Where is the part that needs to be replaced (incl any wrapping quotes)
/// - What is the URL (excl any wrapping quotes)
///
/// Essentially, the regex can be broken down into two parts:
///
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
/// This matches the precursor to a font or CSS URL, and fills in a match under
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
/// A URL is treated as an image when neither of these groups matched. Either
/// zero or one of these can match.
///
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
/// This matches the actual URL part of the url(), and must always match. It also
/// sets `<to_repl>` and `<url>` which correspond to everything within
/// `url(...)` and a usable URL, respectively.
///
/// Note, however, that this does not perform any validation of the found URL.
/// Malformed CSS could lead to an invalid URL being present. It is therefore
/// recommended that the URL gets manually validated.
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
lazy_static! { lazy_static! {
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
} }
const MAGIC: [[&[u8]; 2]; 19] = [ const MAGIC: [[&[u8]; 2]; 19] = [
@ -75,3 +110,89 @@ pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<Strin
}; };
Ok(result) Ok(result)
} }
/// Rewrites every `url(...)` reference found in a piece of CSS.
///
/// Each match of `REGEX_CSS_URL` is resolved against `href` and replaced
/// (including any wrapping quotes) by a double-quoted string holding:
/// - for `@import` stylesheets: the imported CSS, itself processed recursively
///   and encoded as a data URL;
/// - for fonts, and for images unless `opt_no_images` is set: the asset as
///   returned by `retrieve_asset` in data-URL mode;
/// - otherwise, or when retrieval fails: the absolute URL.
///
/// References whose URL cannot be resolved against `href` are left untouched.
///
/// # Arguments
/// - `cache`, `client`, `opt_silent`: forwarded to `retrieve_asset`.
/// - `css_string`: the CSS text to process.
/// - `as_dataurl`: when `true`, the processed CSS is returned wrapped in a
///   `text/css` data URL instead of as plain text.
/// - `href`: base URL used to resolve the references found in `css_string`.
/// - `opt_no_images`: when `true`, images are not downloaded.
///
/// # Returns
/// The processed CSS, as plain text or as a data URL depending on `as_dataurl`.
pub fn resolve_css_imports(
    cache: &mut HashMap<String, String>,
    client: &Client,
    css_string: &str,
    as_dataurl: bool,
    href: &str,
    opt_no_images: bool,
    opt_silent: bool,
) -> String {
    // The output is assembled by appending: untouched input up to each match,
    // then the replacement. This avoids tracking a running length difference
    // between input and output, which underflows `usize` as soon as the
    // replacements are shorter than the text they replace.
    let mut resolved_css = String::with_capacity(css_string.len());
    // Byte offset in `css_string` up to which the input has already been emitted
    let mut copied_up_to = 0;

    for link in REGEX_CSS_URL.captures_iter(css_string) {
        let target_link = link
            .name("url")
            .expect("`url` group takes part in every match")
            .as_str();

        // Determine the type of link
        let is_stylesheet = link.name("stylesheet").is_some();
        let is_font = link.name("font").is_some();
        let is_image = !is_stylesheet && !is_font;

        // Generate absolute URL for content
        let embedded_url = match resolve_url(href, target_link) {
            Ok(url) => url,
            // Malformed URL: skip it; its text is emitted unchanged together
            // with the next untouched segment
            Err(_) => continue,
        };

        // Download the asset. If it's more CSS, resolve that too
        let retrieved = if is_stylesheet {
            // The link is an @import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                false,      // Formatting as data URL will be done later
                "text/css", // Expect CSS
                opt_silent,
            )
            .map(|(content, _)| {
                resolve_css_imports(
                    cache,
                    client,
                    &content,
                    true, // Finally, convert to a dataurl
                    &embedded_url,
                    opt_no_images,
                    opt_silent,
                )
            })
        } else if (is_image && !opt_no_images) || is_font {
            // The link is some other, non-@import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                true, // Format as data URL
                "",   // Unknown MIME type
                opt_silent,
            )
            .map(|(data, _)| data)
        } else {
            // If it's a datatype that has been opt_no'd out of, replace with
            // absolute URL
            Ok(embedded_url.clone())
        };

        let content = retrieved.unwrap_or_else(|e| {
            eprintln!("Warning: {}", e);
            // If failed to resolve, replace with absolute URL
            embedded_url
        });

        let dest = link
            .name("to_repl")
            .expect("`to_repl` group takes part in every match");

        // Emit the untouched text before this reference, then the quoted
        // replacement
        resolved_css.push_str(&css_string[copied_up_to..dest.start()]);
        resolved_css.push('"');
        resolved_css.push_str(&content);
        resolved_css.push('"');
        copied_up_to = dest.end();
    }

    // Emit whatever follows the last replaced reference
    resolved_css.push_str(&css_string[copied_up_to..]);

    if as_dataurl {
        data_to_dataurl("text/css", resolved_css.as_bytes())
    } else {
        resolved_css
    }
}