Merged Y2Z/master with Alch-Emi/lazyload
This commit is contained in:
commit
dab4ae6965
12 changed files with 295 additions and 114 deletions
11
.travis.yml
11
.travis.yml
|
@ -16,6 +16,11 @@ before_script:
|
|||
- rustup component add rustfmt
|
||||
|
||||
script:
|
||||
- cargo build --locked --verbose
|
||||
- cargo test --locked --verbose
|
||||
- cargo fmt --all -- --check
|
||||
- cargo build --all --locked --verbose
|
||||
- cargo test --all --locked --verbose
|
||||
- cargo fmt --all -- --check
|
||||
|
||||
jobs:
|
||||
allow_failures:
|
||||
- rust: nightly
|
||||
fast_finish: true
|
||||
|
|
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -625,7 +625,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "monolith"
|
||||
version = "2.0.23"
|
||||
version = "2.1.0"
|
||||
dependencies = [
|
||||
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
[package]
|
||||
name = "monolith"
|
||||
version = "2.0.23"
|
||||
version = "2.1.0"
|
||||
authors = [
|
||||
"Sunshine <sunshine@uberspace.net>",
|
||||
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
||||
"Emmanuel Delaborde <th3rac25@gmail.com>",
|
||||
"Emi Simpson <emi@alchemi.dev>",
|
||||
]
|
||||
description = "CLI tool for saving web pages as a single HTML file"
|
||||
|
||||
|
|
15
README.md
15
README.md
|
@ -18,17 +18,20 @@ Unlike the conventional "Save page as", `monolith` not only saves the target doc
|
|||
|
||||
Compared to saving websites with `wget -mpk`, this tool embeds all assets as data URLs and therefore lets browsers render the saved page exactly the way it was on the Internet, even when no network connection is available.
|
||||
|
||||
<!-- This program works both on remote and local targets. -->
|
||||
## Installation
|
||||
|
||||
### Installation
|
||||
### From source
|
||||
$ git clone https://github.com/Y2Z/monolith.git
|
||||
$ cd monolith
|
||||
$ cargo install --path .
|
||||
|
||||
### Usage
|
||||
### On macOS (via Homebrew)
|
||||
$ brew install monolith
|
||||
|
||||
## Usage
|
||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
|
||||
|
||||
### Options
|
||||
## Options
|
||||
- `-c`: Ignore styles
|
||||
- `-f`: Exclude iframes
|
||||
- `-i`: Remove images
|
||||
|
@ -38,11 +41,11 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
|
|||
- `-s`: Silent mode
|
||||
- `-u`: Specify custom User-Agent
|
||||
|
||||
### Related Projects
|
||||
## Related projects
|
||||
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
|
||||
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
|
||||
|
||||
### License
|
||||
## License
|
||||
The Unlicense
|
||||
|
||||
<!-- Microtext -->
|
||||
|
|
150
src/html.rs
150
src/html.rs
|
@ -7,13 +7,10 @@ use html5ever::tree_builder::{Attribute, TreeSink};
|
|||
use html5ever::{local_name, namespace_url, ns};
|
||||
use http::retrieve_asset;
|
||||
use js::attr_is_event_handler;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use std::default::Default;
|
||||
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
|
||||
|
||||
lazy_static! {
|
||||
static ref EMPTY_STRING: String = String::new();
|
||||
}
|
||||
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
|
||||
|
||||
const ICON_VALUES: [&str; 5] = [
|
||||
"icon",
|
||||
|
@ -35,7 +32,7 @@ pub fn get_parent_node(node: &Handle) -> Handle {
|
|||
pub fn get_node_name(node: &Handle) -> String {
|
||||
match &node.data {
|
||||
NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
|
||||
_ => EMPTY_STRING.clone(),
|
||||
_ => str!(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -45,14 +42,13 @@ pub fn is_icon(attr_value: &str) -> bool {
|
|||
|
||||
pub fn walk_and_embed_assets(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
url: &str,
|
||||
node: &Handle,
|
||||
opt_no_css: bool,
|
||||
opt_no_js: bool,
|
||||
opt_no_images: bool,
|
||||
opt_user_agent: &str,
|
||||
opt_silent: bool,
|
||||
opt_insecure: bool,
|
||||
opt_no_frames: bool,
|
||||
) {
|
||||
match node.data {
|
||||
|
@ -61,14 +57,13 @@ pub fn walk_and_embed_assets(
|
|||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
client,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
}
|
||||
|
@ -104,17 +99,16 @@ pub fn walk_and_embed_assets(
|
|||
} else {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or(str!());
|
||||
let (favicon_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(favicon_dataurl.as_str());
|
||||
}
|
||||
|
@ -128,19 +122,37 @@ pub fn walk_and_embed_assets(
|
|||
} else {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let (css_dataurl, _) = retrieve_asset(
|
||||
.unwrap_or(str!());
|
||||
let replacement_text = match retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
true,
|
||||
false,
|
||||
"text/css",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
) {
|
||||
// On successful retrieval, traverse CSS
|
||||
Ok((css_data, _)) => resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
&css_data,
|
||||
true,
|
||||
&href_full_url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
),
|
||||
|
||||
// If a network error occurred, warn
|
||||
Err(e) => {
|
||||
eprintln!("Warning: {}", e,);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
href_full_url
|
||||
}
|
||||
};
|
||||
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(css_dataurl.as_str());
|
||||
attr.value.push_slice(&replacement_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -148,8 +160,7 @@ pub fn walk_and_embed_assets(
|
|||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&href_full_url.as_str());
|
||||
}
|
||||
|
@ -184,16 +195,15 @@ pub fn walk_and_embed_assets(
|
|||
.map(|attr| &attr.value)
|
||||
.filter(|src| !src.is_empty()) // Ignore empty srcs
|
||||
.next()
|
||||
.and_then(|src| resolve_url(&url, src).ok()) //Make absolute
|
||||
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
|
||||
.and_then(|abs_src| // Download and convert to dataurl
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&abs_src,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
).ok())
|
||||
{
|
||||
// Add the new dataurl src attribute
|
||||
|
@ -220,17 +230,16 @@ pub fn walk_and_embed_assets(
|
|||
} else {
|
||||
let srcset_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or(str!());
|
||||
let (source_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&srcset_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(source_dataurl.as_str());
|
||||
}
|
||||
|
@ -246,8 +255,8 @@ pub fn walk_and_embed_assets(
|
|||
continue;
|
||||
}
|
||||
|
||||
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
|
@ -266,18 +275,16 @@ pub fn walk_and_embed_assets(
|
|||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
let (js_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
true,
|
||||
"application/javascript",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(js_dataurl.as_str());
|
||||
}
|
||||
|
@ -288,6 +295,23 @@ pub fn walk_and_embed_assets(
|
|||
if opt_no_css {
|
||||
// Empty inner content of STYLE tags
|
||||
node.children.borrow_mut().clear();
|
||||
} else {
|
||||
for node in node.children.borrow_mut().iter_mut() {
|
||||
if let NodeData::Text { ref contents } = node.data {
|
||||
let mut tendril = contents.borrow_mut();
|
||||
let replacement = resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
tendril.as_ref(),
|
||||
false,
|
||||
&url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
);
|
||||
tendril.clear();
|
||||
tendril.push_slice(&replacement);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"form" => {
|
||||
|
@ -296,8 +320,7 @@ pub fn walk_and_embed_assets(
|
|||
// Modify action to be a full URL
|
||||
if !is_valid_url(&attr.value) {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
|
@ -316,33 +339,31 @@ pub fn walk_and_embed_assets(
|
|||
let iframe_src: String = attr.value.to_string();
|
||||
|
||||
// Ignore iframes with empty source (they cause infinite loops)
|
||||
if iframe_src == EMPTY_STRING.clone() {
|
||||
if iframe_src == str!() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
|
||||
resolve_url(&url, &iframe_src).unwrap_or(str!());
|
||||
let (iframe_data, iframe_final_url) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
false,
|
||||
"text/html",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or((EMPTY_STRING.clone(), src_full_url));
|
||||
.unwrap_or((str!(), src_full_url));
|
||||
let dom = html_to_dom(&iframe_data);
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
client,
|
||||
&iframe_final_url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
|
@ -359,25 +380,24 @@ pub fn walk_and_embed_assets(
|
|||
let video_poster = attr.value.to_string();
|
||||
|
||||
// Skip posters with empty source
|
||||
if video_poster == EMPTY_STRING.clone() {
|
||||
if video_poster == str!() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let poster_full_url: String = resolve_url(&url, &video_poster)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let poster_full_url: String =
|
||||
resolve_url(&url, &video_poster).unwrap_or(str!());
|
||||
let (poster_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&poster_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or((poster_full_url, EMPTY_STRING.clone()));
|
||||
.unwrap_or((poster_full_url, str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(poster_dataurl.as_str());
|
||||
}
|
||||
|
@ -387,6 +407,7 @@ pub fn walk_and_embed_assets(
|
|||
_ => {}
|
||||
}
|
||||
|
||||
// Process style attributes
|
||||
if opt_no_css {
|
||||
// Get rid of style attributes
|
||||
let mut style_attr_indexes = Vec::new();
|
||||
|
@ -399,6 +420,24 @@ pub fn walk_and_embed_assets(
|
|||
for attr_index in style_attr_indexes {
|
||||
attrs_mut.remove(attr_index);
|
||||
}
|
||||
} else {
|
||||
// Otherwise, parse any links found in the attributes
|
||||
for attribute in attrs_mut
|
||||
.iter_mut()
|
||||
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
|
||||
{
|
||||
let replacement = resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
attribute.value.as_ref(),
|
||||
false,
|
||||
&url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
);
|
||||
attribute.value.clear();
|
||||
attribute.value.push_slice(&replacement);
|
||||
}
|
||||
}
|
||||
|
||||
if opt_no_js {
|
||||
|
@ -419,14 +458,13 @@ pub fn walk_and_embed_assets(
|
|||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
client,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
}
|
||||
|
@ -481,7 +519,7 @@ pub fn stringify_document(
|
|||
let doc = dom.get_document();
|
||||
let html = get_child_node_by_name(&doc, "html");
|
||||
let head = get_child_node_by_name(&html, "head");
|
||||
let mut content_attr = EMPTY_STRING.clone();
|
||||
let mut content_attr = str!();
|
||||
if opt_isolate {
|
||||
content_attr += " default-src 'unsafe-inline' data:;";
|
||||
}
|
||||
|
|
18
src/http.rs
18
src/http.rs
|
@ -1,17 +1,15 @@
|
|||
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use utils::{data_to_dataurl, is_data_url};
|
||||
|
||||
pub fn retrieve_asset(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
url: &str,
|
||||
as_dataurl: bool,
|
||||
mime: &str,
|
||||
opt_user_agent: &str,
|
||||
opt_silent: bool,
|
||||
opt_insecure: bool,
|
||||
) -> Result<(String, String), reqwest::Error> {
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
|
@ -19,23 +17,19 @@ pub fn retrieve_asset(
|
|||
if cache.contains_key(&url.to_string()) {
|
||||
// url is in cache
|
||||
if !opt_silent {
|
||||
eprintln!("[ {} ] (from cache)", &url);
|
||||
eprintln!("{} (from cache)", &url);
|
||||
}
|
||||
let data = cache.get(&url.to_string()).unwrap();
|
||||
Ok((data.to_string(), url.to_string()))
|
||||
} else {
|
||||
// url not in cache, we request it
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(opt_insecure)
|
||||
.build()?;
|
||||
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
|
||||
let mut response = client.get(url).send()?;
|
||||
|
||||
if !opt_silent {
|
||||
if url == response.url().as_str() {
|
||||
eprintln!("[ {} ]", &url);
|
||||
eprintln!("{}", &url);
|
||||
} else {
|
||||
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
|
||||
eprintln!("{} -> {}", &url, &response.url().as_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
extern crate html5ever;
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
extern crate html5ever;
|
||||
extern crate regex;
|
||||
extern crate reqwest;
|
||||
extern crate url;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
pub mod html;
|
||||
pub mod http;
|
||||
pub mod js;
|
||||
|
|
9
src/macros.rs
Normal file
9
src/macros.rs
Normal file
|
@ -0,0 +1,9 @@
|
|||
#[macro_export]
|
||||
macro_rules! str {
|
||||
() => {
|
||||
String::new()
|
||||
};
|
||||
($val: expr) => {
|
||||
ToString::to_string(&$val)
|
||||
};
|
||||
}
|
25
src/main.rs
25
src/main.rs
|
@ -1,6 +1,7 @@
|
|||
#[macro_use]
|
||||
extern crate clap;
|
||||
extern crate monolith;
|
||||
extern crate reqwest;
|
||||
|
||||
mod args;
|
||||
|
||||
|
@ -8,34 +9,50 @@ use args::AppArgs;
|
|||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||
use monolith::http::retrieve_asset;
|
||||
use monolith::utils::is_valid_url;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
|
||||
fn main() {
|
||||
let app_args = AppArgs::get();
|
||||
let cache = &mut HashMap::new();
|
||||
if is_valid_url(app_args.url_target.as_str()) {
|
||||
// Initialize client
|
||||
let mut header_map = HeaderMap::new();
|
||||
match HeaderValue::from_str(&app_args.user_agent) {
|
||||
Ok(header) => header_map.insert(USER_AGENT, header),
|
||||
Err(err) => {
|
||||
eprintln!("Invalid user agent! {}", err);
|
||||
return;
|
||||
}
|
||||
};
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(app_args.insecure)
|
||||
.default_headers(header_map)
|
||||
.build()
|
||||
.expect("Failed to initialize HTTP client");
|
||||
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
&client,
|
||||
app_args.url_target.as_str(),
|
||||
false,
|
||||
"",
|
||||
app_args.user_agent.as_str(),
|
||||
app_args.silent,
|
||||
app_args.insecure,
|
||||
)
|
||||
.unwrap();
|
||||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&final_url,
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.user_agent.as_str(),
|
||||
app_args.silent,
|
||||
app_args.insecure,
|
||||
app_args.no_frames,
|
||||
);
|
||||
|
||||
|
|
|
@ -70,18 +70,18 @@ fn test_walk_and_embed_assets() {
|
|||
let opt_no_js: bool = false;
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
let opt_insecure = false;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
"",
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
|
||||
|
@ -106,18 +106,18 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
|
|||
let opt_no_js: bool = false;
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
let opt_insecure = false;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
"",
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
|
||||
|
@ -144,18 +144,17 @@ fn test_walk_and_embed_assets_no_css() {
|
|||
let opt_no_js: bool = false;
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
let opt_insecure = false;
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
"",
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
|
||||
|
@ -189,18 +188,18 @@ fn test_walk_and_embed_assets_no_images() {
|
|||
let opt_no_js: bool = false;
|
||||
let opt_no_images: bool = true;
|
||||
let opt_silent = true;
|
||||
let opt_insecure = false;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
"",
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
|
||||
|
@ -236,18 +235,17 @@ fn test_walk_and_embed_assets_no_frames() {
|
|||
let opt_no_js: bool = false;
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
let opt_insecure = false;
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
"",
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
|
||||
|
@ -275,18 +273,18 @@ fn test_walk_and_embed_assets_no_js() {
|
|||
let opt_no_js: bool = true;
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
let opt_insecure = false;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
"",
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
opt_no_frames,
|
||||
);
|
||||
|
||||
|
|
|
@ -3,26 +3,18 @@ use std::collections::HashMap;
|
|||
#[test]
|
||||
fn test_retrieve_asset() {
|
||||
let cache = &mut HashMap::new();
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
"data:text/html;base64,...",
|
||||
true,
|
||||
"",
|
||||
"",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
let client = reqwest::Client::new();
|
||||
let (data, final_url) =
|
||||
retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap();
|
||||
assert_eq!(&data, "data:text/html;base64,...");
|
||||
assert_eq!(&final_url, "data:text/html;base64,...");
|
||||
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
&client,
|
||||
"data:text/html;base64,...",
|
||||
true,
|
||||
"image/png",
|
||||
"",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
|
|
121
src/utils.rs
121
src/utils.rs
|
@ -1,12 +1,47 @@
|
|||
extern crate base64;
|
||||
|
||||
use self::base64::encode;
|
||||
use http::retrieve_asset;
|
||||
use regex::Regex;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use url::{ParseError, Url};
|
||||
|
||||
/// This monster of a regex is used to match any kind of URL found in CSS.
|
||||
///
|
||||
/// There are roughly three different categories that a found URL could fit
|
||||
/// into:
|
||||
/// - Font [found after a src: property in an @font-face rule]
|
||||
/// - Stylesheet [denoted by an @import before the url]
|
||||
/// - Image [covers all other uses of the url() function]
|
||||
///
|
||||
/// This regex aims to extract the following information:
|
||||
/// - What type of URL is it (font/image/css)
|
||||
/// - Where is the part that needs to be replaced (incl any wrapping quotes)
|
||||
/// - What is the URL (excl any wrapping quotes)
|
||||
///
|
||||
/// Essentially, the regex can be broken down into two parts:
|
||||
///
|
||||
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
|
||||
/// This matches the precursor to a font or CSS URL, and fills in a match under
|
||||
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
|
||||
/// Determining whether or not it's an image can be done by the negation of both
|
||||
/// of these. Either zero or one of these can match.
|
||||
///
|
||||
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
|
||||
/// This matches the actual URL part of the url(), and must always match. It also
|
||||
/// sets `<to_repl>` and `<url>` which correspond to everything within
|
||||
/// `url(...)` and a usable URL, respectively.
|
||||
///
|
||||
/// Note, however, that this does not perform any validation of the found URL.
|
||||
/// Malformed CSS could lead to an invalid URL being present. It is therefore
|
||||
/// recommended that the URL gets manually validated.
|
||||
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
|
||||
|
||||
lazy_static! {
|
||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
|
||||
}
|
||||
|
||||
const MAGIC: [[&[u8]; 2]; 19] = [
|
||||
|
@ -75,3 +110,89 @@ pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<Strin
|
|||
};
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn resolve_css_imports(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
css_string: &str,
|
||||
as_dataurl: bool,
|
||||
href: &str,
|
||||
opt_no_images: bool,
|
||||
opt_silent: bool,
|
||||
) -> String {
|
||||
let mut resolved_css = String::from(css_string);
|
||||
|
||||
for link in REGEX_CSS_URL.captures_iter(&css_string) {
|
||||
let target_link = link.name("url").unwrap().as_str();
|
||||
|
||||
// Determine the type of link
|
||||
let is_stylesheet = link.name("stylesheet").is_some();
|
||||
let is_font = link.name("font").is_some();
|
||||
let is_image = !is_stylesheet && !is_font;
|
||||
|
||||
// Generate absolute URL for content
|
||||
let embedded_url = match resolve_url(href, target_link) {
|
||||
Ok(url) => url,
|
||||
Err(_) => continue, // Malformed URL
|
||||
};
|
||||
|
||||
// Download the asset. If it's more CSS, resolve that too
|
||||
let content = if is_stylesheet {
|
||||
// The link is an @import link
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&embedded_url,
|
||||
false, // Formatting as data URL will be done later
|
||||
"text/css", // Expect CSS
|
||||
opt_silent,
|
||||
)
|
||||
.map(|(content, _)| {
|
||||
resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
&content,
|
||||
true, // Finally, convert to a dataurl
|
||||
&embedded_url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
)
|
||||
})
|
||||
} else if (is_image && !opt_no_images) || is_font {
|
||||
// The link is some other, non-@import link
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&embedded_url,
|
||||
true, // Format as data URL
|
||||
"", // Unknown MIME type
|
||||
opt_silent,
|
||||
)
|
||||
.map(|(a, _)| a)
|
||||
} else {
|
||||
// If it's a datatype that has been opt_no'd out of, replace with
|
||||
// absolute URL
|
||||
|
||||
Ok(embedded_url.clone())
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("Warning: {}", e);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
embedded_url
|
||||
});
|
||||
|
||||
let replacement = format!("\"{}\"", &content);
|
||||
let dest = link.name("to_repl").unwrap();
|
||||
let offset = resolved_css.len() - css_string.len();
|
||||
let target_range = (dest.start() + offset)..(dest.end() + offset);
|
||||
|
||||
resolved_css.replace_range(target_range, &replacement);
|
||||
}
|
||||
|
||||
if as_dataurl {
|
||||
data_to_dataurl("text/css", resolved_css.as_bytes())
|
||||
} else {
|
||||
resolved_css
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue