Merge pull request #135 from snshn/local-file-support

Add support for working with local assets
This commit is contained in:
Sunshine 2020-03-22 17:18:43 -04:00 committed by GitHub
commit 061386ccc2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 694 additions and 233 deletions

3
.gitignore vendored
View file

@ -4,6 +4,3 @@
# These are backup files generated by rustfmt
**/*.rs.bk
# Exclude accidental HTML files
*.html

View file

@ -11,8 +11,8 @@ rust:
- beta
- nightly
services:
- docker
git:
autocrlf: false # don't mangle LF into CRLF on windows
before_script:
- rustup component add rustfmt

View file

@ -1,7 +1,7 @@
use crate::http::retrieve_asset;
use crate::js::attr_is_event_handler;
use crate::utils::{
data_to_data_url, is_http_url, resolve_css_imports, resolve_url, url_has_protocol,
data_to_data_url, is_http_url, resolve_css_imports, resolve_url, retrieve_asset,
url_has_protocol,
};
use html5ever::interface::QualName;
use html5ever::parse_document;
@ -133,6 +133,7 @@ pub fn walk_and_embed_assets(
let (favicon_data_url, _) = retrieve_asset(
cache,
client,
&url,
&href_full_url,
true,
"",
@ -156,6 +157,7 @@ pub fn walk_and_embed_assets(
let replacement_text = match retrieve_asset(
cache,
client,
&url,
&href_full_url,
false,
"text/css",
@ -167,6 +169,7 @@ pub fn walk_and_embed_assets(
client,
&css_data,
true,
&url,
&href_full_url,
opt_no_images,
opt_silent,
@ -231,6 +234,7 @@ pub fn walk_and_embed_assets(
retrieve_asset(
cache,
client,
&url,
&abs_src,
true,
"",
@ -278,6 +282,7 @@ pub fn walk_and_embed_assets(
retrieve_asset(
cache,
client,
&url,
&abs_src,
true,
"",
@ -311,6 +316,7 @@ pub fn walk_and_embed_assets(
let (source_data_url, _) = retrieve_asset(
cache,
client,
&url,
&srcset_full_url,
true,
"",
@ -375,6 +381,7 @@ pub fn walk_and_embed_assets(
let (js_data_url, _) = retrieve_asset(
cache,
client,
&url,
&src_full_url,
true,
"application/javascript",
@ -401,6 +408,7 @@ pub fn walk_and_embed_assets(
tendril.as_ref(),
false,
&url,
&url,
opt_no_images,
opt_silent,
);
@ -444,6 +452,7 @@ pub fn walk_and_embed_assets(
let (frame_data, frame_final_url) = retrieve_asset(
cache,
client,
&url,
&src_full_url,
false,
"text/html",
@ -488,6 +497,7 @@ pub fn walk_and_embed_assets(
let (poster_data_url, _) = retrieve_asset(
cache,
client,
&url,
&poster_full_url,
true,
"",
@ -528,6 +538,7 @@ pub fn walk_and_embed_assets(
attribute.value.as_ref(),
false,
&url,
&url,
opt_no_images,
opt_silent,
);

View file

@ -1,68 +0,0 @@
use crate::utils::{clean_url, data_to_data_url, is_data_url};
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use std::collections::HashMap;
/// Retrieves the asset at `url`, using `cache` to avoid repeated requests.
///
/// Returns `(data, final_url)`:
/// * for a data URL, both elements are `url` itself and nothing is requested;
/// * on a cache hit, `data` is the cached value and `final_url` is `url`;
/// * otherwise the asset is requested through `client`, and `final_url` is
///   the URL reported by the response (it differs from `url` after a
///   redirect).
///
/// When `as_data_url` is true the response body is encoded as a data URL.
/// Its media type is `mime`, or the response's `Content-Type` header when
/// `mime` is empty. When `as_data_url` is false the body is returned as text.
///
/// Unless `opt_silent` is set, each retrieved URL is logged to stderr.
///
/// # Errors
///
/// Returns the underlying `reqwest::Error` if sending the request or reading
/// the response body fails.
pub fn retrieve_asset(
    cache: &mut HashMap<String, String>,
    client: &Client,
    url: &str,
    as_data_url: bool,
    mime: &str,
    opt_silent: bool,
) -> Result<(String, String), reqwest::Error> {
    let cache_key = clean_url(&url);

    // Data URLs already carry their payload; hand them back untouched.
    if is_data_url(&url) {
        return Ok((url.to_string(), url.to_string()));
    }

    // Single lookup instead of `contains_key` followed by `get().unwrap()`.
    if let Some(data) = cache.get(&cache_key) {
        if !opt_silent {
            eprintln!("{} (from cache)", &url);
        }
        return Ok((data.to_string(), url.to_string()));
    }

    // Not cached: request it.
    let mut response = client.get(url).send()?;
    let res_url = response.url().to_string();

    if !opt_silent {
        if url == res_url {
            eprintln!("{}", &url);
        } else {
            eprintln!("{} -> {}", &url, &res_url);
        }
    }

    // Entries are stored under the final (post-redirect) URL, whereas the
    // lookup above uses the requested URL.
    let new_cache_key = clean_url(&res_url);

    let payload = if as_data_url {
        // Convert the response into a byte array
        let mut data: Vec<u8> = vec![];
        response.copy_to(&mut data)?;

        // Fall back to the Content-Type header only when the caller gave no hint
        let mimetype = if mime.is_empty() {
            response
                .headers()
                .get(CONTENT_TYPE)
                .and_then(|header| header.to_str().ok())
                .unwrap_or(mime)
        } else {
            mime
        };

        data_to_data_url(mimetype, &data)
    } else {
        // Propagate body read/decode failures instead of panicking.
        response.text()?
    };

    cache.insert(new_cache_key, payload.clone());
    Ok((payload, res_url))
}

View file

@ -1,7 +1,7 @@
const JS_DOM_EVENT_ATTRS: &[&str] = &[
// From WHATWG HTML spec 8.1.5.2 'Event handlers on elements, Document objects, and Window objects':
// From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects":
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
// https://html.spec.whatwg.org/#attributes-3 (table 'List of event handler content attributes')
// https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes")
// Global event handlers
"onabort",

View file

@ -5,7 +5,6 @@ extern crate lazy_static;
mod macros;
pub mod html;
pub mod http;
pub mod js;
pub mod utils;

View file

@ -6,19 +6,20 @@ mod macros;
use crate::args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset;
use monolith::utils::{data_url_to_text, is_data_url, is_http_url};
use monolith::utils::{data_url_to_text, is_data_url, is_file_url, is_http_url, retrieve_asset};
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
use std::fs::File;
use std::env;
use std::fs;
use std::io::{self, Error, Write};
use std::path::Path;
use std::process;
use std::time::Duration;
enum Output {
Stdout(io::Stdout),
File(File),
File(fs::File),
}
impl Output {
@ -26,7 +27,7 @@ impl Output {
if file_path.is_empty() {
Ok(Output::Stdout(io::stdout()))
} else {
Ok(Output::File(File::create(file_path)?))
Ok(Output::File(fs::File::create(file_path)?))
}
}
@ -46,16 +47,41 @@ impl Output {
fn main() {
let app_args = AppArgs::get();
let target_url: &str = app_args.url_target.as_str();
let mut original_target: String = app_args.url_target.clone();
let target_url: &str;
let base_url;
let dom;
if !is_http_url(target_url) && !is_data_url(target_url) {
eprintln!(
"Only HTTP(S) or data URLs are supported but got: {}",
&target_url
);
// Pre-process the input
let cwd_normalized: String =
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
let path = Path::new(original_target.as_str());
let path_is_relative: bool = path.is_relative();
if original_target.clone().len() == 0 {
eprintln!("No target specified");
process::exit(1);
} else if is_http_url(original_target.clone()) || is_data_url(original_target.clone()) {
target_url = original_target.as_str();
} else if is_file_url(original_target.clone()) {
target_url = original_target.as_str();
} else if path.exists() {
if !path.is_file() {
eprintln!("Local target is not a file: {}", original_target);
process::exit(1);
}
original_target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" });
original_target = original_target.replace("\\", "/");
if path_is_relative {
original_target.insert_str(if cfg!(windows) { 8 } else { 7 }, &cwd_normalized);
original_target.insert_str(
if cfg!(windows) { 8 } else { 7 } + &cwd_normalized.len(),
"/",
);
}
target_url = original_target.as_str();
} else {
original_target.insert_str(0, "http://");
target_url = original_target.as_str();
}
let mut output = Output::new(&app_args.output).expect("Could not prepare output");
@ -81,21 +107,26 @@ fn main() {
.expect("Failed to initialize HTTP client");
// Retrieve root document
if is_http_url(target_url) {
let (data, final_url) =
retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent)
.expect("Could not retrieve assets in HTML");
if is_file_url(target_url) || is_http_url(target_url) {
let (data, final_url) = retrieve_asset(
&mut cache,
&client,
target_url,
target_url,
false,
"",
app_args.silent,
)
.expect("Could not retrieve target document");
base_url = final_url;
dom = html_to_dom(&data);
} else if is_data_url(target_url) {
let text: String = data_url_to_text(target_url);
if text.len() == 0 {
eprintln!("Unsupported data URL input");
process::exit(1);
}
base_url = str!();
base_url = str!(target_url);
dom = html_to_dom(&text);
} else {
process::exit(1);

View file

@ -1,4 +1,5 @@
use assert_cmd::prelude::*;
use std::env;
use std::process::Command;
#[test]
@ -22,9 +23,9 @@ fn print_version() -> Result<(), Box<dyn std::error::Error>> {
}
#[test]
fn bad_input() -> Result<(), Box<dyn std::error::Error>> {
fn bad_input_empty_target() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
let out = cmd.arg("kernel.org").output().unwrap();
let out = cmd.arg("").output().unwrap();
// STDOUT should be empty
assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), "");
@ -32,7 +33,7 @@ fn bad_input() -> Result<(), Box<dyn std::error::Error>> {
// STDERR should contain error description
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
"Only HTTP(S) or data URLs are supported but got: kernel.org\n"
"No target specified\n"
);
// The exit code should be 1
@ -73,7 +74,9 @@ fn isolate_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain isolated HTML
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta></head><body>Hello, World!</body></html>\n"
"<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta>\
</head><body>Hello, World!</body></html>\n"
);
// STDERR should be empty
@ -97,7 +100,10 @@ fn remove_css_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no CSS
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta><style></style></head><body>Hello</body></html>\n"
"<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta>\
<style></style>\
</head><body>Hello</body></html>\n"
);
// STDERR should be empty
@ -121,7 +127,9 @@ fn remove_frames_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no iframes
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta></head><body><iframe src=\"\"></iframe>Hi</body></html>\n"
"<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta>\
</head><body><iframe src=\"\"></iframe>Hi</body></html>\n"
);
// STDERR should be empty
@ -145,7 +153,15 @@ fn remove_images_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no images
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"img-src data:;\"></meta></head><body><img src=\"\">Hi</body></html>\n"
"<html>\
<head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"img-src data:;\"></meta>\
</head>\
<body>\
<img src=\"\">\
Hi\
</body>\
</html>\n"
);
// STDERR should be empty
@ -169,7 +185,203 @@ fn remove_js_from_data_url() -> Result<(), Box<dyn std::error::Error>> {
// STDOUT should contain HTML with no JS
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><meta http-equiv=\"Content-Security-Policy\" content=\"script-src 'none';\"></meta><script></script></head><body>Hi</body></html>\n"
"<html>\
<head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"script-src 'none';\"></meta>\
<script></script></head>\
<body>Hi</body>\
</html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
out.assert().code(0);
Ok(())
}
/// Runs the binary against a relative filesystem path and checks the result.
///
/// The assertions below expect:
/// * stdout to hold the document with the first stylesheet and the script
///   rewritten to base64 data URLs, the second stylesheet rewritten to a
///   `data:text/css;base64,` URL with an empty payload, and an empty image
///   `src`;
/// * stderr to list three `file://` URLs (document, stylesheet, script), each
///   prefixed with the forward-slash-normalized current working directory;
/// * exit code 0.
///
/// NOTE(review): in this view the whitespace before each line-continuation
/// backslash in the expected-stdout literal appears collapsed to one space —
/// confirm the literal against the fixture's real indentation.
#[test]
fn local_file_target_input() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
// Expected URLs use forward slashes, so backslashes in the CWD are normalized
let cwd_normalized: String =
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
// The target is a relative path written with the platform's own separator
let out = cmd
.arg(if cfg!(windows) {
"src\\tests\\data\\local-file.html"
} else {
"src/tests/data/local-file.html"
})
.output()
.unwrap();
// The expected prefix has a third slash on Windows
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
// STDOUT should contain HTML from the local file
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<!DOCTYPE html><html lang=\"en\"><head>\n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
<title>Local HTML file</title>\n \
<link href=\"data:text/css;base64,Ym9keSB7CiAgICBiYWNrZ3JvdW5kLWNvbG9yOiAjMDAwOwogICAgY29sb3I6ICNmZmY7Cn0K\" rel=\"stylesheet\" type=\"text/css\">\n \
<link href=\"data:text/css;base64,\" rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
<img alt=\"\" src=\"\">\n \
<a href=\"file://local-file.html/\">Tricky href</a>\n \
<a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
<script src=\"data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==\"></script>\n\n\n\n\
</body></html>\n"
);
// STDERR should contain list of retrieved file URLs
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
format!(
"{file}{cwd}/src/tests/data/local-file.html\n\
{file}{cwd}/src/tests/data/local-style.css\n\
{file}{cwd}/src/tests/data/local-script.js\n",
file = file_url_protocol,
cwd = cwd_normalized
)
);
// The exit code should be 0
out.assert().code(0);
Ok(())
}
/// Runs the binary with `-jciI` against an absolute filesystem path.
///
/// The assertions expect stdout to carry a Content-Security-Policy meta tag
/// and empty `href`/`src` attributes for the stylesheets, image and script;
/// stderr to list only the target document's `file://` URL; and exit code 0.
#[test]
fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> {
    // Query the working directory once: the raw form builds the CLI argument,
    // the forward-slash form builds the expected file URL.
    let cwd = env::current_dir().unwrap();
    let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");

    let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
    let out = cmd
        .arg("-jciI")
        .arg(if cfg!(windows) {
            format!(
                "{cwd}\\src\\tests\\data\\local-file.html",
                cwd = cwd.to_str().unwrap()
            )
        } else {
            format!(
                "{cwd}/src/tests/data/local-file.html",
                cwd = cwd.to_str().unwrap()
            )
        })
        .output()
        .unwrap();
    let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };

    // STDOUT should contain HTML from the local file
    assert_eq!(
        std::str::from_utf8(&out.stdout).unwrap(),
        "<!DOCTYPE html><html lang=\"en\"><head>\
        <meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:; style-src 'none'; script-src 'none'; img-src data:;\"></meta>\n \
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
        <title>Local HTML file</title>\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
        <img alt=\"\" src=\"\">\n \
        <a href=\"file://local-file.html/\">Tricky href</a>\n \
        <a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
        <script src=\"\"></script>\n\n\n\n\
        </body></html>\n"
    );

    // STDERR should contain only the target file
    assert_eq!(
        std::str::from_utf8(&out.stderr).unwrap(),
        format!(
            "{file}{cwd}/src/tests/data/local-file.html\n",
            file = file_url_protocol,
            cwd = cwd_normalized,
        )
    );

    // The exit code should be 0
    out.assert().code(0);

    Ok(())
}
/// Runs the binary with `-cji` against a target given as a `file://` URL.
///
/// The assertions expect stdout to carry a Content-Security-Policy meta tag
/// and empty `href`/`src` attributes for the stylesheets, image and script;
/// stderr to echo the target URL in the form it was passed; and exit code 0.
#[test]
fn local_file_url_target_input() -> Result<(), Box<dyn std::error::Error>> {
    let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
    let cwd = env::current_dir().unwrap();
    let cwd_text = cwd.to_str().unwrap();
    let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };

    // The path part of the URL uses the platform's own separator
    let target_url = if cfg!(windows) {
        format!(
            "{file}{cwd}\\src\\tests\\data\\local-file.html",
            file = file_url_protocol,
            cwd = cwd_text,
        )
    } else {
        format!(
            "{file}{cwd}/src/tests/data/local-file.html",
            file = file_url_protocol,
            cwd = cwd_text,
        )
    };

    // The log line is expected to repeat the URL as given, newline-terminated
    let expected_stderr = if cfg!(windows) {
        format!(
            "{file}{cwd}\\src\\tests\\data\\local-file.html\n",
            file = file_url_protocol,
            cwd = cwd_text,
        )
    } else {
        format!(
            "{file}{cwd}/src/tests/data/local-file.html\n",
            file = file_url_protocol,
            cwd = cwd_text,
        )
    };

    let out = cmd.arg("-cji").arg(target_url).output().unwrap();
    let stdout_text = std::str::from_utf8(&out.stdout).unwrap();
    let stderr_text = std::str::from_utf8(&out.stderr).unwrap();

    // STDOUT should contain HTML from the local file
    assert_eq!(
        stdout_text,
        "<!DOCTYPE html><html lang=\"en\"><head>\
        <meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none'; script-src 'none'; img-src data:;\"></meta>\n \
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
        <title>Local HTML file</title>\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n \
        <link href=\"\" rel=\"stylesheet\" type=\"text/css\">\n</head>\n\n<body>\n \
        <img alt=\"\" src=\"\">\n \
        <a href=\"file://local-file.html/\">Tricky href</a>\n \
        <a href=\"https://github.com/Y2Z/monolith\">Remote URL</a>\n \
        <script src=\"\"></script>\n\n\n\n\
        </body></html>\n"
    );

    // STDERR should contain list of retrieved file URLs
    assert_eq!(stderr_text, expected_stderr);

    // The exit code should be 0
    out.assert().code(0);

    Ok(())
}
#[test]
fn security_disallow_local_assets_within_data_url_targets() -> Result<(), Box<dyn std::error::Error>>
{
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
let out = cmd
.arg("data:text/html,%3Cscript%20src=\"src/tests/data/local-script.js\"%3E%3C/script%3E")
.output()
.unwrap();
// STDOUT should contain HTML with no JS in it
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><script src=\"\"></script></head><body></body></html>\n"
);
// STDERR should be empty

View file

@ -0,0 +1,19 @@
<!doctype html>
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>Local HTML file</title>
<link href="local-style.css" rel="stylesheet" type="text/css" />
<link href="local-style-does-not-exist.css" rel="stylesheet" type="text/css" />
</head>
<body>
<img src="monolith.png" alt="" />
<a href="//local-file.html">Tricky href</a>
<a href="https://github.com/Y2Z/monolith">Remote URL</a>
<script src="local-script.js"></script>
</body>
</html>

View file

@ -0,0 +1,2 @@
document.body.style.backgroundColor = "green";
document.body.style.color = "red";

View file

@ -0,0 +1,4 @@
body {
background-color: #000;
color: #fff;
}

View file

@ -1,25 +0,0 @@
use crate::http::retrieve_asset;
use reqwest::blocking::Client;
use std::collections::HashMap;
/// A data URL target must be returned unchanged, as both the data and the
/// final URL, whatever MIME hint is supplied.
#[test]
fn test_retrieve_asset() {
    let client = Client::new();
    let mut cache = HashMap::new();

    // First with no MIME hint, then with one that disagrees with the URL
    for mime_hint in ["", "image/png"].iter() {
        let (data, final_url) = retrieve_asset(
            &mut cache,
            &client,
            "data:text/html;base64,...",
            true,
            mime_hint,
            false,
        )
        .unwrap();

        assert_eq!(&data, "data:text/html;base64,...");
        assert_eq!(&final_url, "data:text/html;base64,...");
    }
}

View file

@ -1,5 +1,4 @@
mod cli;
mod html;
mod http;
mod js;
mod utils;

View file

@ -1,14 +1,14 @@
use crate::utils::{
clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url,
resolve_url, url_has_protocol,
};
use crate::utils;
use reqwest::blocking::Client;
use std::collections::HashMap;
use std::env;
use url::ParseError;
#[test]
fn test_data_to_data_url() {
fn data_to_data_url() {
let mime = "application/javascript";
let data = "var word = 'hello';\nalert(word);\n";
let datauri = data_to_data_url(mime, data.as_bytes());
let datauri = utils::data_to_data_url(mime, data.as_bytes());
assert_eq!(
&datauri,
"data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
@ -16,90 +16,124 @@ fn test_data_to_data_url() {
}
#[test]
fn test_detect_mimetype() {
// image
assert_eq!(detect_mimetype(b"GIF87a"), "image/gif");
assert_eq!(detect_mimetype(b"GIF89a"), "image/gif");
assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg");
assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png");
assert_eq!(detect_mimetype(b"<?xml "), "image/svg+xml");
assert_eq!(detect_mimetype(b"<svg "), "image/svg+xml");
assert_eq!(detect_mimetype(b"RIFF....WEBPVP8 "), "image/webp");
assert_eq!(detect_mimetype(b"\x00\x00\x01\x00"), "image/x-icon");
// audio
assert_eq!(detect_mimetype(b"ID3"), "audio/mpeg");
assert_eq!(detect_mimetype(b"\xFF\x0E"), "audio/mpeg");
assert_eq!(detect_mimetype(b"\xFF\x0F"), "audio/mpeg");
assert_eq!(detect_mimetype(b"OggS"), "audio/ogg");
assert_eq!(detect_mimetype(b"RIFF....WAVEfmt "), "audio/wav");
assert_eq!(detect_mimetype(b"fLaC"), "audio/x-flac");
// video
assert_eq!(detect_mimetype(b"RIFF....AVI LIST"), "video/avi");
assert_eq!(detect_mimetype(b"....ftyp"), "video/mp4");
assert_eq!(detect_mimetype(b"\x00\x00\x01\x0B"), "video/mpeg");
assert_eq!(detect_mimetype(b"....moov"), "video/quicktime");
assert_eq!(detect_mimetype(b"\x1A\x45\xDF\xA3"), "video/webm");
fn detect_mimetype() {
// Image
assert_eq!(utils::detect_mimetype(b"GIF87a"), "image/gif");
assert_eq!(utils::detect_mimetype(b"GIF89a"), "image/gif");
assert_eq!(utils::detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg");
assert_eq!(
utils::detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"),
"image/png"
);
assert_eq!(utils::detect_mimetype(b"<?xml "), "image/svg+xml");
assert_eq!(utils::detect_mimetype(b"<svg "), "image/svg+xml");
assert_eq!(utils::detect_mimetype(b"RIFF....WEBPVP8 "), "image/webp");
assert_eq!(utils::detect_mimetype(b"\x00\x00\x01\x00"), "image/x-icon");
// Audio
assert_eq!(utils::detect_mimetype(b"ID3"), "audio/mpeg");
assert_eq!(utils::detect_mimetype(b"\xFF\x0E"), "audio/mpeg");
assert_eq!(utils::detect_mimetype(b"\xFF\x0F"), "audio/mpeg");
assert_eq!(utils::detect_mimetype(b"OggS"), "audio/ogg");
assert_eq!(utils::detect_mimetype(b"RIFF....WAVEfmt "), "audio/wav");
assert_eq!(utils::detect_mimetype(b"fLaC"), "audio/x-flac");
// Video
assert_eq!(utils::detect_mimetype(b"RIFF....AVI LIST"), "video/avi");
assert_eq!(utils::detect_mimetype(b"....ftyp"), "video/mp4");
assert_eq!(utils::detect_mimetype(b"\x00\x00\x01\x0B"), "video/mpeg");
assert_eq!(utils::detect_mimetype(b"....moov"), "video/quicktime");
assert_eq!(utils::detect_mimetype(b"\x1A\x45\xDF\xA3"), "video/webm");
}
#[test]
fn test_url_has_protocol() {
// passing
fn url_has_protocol() {
// Passing
assert_eq!(
url_has_protocol("mailto:somebody@somewhere.com?subject=hello"),
utils::url_has_protocol("mailto:somebody@somewhere.com?subject=hello"),
true
);
assert_eq!(url_has_protocol("tel:5551234567"), true);
assert_eq!(utils::url_has_protocol("tel:5551234567"), true);
assert_eq!(
url_has_protocol("ftp:user:password@some-ftp-server.com"),
utils::url_has_protocol("ftp:user:password@some-ftp-server.com"),
true
);
assert_eq!(url_has_protocol("javascript:void(0)"), true);
assert_eq!(url_has_protocol("http://news.ycombinator.com"), true);
assert_eq!(url_has_protocol("https://github.com"), true);
assert_eq!(utils::url_has_protocol("javascript:void(0)"), true);
assert_eq!(utils::url_has_protocol("http://news.ycombinator.com"), true);
assert_eq!(utils::url_has_protocol("https://github.com"), true);
assert_eq!(
url_has_protocol("MAILTO:somebody@somewhere.com?subject=hello"),
utils::url_has_protocol("MAILTO:somebody@somewhere.com?subject=hello"),
true
);
// failing
// Failing
assert_eq!(
url_has_protocol("//some-hostname.com/some-file.html"),
utils::url_has_protocol("//some-hostname.com/some-file.html"),
false
);
assert_eq!(url_has_protocol("some-hostname.com/some-file.html"), false);
assert_eq!(url_has_protocol("/some-file.html"), false);
assert_eq!(url_has_protocol(""), false);
assert_eq!(
utils::url_has_protocol("some-hostname.com/some-file.html"),
false
);
assert_eq!(utils::url_has_protocol("/some-file.html"), false);
assert_eq!(utils::url_has_protocol(""), false);
}
#[test]
fn test_is_http_url() {
// passing
assert!(is_http_url("https://www.rust-lang.org/"));
assert!(is_http_url("http://kernel.org"));
// failing
assert!(!is_http_url("//kernel.org"));
assert!(!is_http_url("./index.html"));
assert!(!is_http_url("some-local-page.htm"));
assert!(!is_http_url("ftp://1.2.3.4/www/index.html"));
assert!(!is_http_url(
fn is_file_url() {
// Passing
assert!(utils::is_file_url(
"file:///home/user/Websites/my-website/index.html"
));
assert!(utils::is_file_url(
"file:///C:/Documents%20and%20Settings/user/Websites/my-website/assets/images/logo.png"
));
assert!(utils::is_file_url(
"file:\\\\\\home\\user\\Websites\\my-website\\index.html"
));
// Failing
assert!(!utils::is_file_url("//kernel.org"));
assert!(!utils::is_file_url("./index.html"));
assert!(!utils::is_file_url("some-local-page.htm"));
assert!(!utils::is_file_url("https://1.2.3.4:80/www/index.html"));
assert!(!utils::is_file_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
));
}
#[test]
fn test_resolve_url() -> Result<(), ParseError> {
let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?;
fn is_http_url() {
// Passing
assert!(utils::is_http_url("https://www.rust-lang.org/"));
assert!(utils::is_http_url("http://kernel.org"));
assert!(utils::is_http_url("http:\\\\freebsd.org\\"));
// Failing
assert!(!utils::is_http_url("//kernel.org"));
assert!(!utils::is_http_url("./index.html"));
assert!(!utils::is_http_url("some-local-page.htm"));
assert!(!utils::is_http_url("ftp://1.2.3.4/www/index.html"));
assert!(!utils::is_http_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
));
}
#[test]
fn resolve_url() -> Result<(), ParseError> {
let resolved_url = utils::resolve_url("https://www.kernel.org", "../category/signatures.html")?;
assert_eq!(
resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html"
);
let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?;
let resolved_url = utils::resolve_url("https://www.kernel.org", "category/signatures.html")?;
assert_eq!(
resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html"
);
let resolved_url = resolve_url(
let resolved_url = utils::resolve_url(
"saved_page.htm",
"https://www.kernel.org/category/signatures.html",
)?;
@ -108,7 +142,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/category/signatures.html"
);
let resolved_url = resolve_url(
let resolved_url = utils::resolve_url(
"https://www.kernel.org",
"//www.kernel.org/theme/images/logos/tux.png",
)?;
@ -117,7 +151,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/theme/images/logos/tux.png"
);
let resolved_url = resolve_url(
let resolved_url = utils::resolve_url(
"https://www.kernel.org",
"//another-host.org/theme/images/logos/tux.png",
)?;
@ -126,7 +160,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://another-host.org/theme/images/logos/tux.png"
);
let resolved_url = resolve_url(
let resolved_url = utils::resolve_url(
"https://www.kernel.org/category/signatures.html",
"/theme/images/logos/tux.png",
)?;
@ -135,7 +169,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/theme/images/logos/tux.png"
);
let resolved_url = resolve_url(
let resolved_url = utils::resolve_url(
"https://www.w3schools.com/html/html_iframe.asp",
"default.asp",
)?;
@ -144,7 +178,7 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.w3schools.com/html/default.asp"
);
let resolved_url = resolve_url(
let resolved_url = utils::resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"https://www.kernel.org/category/signatures.html",
)?;
@ -153,62 +187,197 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.kernel.org/category/signatures.html"
);
let resolved_url = resolve_url(
let resolved_url = utils::resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"//www.w3schools.com/html/html_iframe.asp",
)
.unwrap_or(str!());
assert_eq!(resolved_url.as_str(), "");
let resolved_url = utils::resolve_url(
"file:///home/user/Websites/my-website/index.html",
"assets/images/logo.png",
)
.unwrap_or(str!());
assert_eq!(
resolved_url.as_str(),
"file:///home/user/Websites/my-website/assets/images/logo.png"
);
let resolved_url = utils::resolve_url(
"file:\\\\\\home\\user\\Websites\\my-website\\index.html",
"assets\\images\\logo.png",
)
.unwrap_or(str!());
assert_eq!(
resolved_url.as_str(),
"file:///home/user/Websites/my-website/assets/images/logo.png"
);
Ok(())
}
#[test]
fn test_is_data_url() {
// passing
assert!(is_data_url(
fn is_data_url() {
// Passing
assert!(utils::is_data_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
));
// failing
assert!(!is_data_url("https://kernel.org"));
assert!(!is_data_url("//kernel.org"));
assert!(!is_data_url(""));
// Failing
assert!(!utils::is_data_url("https://kernel.org"));
assert!(!utils::is_data_url("//kernel.org"));
assert!(!utils::is_data_url(""));
}
#[test]
fn test_clean_url() {
fn clean_url() {
assert_eq!(
clean_url("https://somewhere.com/font.eot#iefix"),
utils::clean_url("https://somewhere.com/font.eot#iefix"),
"https://somewhere.com/font.eot"
);
assert_eq!(
clean_url("https://somewhere.com/font.eot#"),
utils::clean_url("https://somewhere.com/font.eot#"),
"https://somewhere.com/font.eot"
);
assert_eq!(
clean_url("https://somewhere.com/font.eot?#"),
utils::clean_url("https://somewhere.com/font.eot?#"),
"https://somewhere.com/font.eot"
);
}
#[test]
fn test_data_url_to_text() {
fn data_url_to_text() {
assert_eq!(
data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="),
utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="),
"Work expands so as to fill the time available for its completion"
);
assert_eq!(
data_url_to_text(
utils::data_url_to_text(
"data:text/html;utf8,Work expands so as to fill the time available for its completion"
),
"Work expands so as to fill the time available for its completion"
);
assert_eq!(
data_url_to_text(
utils::data_url_to_text(
"data:text/html,Work expands so as to fill the time available for its completion"
),
"Work expands so as to fill the time available for its completion"
);
assert_eq!(
utils::data_url_to_text(
" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "
),
"Work expands so as to fill the time available for its completion"
);
}
/// Checks `utils::decode_url` on percent-encoded input.
#[test]
fn decode_url() {
// Percent-encoded multi-byte UTF-8 sequences, with `%20` and `%3D` mixed in
assert_eq!(
utils::decode_url(str!(
"%E6%A4%9C%E3%83%92%E3%83%A0%E8%A7%A3%E5%A1%97%E3%82%83%E3%83%83%20%3D%20%E3%82%B5"
)),
"検ヒム解塗ゃッ = サ"
);
// NOTE(review): the expected literal shows a single space for an input with
// two `%20` escapes and one literal space; the run of spaces may have been
// collapsed in this view — confirm against `decode_url`'s actual output.
assert_eq!(utils::decode_url(str!("%20 %20")), " ");
}
/// Exercises `utils::retrieve_asset` across combinations of source (parent)
/// and target URL kinds: data to data, data to file, remote to file, and
/// file to file.
#[test]
fn retrieve_asset() {
    let client = Client::new();
    let mut cache = HashMap::new();

    let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };

    // If both source and target are data URLs, the result must be the target
    // data URL; the media type parameter should not influence that
    for media_type in ["", "image/png"].iter() {
        let (data, final_url) = utils::retrieve_asset(
            &mut cache,
            &client,
            "data:text/html;base64,SoUrCe",
            "data:text/html;base64,TaRgEt",
            true,
            media_type,
            false,
        )
        .unwrap();
        assert_eq!(&data, "data:text/html;base64,TaRgEt");
        assert_eq!(&final_url, "data:text/html;base64,TaRgEt");
    }

    // Inclusion of local assets should not be allowed from data URL sources,
    // nor from remote sources
    for source in ["data:text/html;base64,SoUrCe", "https://kernel.org/"].iter() {
        let (data, final_url) = utils::retrieve_asset(
            &mut cache,
            &client,
            source,
            "file:///etc/passwd",
            true,
            "",
            false,
        )
        .unwrap();
        assert_eq!(&data, "");
        assert_eq!(&final_url, "");
    }

    // Inclusion of local assets from local sources should be allowed
    let cwd = env::current_dir().unwrap();
    let cwd_text = cwd.to_str().unwrap();
    let local_document_url = format!(
        "{file}{cwd}/src/tests/data/local-file.html",
        file = file_url_protocol,
        cwd = cwd_text
    );
    let local_script_url = format!(
        "{file}{cwd}/src/tests/data/local-script.js",
        file = file_url_protocol,
        cwd = cwd_text
    );
    let (data, final_url) = utils::retrieve_asset(
        &mut cache,
        &client,
        &local_document_url,
        &local_script_url,
        true,
        "application/javascript",
        false,
    )
    .unwrap();
    assert_eq!(&data, "data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==");
    assert_eq!(&final_url, &local_script_url);
}

View file

@ -1,8 +1,10 @@
use crate::http::retrieve_asset;
use base64::{decode, encode};
use base64;
use regex::Regex;
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use url::{form_urlencoded, ParseError, Url};
/// This monster of a regex is used to match any kind of URL found in CSS.
@ -71,7 +73,7 @@ pub fn data_to_data_url(mime: &str, data: &[u8]) -> String {
} else {
mime.to_string()
};
format!("data:{};base64,{}", mimetype, encode(data))
format!("data:{};base64,{}", mimetype, base64::encode(data))
}
pub fn detect_mimetype(data: &[u8]) -> String {
@ -95,6 +97,12 @@ pub fn is_data_url<T: AsRef<str>>(url: T) -> bool {
.unwrap_or(false)
}
/// Reports whether `url` parses as a URL whose scheme is `file`.
///
/// Anything that fails to parse as a URL yields `false`.
pub fn is_file_url<T: AsRef<str>>(url: T) -> bool {
    match Url::parse(url.as_ref()) {
        Ok(parsed) => parsed.scheme() == "file",
        Err(_) => false,
    }
}
pub fn is_http_url<T: AsRef<str>>(url: T) -> bool {
Url::parse(url.as_ref())
.and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https"))
@ -118,6 +126,7 @@ pub fn resolve_css_imports(
client: &Client,
css_string: &str,
as_data_url: bool,
parent_url: &str,
href: &str,
opt_no_images: bool,
opt_silent: bool,
@ -127,12 +136,12 @@ pub fn resolve_css_imports(
for link in REGEX_CSS_URL.captures_iter(&css_string) {
let target_link = link.name("url").unwrap().as_str();
// Determine the type of link
// Determine linked asset type
let is_stylesheet = link.name("stylesheet").is_some();
let is_font = link.name("font").is_some();
let is_image = !is_stylesheet && !is_font;
// Generate absolute URL for content
// Generate absolute URL for the content
let embedded_url = match resolve_url(href, target_link) {
Ok(url) => url,
Err(_) => continue, // Malformed URL
@ -144,8 +153,9 @@ pub fn resolve_css_imports(
retrieve_asset(
cache,
client,
&parent_url,
&embedded_url,
false, // Formating as data URL will be done later
false, // Formatting as data URL will be done later
"text/css", // Expect CSS
opt_silent,
)
@ -155,6 +165,7 @@ pub fn resolve_css_imports(
client,
&content,
true, // Finally, convert to a data URL
&parent_url,
&embedded_url,
opt_no_images,
opt_silent,
@ -165,6 +176,7 @@ pub fn resolve_css_imports(
retrieve_asset(
cache,
client,
&parent_url,
&embedded_url,
true, // Format as data URL
"", // Unknown MIME type
@ -186,10 +198,11 @@ pub fn resolve_css_imports(
let replacement = format!("\"{}\"", &content);
let dest = link.name("to_repl").unwrap();
let offset = resolved_css.len() - css_string.len();
let target_range = (dest.start() + offset)..(dest.end() + offset);
resolved_css.replace_range(target_range, &replacement);
if resolved_css.len() > css_string.len() {
let offset = resolved_css.len() - css_string.len();
let target_range = (dest.start() + offset)..(dest.end() + offset);
resolved_css.replace_range(target_range, &replacement);
}
}
if as_data_url {
@ -222,20 +235,7 @@ pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
let meta_data: String = path.chars().take(comma_loc).collect();
let raw_data: String = path.chars().skip(comma_loc + 1).collect();
let data: String = form_urlencoded::parse(raw_data.as_bytes())
.map(|(key, val)| {
[
key.to_string(),
if val.to_string().len() == 0 {
str!()
} else {
str!('=')
},
val.to_string(),
]
.concat()
})
.collect();
let data: String = decode_url(raw_data);
let meta_data_items: Vec<&str> = meta_data.split(';').collect();
let mut mime_type: &str = "";
@ -259,7 +259,7 @@ pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
if mime_type.eq_ignore_ascii_case("text/html") {
if encoding.eq_ignore_ascii_case("base64") {
String::from_utf8(decode(&data).unwrap_or(vec![])).unwrap_or(str!())
String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!())
} else {
data
}
@ -267,3 +267,114 @@ pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
str!()
}
}
/// Percent-decodes `input`, turning every `%XX` escape (two hex digits,
/// either case) into the byte it encodes.
///
/// All other characters are copied through untouched. In particular `+`,
/// `&` and `=` keep their literal meaning: the input is treated as a plain
/// URL component (a file path or a data URL payload), not as
/// `application/x-www-form-urlencoded` form data, where `+` would mean a
/// space and `&`/`=` would act as separators.
///
/// A `%` that is not followed by two hex digits is kept as is. If the
/// decoded bytes are not valid UTF-8, the offending sequences are replaced
/// with U+FFFD.
pub fn decode_url(input: String) -> String {
    // Value of a single ASCII hex digit, or None for any other byte
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let mut decoded: Vec<u8> = Vec::with_capacity(bytes.len());
    let mut pos = 0;

    while pos < bytes.len() {
        // Only a '%' followed by two hex digits forms an escape sequence
        let escaped: Option<u8> = if bytes[pos] == b'%' {
            let high = bytes.get(pos + 1).copied().and_then(hex_value);
            let low = bytes.get(pos + 2).copied().and_then(hex_value);
            match (high, low) {
                (Some(high), Some(low)) => Some((high << 4) | low),
                _ => None,
            }
        } else {
            None
        };

        match escaped {
            Some(byte) => {
                decoded.push(byte);
                pos += 3;
            }
            None => {
                decoded.push(bytes[pos]);
                pos += 1;
            }
        }
    }

    // Escapes may encode arbitrary bytes, so decode leniently
    String::from_utf8_lossy(&decoded).into_owned()
}
/// Retrieves the asset located at `url` on behalf of the document at `parent_url`.
///
/// Returns a `(data, final_url)` pair:
///
/// * empty `url`: both strings are empty;
/// * data URL: the URL itself is returned as both data and final URL;
/// * file URL: the file is read only if `parent_url` is a file URL as well,
///   otherwise both strings are empty (non-local documents must not be able
///   to embed local files). A file that is missing or cannot be read yields
///   empty data together with the original `url`;
/// * any other URL: requested through `client` unless `cache` already holds
///   an entry for it. Fresh responses are stored in `cache` under the cleaned
///   URL reported by the response, and that response URL is returned as
///   `final_url`; cache hits return the original `url`.
///
/// With `as_data_url` set, the data is a data URL built with `mime` (for
/// requested assets an empty `mime` falls back to the response's
/// `Content-Type` header); otherwise the data is the asset's text.
///
/// Unless `opt_silent` is set, every retrieved URL is logged to stderr.
///
/// # Errors
///
/// Returns the `reqwest::Error` raised while sending the request or while
/// reading the response body.
pub fn retrieve_asset(
    cache: &mut HashMap<String, String>,
    client: &Client,
    parent_url: &str,
    url: &str,
    as_data_url: bool,
    mime: &str,
    opt_silent: bool,
) -> Result<(String, String), reqwest::Error> {
    if url.is_empty() {
        return Ok((str!(), str!()));
    }

    if is_data_url(&url) {
        return Ok((url.to_string(), url.to_string()));
    }

    if is_file_url(&url) {
        // Check if parent_url is also file:///
        // (if not then we don't download/embed the asset)
        if !is_file_url(&parent_url) {
            return Ok((str!(), str!()));
        }

        // Strip the "file://" prefix ("file:///" on Windows); a URL too short
        // to carry a path after it is treated like a missing file
        let cutoff = if cfg!(windows) { 8 } else { 7 };
        let fs_file_path: String = match url.get(cutoff..) {
            Some(raw_path) => decode_url(raw_path.to_string()),
            None => return Ok((str!(), url.to_string())),
        };

        if !Path::new(&fs_file_path).exists() {
            return Ok((str!(), url.to_string()));
        }

        if !opt_silent {
            eprintln!("{}", &url);
        }

        // The path may exist and still be unreadable (a directory, missing
        // permissions, non-UTF-8 text); the return type cannot carry an I/O
        // error, so such assets are reported as empty instead of panicking
        let read_result = if as_data_url {
            fs::read(&fs_file_path).map(|bytes| data_to_data_url(mime, &bytes))
        } else {
            fs::read_to_string(&fs_file_path)
        };
        let data: String = read_result.unwrap_or_default();

        return Ok((data, url.to_string()));
    }

    let cache_key = clean_url(&url);

    if let Some(data) = cache.get(&cache_key) {
        // URL is in cache
        if !opt_silent {
            eprintln!("{} (from cache)", &url);
        }
        return Ok((data.to_string(), url.to_string()));
    }

    // URL not in cache, we request it
    let mut response = client.get(url).send()?;
    let res_url = response.url().to_string();

    if !opt_silent {
        if url == res_url {
            eprintln!("{}", &url);
        } else {
            eprintln!("{} -> {}", &url, &res_url);
        }
    }

    let new_cache_key = clean_url(&res_url);

    if as_data_url {
        // Convert response into a byte array
        let mut data: Vec<u8> = vec![];
        response.copy_to(&mut data)?;

        // Attempt to obtain MIME type by reading the Content-Type header
        let mimetype = if mime.is_empty() {
            response
                .headers()
                .get(CONTENT_TYPE)
                .and_then(|header| header.to_str().ok())
                .unwrap_or(mime)
        } else {
            mime
        };
        let data_url = data_to_data_url(mimetype, &data);

        // Add to cache
        cache.insert(new_cache_key, data_url.clone());
        Ok((data_url, res_url))
    } else {
        // Propagate body read/decoding failures instead of panicking
        let content = response.text()?;

        // Add to cache
        cache.insert(new_cache_key, content.clone());
        Ok((content, res_url))
    }
}