add more tests

This commit is contained in:
Sunshine 2021-06-02 03:41:41 -10:00
parent a308a20411
commit a6e891b3c5
No known key found for this signature in database
GPG key ID: B80CA68703CD8AB1
7 changed files with 338 additions and 138 deletions

View file

@ -79,11 +79,11 @@ or
- `-j`: Exclude JavaScript
- `-k`: Accept invalid X.509 (TLS) certificates
- `-M`: Don't add timestamp and URL information
- `-n`: Extract contents of NOSCRIPT tags
- `-n`: Extract contents of NOSCRIPT elements
- `-o`: Write output to `file`
- `-s`: Be quiet
- `-t`: Adjust `network request timeout`
- `-u`: Provide `custom User-Agent`
- `-u`: Provide custom `User-Agent`
- `-v`: Exclude videos
---------------------------------------------------
@ -99,19 +99,15 @@ Please open an issue if something is wrong, that helps make this project better.
---------------------------------------------------
## Related projects
- `Monolith Chrome Extension`: https://github.com/rhysd/monolith-of-web
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
- `Personal WayBack Machine`: https://github.com/popey/pwbm
- `Hako`: https://github.com/dmpop/hako
- Monolith Chrome Extension: https://github.com/rhysd/monolith-of-web
- Pagesaver: https://github.com/distributed-mind/pagesaver
- Personal WayBack Machine: https://github.com/popey/pwbm
- Hako: https://github.com/dmpop/hako
---------------------------------------------------
## License
<a href="https://creativecommons.org/publicdomain/zero/1.0/">
<img src="https://i.creativecommons.org/p/zero/1.0/88x31.png" alt="CC0-1.0" />
</a>
<br />
To the extent possible under law, the author(s) have dedicated all copyright related and neighboring rights to this software to the public domain worldwide.
This software is distributed without any warranty.

View file

@ -474,8 +474,9 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String {
result = String::from_utf8(buf).unwrap();
}
// Unwrap NOSCRIPT elements
if options.unwrap_noscript {
let noscript_re = Regex::new(r"<(?P<c>/?noscript)>").unwrap();
let noscript_re = Regex::new(r"<(?P<c>/?noscript[^>]*)>").unwrap();
result = noscript_re.replace_all(&result, "<!--$c-->").to_string();
}
@ -503,44 +504,39 @@ pub fn retrieve_and_embed_asset(
depth + 1,
) {
Ok((data, final_url, mut media_type)) => {
// Check integrity if it's a LINK or SCRIPT tag
let node_name: &str = get_node_name(&node).unwrap();
// Check integrity if it's a LINK or SCRIPT element
let mut ok_to_include: bool = true;
if node_name == "link" || node_name == "script" {
let node_integrity_attr_value: Option<String> = get_node_attr(node, "integrity");
// Check integrity
if let Some(node_integrity_attr_value) = node_integrity_attr_value {
if let Some(node_integrity_attr_value) = get_node_attr(node, "integrity") {
if !node_integrity_attr_value.is_empty() {
ok_to_include = check_integrity(&data, &node_integrity_attr_value);
}
}
// Wipe integrity attribute
set_node_attr(node, "integrity", None);
// Wipe the integrity attribute
set_node_attr(node, "integrity", None);
}
}
if ok_to_include {
if node_name == "link" {
let link_type: &str = determine_link_node_type(node);
// CSS LINK nodes requires special treatment
if link_type == "stylesheet" {
let css: String = embed_css(
cache,
client,
&final_url,
&String::from_utf8_lossy(&data),
options,
depth + 1,
);
let css_data_url = create_data_url("text/css", css.as_bytes(), &final_url);
if node_name == "link" && determine_link_node_type(node) == "stylesheet" {
// Stylesheet LINK elements require special treatment
let css: String = embed_css(
cache,
client,
&final_url,
&String::from_utf8_lossy(&data),
options,
depth + 1,
);
set_node_attr(&node, attr_name, Some(css_data_url.to_string()));
return; // Do not fall through
}
// Create and embed data URL
let css_data_url = create_data_url("text/css", css.as_bytes(), &final_url);
set_node_attr(&node, attr_name, Some(css_data_url.to_string()));
} else if node_name == "frame" || node_name == "iframe" {
// (I)FRAMEs are also quite different from conventional resources
let frame_dom = html_to_dom(&String::from_utf8_lossy(&data));
walk_and_embed_assets(
cache,
@ -559,30 +555,38 @@ pub fn retrieve_and_embed_asset(
)
.unwrap();
// Create and embed data URL
let mut frame_data_url = create_data_url(&media_type, &frame_data, &final_url);
frame_data_url.set_fragment(resolved_url.fragment());
set_node_attr(node, attr_name, Some(frame_data_url.to_string()));
} else {
// Every other type of element gets processed here
return; // Do not fall through
}
// Parse media type for SCRIPT elements
if node_name == "script" {
if let Some(_) = get_node_attr(node, "src") {
if let Some(script_node_type_attr_value) = get_node_attr(node, "type") {
media_type = script_node_type_attr_value.to_string();
} else {
// Fallback to default one if it's not specified
media_type = "application/javascript".to_string();
}
}
}
// Everything else
if node_name == "script" {
media_type = "application/javascript".to_string();
// Create and embed data URL
let mut data_url = create_data_url(&media_type, &data, &final_url);
data_url.set_fragment(resolved_url.fragment());
set_node_attr(node, attr_name, Some(data_url.to_string()));
}
let mut data_url = create_data_url(&media_type, &data, &final_url);
data_url.set_fragment(resolved_url.fragment());
set_node_attr(node, attr_name, Some(data_url.to_string()));
}
}
Err(_) => {
if resolved_url.scheme() == "http" || resolved_url.scheme() == "https" {
// Keep remote reference if unable to retrieve the asset
// Keep remote references if unable to retrieve the asset
set_node_attr(node, attr_name, Some(resolved_url.to_string()));
} else {
// Exclude non-remote URLs
// Remove local references if they can't be successfully embedded as data URLs
set_node_attr(node, attr_name, None);
}
}
@ -645,7 +649,7 @@ pub fn walk_and_embed_assets(
let link_type: &str = determine_link_node_type(node);
if link_type == "icon" {
// Find and resolve this LINK node's href attribute
// Find and resolve LINK's href attribute
if let Some(link_attr_href_value) = get_node_attr(node, "href") {
if !options.no_images && !link_attr_href_value.is_empty() {
retrieve_and_embed_asset(
@ -663,10 +667,12 @@ pub fn walk_and_embed_assets(
}
}
} else if link_type == "stylesheet" {
// Find and resolve this LINK node's href attribute
// Resolve LINK's href attribute
if let Some(link_attr_href_value) = get_node_attr(node, "href") {
if options.no_css {
set_node_attr(node, "href", None);
// Wipe integrity attribute
set_node_attr(node, "integrity", None);
} else {
if !link_attr_href_value.is_empty() {
retrieve_and_embed_asset(
@ -916,14 +922,15 @@ pub fn walk_and_embed_assets(
// Replace with empty JS call to preserve original behavior
set_node_attr(node, "href", Some(str!("javascript:;")));
}
} else if anchor_attr_href_value.clone().starts_with('#')
|| is_url_and_has_protocol(&anchor_attr_href_value.clone())
{
// Don't touch mailto: links or hrefs which begin with a hash sign
} else {
let href_full_url: Url =
resolve_url(document_url, &anchor_attr_href_value);
set_node_attr(node, "href", Some(href_full_url.to_string()));
// Don't touch mailto: links or hrefs which begin with a hash sign
if !anchor_attr_href_value.clone().starts_with('#')
&& !is_url_and_has_protocol(&anchor_attr_href_value.clone())
{
let href_full_url: Url =
resolve_url(document_url, &anchor_attr_href_value);
set_node_attr(node, "href", Some(href_full_url.to_string()));
}
}
}
}
@ -937,6 +944,8 @@ pub fn walk_and_embed_assets(
// Remove src attribute
if script_attr_src != None {
set_node_attr(node, "src", None);
// Wipe integrity attribute
set_node_attr(node, "integrity", None);
}
} else if !script_attr_src.clone().unwrap_or_default().is_empty() {
retrieve_and_embed_asset(
@ -1081,7 +1090,7 @@ pub fn walk_and_embed_assets(
);
// Get rid of original contents
noscript_contents.clear();
// Insert HTML containing embedded assets back into NOSCRIPT node
// Insert HTML containing embedded assets into NOSCRIPT node
if let Some(html) =
get_child_node_by_name(&noscript_contents_dom.document, "html")
{

View file

@ -88,7 +88,7 @@ mod passing {
}
#[test]
fn remove_existing_when_empty_provided() {
fn set_existing_to_empty_when_empty_provided() {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")

View file

@ -11,24 +11,6 @@ mod passing {
use std::env;
use std::process::Command;
#[test]
fn bad_input_data_url() {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap();
// STDOUT should contain HTML
assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), "");
// STDERR should contain error description
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
"Unsupported data URL media type\n"
);
// The exit code should be 1
out.assert().code(1);
}
#[test]
fn isolate_data_url() {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
@ -192,6 +174,38 @@ mod passing {
// The exit code should be 0
out.assert().code(0);
}
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
use assert_cmd::prelude::*;
use std::env;
use std::process::Command;
#[test]
fn bad_input_data_url() {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap();
// STDOUT should contain HTML
assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), "");
// STDERR should contain error description
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
"Unsupported data URL media type\n"
);
// The exit code should be 1
out.assert().code(1);
}
#[test]
fn security_disallow_local_assets_within_data_url_targets() {

View file

@ -130,7 +130,14 @@ mod passing {
// STDOUT should contain HTML with no CSS
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head></head><body><!--noscript--><img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\"><!--/noscript-->\n</body></html>\n"
"<html>\
<head></head>\
<body>\
<!--noscript-->\
<img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\">\
<!--/noscript-->\n\
</body>\
</html>\n"
);
// STDERR should contain target HTML and embedded SVG files
@ -153,4 +160,27 @@ mod passing {
// The exit code should be 0
out.assert().code(0);
}
#[test]
fn unwrap_noscript_contents_attr_data_url() {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-n")
.arg("data:text/html,<noscript class=\"\">test</noscript>")
.output()
.unwrap();
// STDOUT should contain unwrapped contents of NOSCRIPT element
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><!--noscript class=\"\"-->test<!--/noscript--></head><body></body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
out.assert().code(0);
}
}

View file

@ -30,7 +30,14 @@ mod passing {
// STDOUT should contain newly added base URL
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head>\n <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n </head>\n <body>\n © Some Company\n \n\n</body></html>\n"
"<html>\
<head>\n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
</head>\n \
<body>\n \
© Some Company\n \
\n\n</body>\
</html>\n"
);
// STDERR should contain only the target file

View file

@ -87,10 +87,12 @@ mod passing {
#[test]
fn no_css() {
let html = "<link rel=\"stylesheet\" href=\"main.css\">\
<link rel=\"alternate stylesheet\" href=\"main.css\">\
<style>html{background-color: #000;}</style>\
<div style=\"display: none;\"></div>";
let html = "\
<link rel=\"stylesheet\" href=\"main.css\">\
<link rel=\"alternate stylesheet\" href=\"main.css\">\
<style>html{background-color: #000;}</style>\
<div style=\"display: none;\"></div>\
";
let dom = html::html_to_dom(&html);
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@ -108,16 +110,18 @@ mod passing {
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html>\
<head>\
<link rel=\"stylesheet\">\
<link rel=\"alternate stylesheet\">\
<style></style>\
</head>\
<body>\
<div></div>\
</body>\
</html>"
"\
<html>\
<head>\
<link rel=\"stylesheet\">\
<link rel=\"alternate stylesheet\">\
<style></style>\
</head>\
<body>\
<div></div>\
</body>\
</html>\
"
);
}
@ -203,7 +207,15 @@ mod passing {
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><frameset><frame src=\"\"></frameset></html>"
"\
<html>\
<head>\
</head>\
<frameset>\
<frame src=\"\">\
</frameset>\
</html>\
"
);
}
@ -227,16 +239,25 @@ mod passing {
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><iframe src=\"\"></iframe></body></html>"
"\
<html>\
<head></head>\
<body>\
<iframe src=\"\"></iframe>\
</body>\
</html>\
"
);
}
#[test]
fn no_js() {
let html = "<div onClick=\"void(0)\">\
<script src=\"http://localhost/assets/some.js\"></script>\
<script>alert(1)</script>\
</div>";
let html = "\
<div onClick=\"void(0)\">\
<script src=\"http://localhost/assets/some.js\"></script>\
<script>alert(1)</script>\
</div>\
";
let dom = html::html_to_dom(&html);
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@ -254,52 +275,141 @@ mod passing {
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div><script></script>\
<script></script></div></body></html>"
"\
<html>\
<head></head>\
<body>\
<div>\
<script></script>\
<script></script>\
</div>\
</body>\
</html>\
"
);
}
// #[test]
// fn discards_integrity() {
// let html = "<title>No integrity</title>\
// <link integrity=\"sha384-...\" rel=\"something\"/>\
// <script integrity=\"sha384-...\" src=\"some.js\"></script>";
// let dom = html::html_to_dom(&html);
// let url: Url = Url::parse("http://localhost").unwrap();
// let cache = &mut HashMap::new();
#[test]
fn keeps_integrity_for_linked_assets() {
let html = "<title>Has integrity</title>\
<link integrity=\"sha384-12345\" rel=\"something\" href=\"https://some-site.com/some-file.ext\" />";
let dom = html::html_to_dom(&html);
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
// let mut options = Options::default();
// options.no_css = true;
// options.no_frames = true;
// options.no_js = true;
// options.no_images = true;
// options.silent = true;
let mut options = Options::default();
options.silent = true;
// let client = Client::new();
let client = Client::new();
// html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
// let mut buf: Vec<u8> = Vec::new();
// serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
// assert_eq!(
// buf.iter().map(|&c| c as char).collect::<String>(),
// "<html>\
// <head><title>No integrity</title><link rel=\"something\"><script></script></head>\
// <body></body>\
// </html>"
// );
// }
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"\
<html>\
<head>\
<title>Has integrity</title>\
<link integrity=\"sha384-12345\" rel=\"something\" href=\"https://some-site.com/some-file.ext\">\
</head>\
<body></body>\
</html>\
"
);
}
#[test]
fn discards_integrity_for_linked_assets_nojs_nocss() {
let html = "\
<title>No integrity</title>\
<link integrity=\"\" rel=\"stylesheet\" href=\"data:;\"/>\
<script integrity=\"\" src=\"some.js\"></script>\
";
let dom = html::html_to_dom(&html);
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
let mut options = Options::default();
options.no_css = true;
options.no_js = true;
options.silent = true;
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"\
<html>\
<head>\
<title>No integrity</title>\
<link rel=\"stylesheet\">\
<script></script>\
</head>\
<body></body>\
</html>\
"
);
}
#[test]
fn discards_integrity_for_embedded_assets() {
let html = "\
<title>No integrity</title>\
<link integrity=\"sha384-123\" rel=\"something\" href=\"data:;\"/>\
<script integrity=\"sha384-456\" src=\"some.js\"></script>\
";
let dom = html::html_to_dom(&html);
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
let mut options = Options::default();
options.no_css = true;
options.no_js = true;
options.silent = true;
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"\
<html>\
<head>\
<title>No integrity</title>\
<link integrity=\"sha384-123\" rel=\"something\" href=\"data:;\">\
<script></script>\
</head>\
<body>\
</body>\
</html>\
"
);
}
#[test]
fn removes_unwanted_meta_tags() {
let html = "<html>\
<head>\
<meta http-equiv=\"Refresh\" value=\"20\"/>\
<meta http-equiv=\"Location\" value=\"https://freebsd.org\"/>\
</head>\
<body></body>\
</html>";
let html = "\
<html>\
<head>\
<meta http-equiv=\"Refresh\" value=\"20\"/>\
<meta http-equiv=\"Location\" value=\"https://freebsd.org\"/>\
</head>\
<body>\
</body>\
</html>\
";
let dom = html::html_to_dom(&html);
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@ -320,19 +430,22 @@ mod passing {
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html>\
"\
<html>\
<head>\
<meta http-equiv=\"disabled by monolith (Refresh)\" value=\"20\">\
<meta http-equiv=\"disabled by monolith (Location)\" value=\"https://freebsd.org\">\
</head>\
<body></body>\
<body>\
</body>\
</html>"
);
}
#[test]
fn processes_noscript_tags() {
let html = "<html>\
let html = "\
<html>\
<body>\
<noscript>\
<img src=\"image.png\" />\
@ -357,7 +470,8 @@ mod passing {
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
format!(
"<html>\
"\
<html>\
<head>\
</head>\
<body>\
@ -370,4 +484,34 @@ mod passing {
)
);
}
#[test]
fn preserves_script_type_json() {
let html = "<script id=\"data\" type=\"application/json\">{\"mono\":\"lith\"}</script>";
let dom = html::html_to_dom(&html);
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
let mut options = Options::default();
options.silent = true;
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"\
<html>\
<head>\
<script id=\"data\" type=\"application/json\">{\"mono\":\"lith\"}</script>\
</head>\
<body>\
</body>\
</html>"
);
}
}