Merge pull request #253 from snshn/unwrap-noscript

Make it possible to unwrap NOSCRIPT nodes
This commit is contained in:
Sunshine 2021-03-11 22:43:28 -10:00 committed by GitHub
commit 8256d17efd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 190 additions and 6 deletions

View file

@ -79,6 +79,7 @@ or
- `-j`: Exclude JavaScript - `-j`: Exclude JavaScript
- `-k`: Accept invalid X.509 (TLS) certificates - `-k`: Accept invalid X.509 (TLS) certificates
- `-M`: Don't add timestamp and URL information - `-M`: Don't add timestamp and URL information
- `-n`: Extract contents of NOSCRIPT tags
- `-o`: Write output to `file` - `-o`: Write output to `file`
- `-s`: Be quiet - `-s`: Be quiet
- `-t`: Adjust `network request timeout` - `-t`: Adjust `network request timeout`

View file

@ -474,6 +474,11 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String {
result = String::from_utf8(buf).unwrap(); result = String::from_utf8(buf).unwrap();
} }
if options.unwrap_noscript {
let noscript_re = Regex::new(r"<(?P<c>/?noscript)>").unwrap();
result = noscript_re.replace_all(&result, "<!--$c-->").to_string();
}
result result
} }
@ -1060,11 +1065,11 @@ pub fn walk_and_embed_assets(
for child_node in node.children.borrow_mut().iter_mut() { for child_node in node.children.borrow_mut().iter_mut() {
match child_node.data { match child_node.data {
NodeData::Text { ref contents } => { NodeData::Text { ref contents } => {
// Get contents of the NOSCRIPT node // Get contents of NOSCRIPT node
let mut noscript_contents = contents.borrow_mut(); let mut noscript_contents = contents.borrow_mut();
// Parse contents of the NOSCRIPT node // Parse contents of NOSCRIPT node as DOM
let noscript_contents_dom: RcDom = html_to_dom(&noscript_contents); let noscript_contents_dom: RcDom = html_to_dom(&noscript_contents);
// Embed assets within the NOSCRIPT node // Embed assets of NOSCRIPT node contents
walk_and_embed_assets( walk_and_embed_assets(
cache, cache,
client, client,
@ -1075,7 +1080,7 @@ pub fn walk_and_embed_assets(
); );
// Get rid of original contents // Get rid of original contents
noscript_contents.clear(); noscript_contents.clear();
// Insert HTML containing embedded assets into the NOSCRIPT node // Insert HTML containing embedded assets back into NOSCRIPT node
if let Some(html) = if let Some(html) =
get_child_node_by_name(&noscript_contents_dom.document, "html") get_child_node_by_name(&noscript_contents_dom.document, "html")
{ {

View file

@ -21,6 +21,7 @@ pub struct Options {
pub no_video: bool, pub no_video: bool,
pub target: String, pub target: String,
pub no_color: bool, pub no_color: bool,
pub unwrap_noscript: bool,
} }
const ASCII: &'static str = " \ const ASCII: &'static str = " \
@ -55,6 +56,9 @@ impl Options {
.args_from_usage("-j, --no-js 'Removes JavaScript'") .args_from_usage("-j, --no-js 'Removes JavaScript'")
.args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'") .args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'") .args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'")
.args_from_usage(
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
)
.args_from_usage("-o, --output=[document.html] 'Writes output to <file>'") .args_from_usage("-o, --output=[document.html] 'Writes output to <file>'")
.args_from_usage("-s, --silent 'Suppresses verbosity'") .args_from_usage("-s, --silent 'Suppresses verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'") .args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
@ -100,6 +104,7 @@ impl Options {
} else { } else {
options.user_agent = Some(DEFAULT_USER_AGENT.to_string()); options.user_agent = Some(DEFAULT_USER_AGENT.to_string());
} }
options.unwrap_noscript = app.is_present("unwrap-noscript");
options.no_video = app.is_present("no-video"); options.no_video = app.is_present("no-video");
options.no_color = options.no_color =

View file

@ -202,12 +202,12 @@ mod passing {
format!( format!(
"\ "\
{file_url_html}\n \ {file_url_html}\n \
{file_url_css}\n\ {file_url_svg}\n\
", ",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap())
.unwrap() .unwrap()
.into_string(), .into_string(),
file_url_css = Url::from_file_path(fs::canonicalize(&path_svg).unwrap()) file_url_svg = Url::from_file_path(fs::canonicalize(&path_svg).unwrap())
.unwrap() .unwrap()
.into_string(), .into_string(),
) )

View file

@ -2,4 +2,5 @@ mod base_url;
mod basic; mod basic;
mod data_url; mod data_url;
mod local_files; mod local_files;
mod noscript;
mod unusual_encodings; mod unusual_encodings;

164
src/tests/cli/noscript.rs Normal file
View file

@ -0,0 +1,164 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use assert_cmd::prelude::*;
    use std::env;
    use std::fs;
    use std::path::Path;
    use std::process::Command;
    use url::Url;

    /// SVG fixture that every HTML fixture in this module references from inside NOSCRIPT.
    const PATH_SVG: &str = "src/tests/data/noscript/image.svg";

    /// Returns the `file://` URL string of the canonicalized `path`.
    ///
    /// # Errors
    ///
    /// Fails if the path cannot be canonicalized (e.g. the fixture is missing) or
    /// cannot be expressed as a file URL.
    fn file_url(path: &Path) -> Result<String, Box<dyn std::error::Error>> {
        let url = Url::from_file_path(fs::canonicalize(path)?)
            .map_err(|_| format!("cannot convert {} into a file URL", path.display()))?;
        Ok(url.into_string())
    }

    /// Runs the project binary with `flags` against `html_fixture` and checks the result.
    ///
    /// Asserts, in this order, that:
    /// 1. STDOUT equals `expected_stdout`;
    /// 2. STDERR lists the file URL of the HTML fixture followed by the file URL of
    ///    the SVG fixture;
    /// 3. the process exit code is 0.
    ///
    /// `#[track_caller]` makes a failed assertion report the location of the calling
    /// test instead of this shared helper.
    ///
    /// # Errors
    ///
    /// Fails if the binary cannot be located or spawned, if its output is not valid
    /// UTF-8, or if a fixture path cannot be turned into a file URL.
    #[track_caller]
    fn assert_noscript_output(
        flags: &str,
        html_fixture: &str,
        expected_stdout: &str,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
        let path_html: &Path = Path::new(html_fixture);
        let path_svg: &Path = Path::new(PATH_SVG);
        let out = cmd.arg(flags).arg(path_html.as_os_str()).output()?;

        // STDOUT should contain the processed HTML document
        assert_eq!(std::str::from_utf8(&out.stdout)?, expected_stdout);

        // STDERR should contain target HTML and embedded SVG files
        assert_eq!(
            std::str::from_utf8(&out.stderr)?,
            format!(
                "{file_url_html}\n {file_url_svg}\n",
                file_url_html = file_url(path_html)?,
                file_url_svg = file_url(path_svg)?,
            )
        );

        // The exit code should be 0
        out.assert().code(0);

        Ok(())
    }

    /// Without `-n`, the NOSCRIPT element stays in the output as an element.
    #[test]
    fn parse_noscript_contents() -> Result<(), Box<dyn std::error::Error>> {
        assert_noscript_output(
            "-M",
            "src/tests/data/noscript/index.html",
            "<html><head></head><body><noscript><img src=\"\"></noscript>\n</body></html>\n",
        )
    }

    /// With `-n`, the NOSCRIPT opening and closing tags are turned into HTML comments.
    #[test]
    fn unwrap_noscript_contents() -> Result<(), Box<dyn std::error::Error>> {
        assert_noscript_output(
            "-Mn",
            "src/tests/data/noscript/index.html",
            "<html><head></head><body><!--noscript--><img src=\"\"><!--/noscript-->\n</body></html>\n",
        )
    }

    /// With `-n`, nested NOSCRIPT elements are unwrapped at every level.
    #[test]
    fn unwrap_noscript_contents_nested() -> Result<(), Box<dyn std::error::Error>> {
        assert_noscript_output(
            "-Mn",
            "src/tests/data/noscript/nested.html",
            "<html><head></head><body><!--noscript--><h1>JS is not active</h1><!--noscript--><img src=\"\"><!--/noscript--><!--/noscript-->\n</body></html>\n",
        )
    }

    /// With `-n`, the expected output no longer contains the SCRIPT element that the
    /// fixture places inside NOSCRIPT.
    #[test]
    fn unwrap_noscript_contents_with_script() -> Result<(), Box<dyn std::error::Error>> {
        assert_noscript_output(
            "-Mn",
            "src/tests/data/noscript/script.html",
            "<html><head></head><body><!--noscript--><img src=\"\"><!--/noscript-->\n</body></html>\n",
        )
    }
}

View file

@ -0,0 +1,5 @@
<svg version="1.1" baseProfile="full" width="300" height="200" xmlns="http://www.w3.org/2000/svg">
<rect width="100%" height="100%" fill="red" />
<circle cx="150" cy="100" r="80" fill="green" />
<text x="150" y="125" font-size="60" text-anchor="middle" fill="white">SVG</text>
</svg>

After

Width:  |  Height:  |  Size: 296 B

View file

@ -0,0 +1 @@
<body><noscript><img src="image.svg" /></noscript></body>

View file

@ -0,0 +1 @@
<body><noscript><h1>JS is not active</h1><noscript><img src="image.svg" /></noscript></noscript></body>

View file

@ -0,0 +1 @@
<body><noscript><script>alert(1);</script><img src="image.svg" /></noscript></body>