Forcefully set the document's charset to UTF-8
This commit is contained in:
parent
4ad07c0519
commit
e0273c664a
11 changed files with 106 additions and 33 deletions
2
Makefile
2
Makefile
|
@ -10,7 +10,7 @@ build:
|
||||||
test: build
|
test: build
|
||||||
@cargo test --locked
|
@cargo test --locked
|
||||||
@cargo fmt --all -- --check
|
@cargo fmt --all -- --check
|
||||||
.PHONY: test_code_formatting
|
.PHONY: test
|
||||||
|
|
||||||
lint:
|
lint:
|
||||||
@cargo fmt --all --
|
@cargo fmt --all --
|
||||||
|
|
|
@ -26,7 +26,7 @@ const CSS_PROPS_WITH_IMAGE_URLS: &[&str] = &[
|
||||||
"suffix",
|
"suffix",
|
||||||
"symbols",
|
"symbols",
|
||||||
];
|
];
|
||||||
const CSS_SPECIAL_CHARS: &str = "~!@$%^&*()+=,./'\";:?><[]{}|`#";
|
const CSS_SPECIAL_CHARS: &'static str = "~!@$%^&*()+=,./'\";:?><[]{}|`#";
|
||||||
|
|
||||||
pub fn is_image_url_prop(prop_name: &str) -> bool {
|
pub fn is_image_url_prop(prop_name: &str) -> bool {
|
||||||
CSS_PROPS_WITH_IMAGE_URLS
|
CSS_PROPS_WITH_IMAGE_URLS
|
||||||
|
|
46
src/html.rs
46
src/html.rs
|
@ -28,7 +28,7 @@ struct SrcSetItem<'a> {
|
||||||
descriptor: &'a str,
|
descriptor: &'a str,
|
||||||
}
|
}
|
||||||
|
|
||||||
const ICON_VALUES: &[&str] = &["icon", "shortcut icon"];
|
const ICON_VALUES: &'static [&str] = &["icon", "shortcut icon"];
|
||||||
|
|
||||||
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
|
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
|
||||||
let mut buf: Vec<u8> = Vec::new();
|
let mut buf: Vec<u8> = Vec::new();
|
||||||
|
@ -105,7 +105,7 @@ pub fn compose_csp(options: &Options) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
if options.no_images {
|
if options.no_images {
|
||||||
// Note: data: is needed for transparent pixels
|
// Note: "data:" is required for transparent pixel images to work
|
||||||
string_list.push("img-src data:;");
|
string_list.push("img-src data:;");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -127,22 +127,17 @@ pub fn create_metadata_tag(url: &str) -> String {
|
||||||
clean_url.set_password(None).unwrap();
|
clean_url.set_password(None).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
if is_http_url(url) {
|
format!(
|
||||||
format!(
|
"<!-- Saved from {} at {} using {} v{} -->",
|
||||||
"<!-- Saved from {} at {} using {} v{} -->",
|
if is_http_url(url) {
|
||||||
&clean_url,
|
&clean_url.as_str()
|
||||||
timestamp,
|
} else {
|
||||||
env!("CARGO_PKG_NAME"),
|
"local source"
|
||||||
env!("CARGO_PKG_VERSION"),
|
},
|
||||||
)
|
timestamp,
|
||||||
} else {
|
env!("CARGO_PKG_NAME"),
|
||||||
format!(
|
env!("CARGO_PKG_VERSION"),
|
||||||
"<!-- Saved from local source at {} using {} v{} -->",
|
)
|
||||||
timestamp,
|
|
||||||
env!("CARGO_PKG_NAME"),
|
|
||||||
env!("CARGO_PKG_VERSION"),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Err(_) => str!(),
|
Err(_) => str!(),
|
||||||
}
|
}
|
||||||
|
@ -498,12 +493,12 @@ pub fn walk_and_embed_assets(
|
||||||
} => {
|
} => {
|
||||||
match name.local.as_ref() {
|
match name.local.as_ref() {
|
||||||
"meta" => {
|
"meta" => {
|
||||||
// Remove http-equiv attributes from META nodes if they're able to control the page
|
|
||||||
if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") {
|
if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") {
|
||||||
let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value;
|
let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value;
|
||||||
if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh")
|
if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh")
|
||||||
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
|
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
|
||||||
{
|
{
|
||||||
|
// Remove http-equiv attributes from META nodes if they're able to control the page
|
||||||
set_node_attr(
|
set_node_attr(
|
||||||
&node,
|
&node,
|
||||||
"http-equiv",
|
"http-equiv",
|
||||||
|
@ -512,7 +507,20 @@ pub fn walk_and_embed_assets(
|
||||||
meta_attr_http_equiv_value
|
meta_attr_http_equiv_value
|
||||||
)),
|
)),
|
||||||
);
|
);
|
||||||
|
} else if meta_attr_http_equiv_value.eq_ignore_ascii_case("Content-Type") {
|
||||||
|
// Enforce charset to be set to UTF-8
|
||||||
|
if let Some(_attr_value) = get_node_attr(node, "content") {
|
||||||
|
set_node_attr(
|
||||||
|
&node,
|
||||||
|
"content",
|
||||||
|
Some(str!("text/html; charset=utf-8")),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else if let Some(_meta_attr_http_equiv_value) = get_node_attr(node, "charset")
|
||||||
|
{
|
||||||
|
// Enforce charset to be set to UTF-8
|
||||||
|
set_node_attr(&node, "charset", Some(str!("utf-8")));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"link" => {
|
"link" => {
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
const JS_DOM_EVENT_ATTRS: &[&str] = &[
|
const JS_DOM_EVENT_ATTRS: &'static [&str] = &[
|
||||||
// From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects":
|
// From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects":
|
||||||
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
|
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
|
||||||
// https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes")
|
// https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes")
|
||||||
|
|
|
@ -212,6 +212,9 @@ fn main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set charset meta-tag to UTF-8 (currently disabled)
|
||||||
|
// set_charset_meta_to_utf8(&dom.document);
|
||||||
|
|
||||||
// Serialize DOM tree
|
// Serialize DOM tree
|
||||||
let mut result: String = stringify_document(&dom.document, &options);
|
let mut result: String = stringify_document(&dom.document, &options);
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ pub struct Options {
|
||||||
pub target: String,
|
pub target: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
const ASCII: &str = " \
|
const ASCII: &'static str = " \
|
||||||
_____ ______________ __________ ___________________ ___
|
_____ ______________ __________ ___________________ ___
|
||||||
| \\ / \\ | | | | | |
|
| \\ / \\ | | | | | |
|
||||||
| \\_/ __ \\_| __ | | ___ ___ |__| |
|
| \\_/ __ \\_| __ | | ___ ___ |__| |
|
||||||
|
@ -31,7 +31,7 @@ const ASCII: &str = " \
|
||||||
|___| |__________| \\_____________________| |___| |___| |___|
|
|___| |__________| \\_____________________| |___| |___| |___|
|
||||||
";
|
";
|
||||||
const DEFAULT_NETWORK_TIMEOUT: u64 = 120;
|
const DEFAULT_NETWORK_TIMEOUT: u64 = 120;
|
||||||
const DEFAULT_USER_AGENT: &str =
|
const DEFAULT_USER_AGENT: &'static str =
|
||||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0";
|
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0";
|
||||||
|
|
||||||
impl Options {
|
impl Options {
|
||||||
|
|
|
@ -69,8 +69,7 @@ mod passing {
|
||||||
#[test]
|
#[test]
|
||||||
fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> {
|
fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
let cwd = env::current_dir().unwrap();
|
let cwd = env::current_dir().unwrap();
|
||||||
let cwd_normalized: String =
|
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||||
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
|
|
||||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
|
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
|
||||||
let out = cmd
|
let out = cmd
|
||||||
.arg("-M")
|
.arg("-M")
|
||||||
|
|
|
@ -2,3 +2,4 @@ mod base_url;
|
||||||
mod basic;
|
mod basic;
|
||||||
mod data_url;
|
mod data_url;
|
||||||
mod local_files;
|
mod local_files;
|
||||||
|
mod unusual_encodings;
|
||||||
|
|
51
src/tests/cli/unusual_encodings.rs
Normal file
51
src/tests/cli/unusual_encodings.rs
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
|
||||||
|
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
|
||||||
|
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
|
||||||
|
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
|
||||||
|
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
|
||||||
|
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod passing {
|
||||||
|
use assert_cmd::prelude::*;
|
||||||
|
use std::env;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn change_encoding_to_utf_8() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let cwd = env::current_dir().unwrap();
|
||||||
|
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||||
|
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
|
||||||
|
let out = cmd
|
||||||
|
.arg("-M")
|
||||||
|
.arg(if cfg!(windows) {
|
||||||
|
"src\\tests\\data\\unusual_encodings\\iso-8859-1.html"
|
||||||
|
} else {
|
||||||
|
"src/tests/data/unusual_encodings/iso-8859-1.html"
|
||||||
|
})
|
||||||
|
.output()
|
||||||
|
.unwrap();
|
||||||
|
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||||
|
|
||||||
|
// STDOUT should contain the document with its charset rewritten to UTF-8
|
||||||
|
assert_eq!(
|
||||||
|
std::str::from_utf8(&out.stdout).unwrap(),
|
||||||
|
"<html><head>\n <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n </head>\n <body>\n © Some Company\n \n\n</body></html>\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
// STDERR should contain only the target file
|
||||||
|
assert_eq!(
|
||||||
|
std::str::from_utf8(&out.stderr).unwrap(),
|
||||||
|
format!(
|
||||||
|
"{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
|
||||||
|
file = file_url_protocol,
|
||||||
|
cwd = cwd_normalized,
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
// The exit code should be 0
|
||||||
|
out.assert().code(0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
8
src/tests/data/unusual_encodings/iso-8859-1.html
Normal file
8
src/tests/data/unusual_encodings/iso-8859-1.html
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
© Some Company
|
||||||
|
</body>
|
||||||
|
</html>
|
17
src/utils.rs
17
src/utils.rs
|
@ -7,7 +7,7 @@ use std::path::Path;
|
||||||
use crate::opts::Options;
|
use crate::opts::Options;
|
||||||
use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url};
|
use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url};
|
||||||
|
|
||||||
const INDENT: &str = " ";
|
const INDENT: &'static str = " ";
|
||||||
|
|
||||||
const MAGIC: [[&[u8]; 2]; 18] = [
|
const MAGIC: [[&[u8]; 2]; 18] = [
|
||||||
// Image
|
// Image
|
||||||
|
@ -34,11 +34,13 @@ const MAGIC: [[&[u8]; 2]; 18] = [
|
||||||
];
|
];
|
||||||
|
|
||||||
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
|
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
|
||||||
|
"application/javascript",
|
||||||
"image/svg+xml",
|
"image/svg+xml",
|
||||||
"text/css",
|
// "text/css",
|
||||||
"text/html",
|
// "text/csv",
|
||||||
"text/javascript",
|
// "text/html",
|
||||||
"text/plain",
|
// "text/javascript",
|
||||||
|
// "text/plain",
|
||||||
];
|
];
|
||||||
|
|
||||||
pub fn detect_media_type(data: &[u8], url: &str) -> String {
|
pub fn detect_media_type(data: &[u8], url: &str) -> String {
|
||||||
|
@ -56,7 +58,8 @@ pub fn detect_media_type(data: &[u8], url: &str) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_plaintext_media_type(media_type: &str) -> bool {
|
pub fn is_plaintext_media_type(media_type: &str) -> bool {
|
||||||
PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
|
media_type.to_lowercase().as_str().starts_with("text/")
|
||||||
|
|| PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn indent(level: u32) -> String {
|
pub fn indent(level: u32) -> String {
|
||||||
|
@ -125,7 +128,7 @@ pub fn retrieve_asset(
|
||||||
Ok(mut response) => {
|
Ok(mut response) => {
|
||||||
if !options.ignore_errors && response.status() != 200 {
|
if !options.ignore_errors && response.status() != 200 {
|
||||||
if !options.silent {
|
if !options.silent {
|
||||||
eprintln!("Unable to retrieve {} ({})", &url, response.status());
|
eprintln!("Unable to retrieve {} (error: {})", &url, response.status());
|
||||||
}
|
}
|
||||||
// Provoke error
|
// Provoke error
|
||||||
return Err(client.get("").send().unwrap_err());
|
return Err(client.get("").send().unwrap_err());
|
||||||
|
|
Loading…
Reference in a new issue