Merge pull request #245 from snshn/change-meta-charset-to-utf-8
Forcefully set document's charset to UTF-8
This commit is contained in:
commit
a3516b2ae9
10 changed files with 105 additions and 34 deletions
2
Makefile
2
Makefile
|
@ -10,7 +10,7 @@ build:
|
|||
test: build
|
||||
@cargo test --locked
|
||||
@cargo fmt --all -- --check
|
||||
.PHONY: test_code_formatting
|
||||
.PHONY: test
|
||||
|
||||
lint:
|
||||
@cargo fmt --all --
|
||||
|
|
|
@ -26,7 +26,7 @@ const CSS_PROPS_WITH_IMAGE_URLS: &[&str] = &[
|
|||
"suffix",
|
||||
"symbols",
|
||||
];
|
||||
const CSS_SPECIAL_CHARS: &str = "~!@$%^&*()+=,./'\";:?><[]{}|`#";
|
||||
const CSS_SPECIAL_CHARS: &'static str = "~!@$%^&*()+=,./'\";:?><[]{}|`#";
|
||||
|
||||
pub fn is_image_url_prop(prop_name: &str) -> bool {
|
||||
CSS_PROPS_WITH_IMAGE_URLS
|
||||
|
|
46
src/html.rs
46
src/html.rs
|
@ -28,7 +28,7 @@ struct SrcSetItem<'a> {
|
|||
descriptor: &'a str,
|
||||
}
|
||||
|
||||
const ICON_VALUES: &[&str] = &["icon", "shortcut icon"];
|
||||
const ICON_VALUES: &'static [&str] = &["icon", "shortcut icon"];
|
||||
|
||||
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
|
@ -105,7 +105,7 @@ pub fn compose_csp(options: &Options) -> String {
|
|||
}
|
||||
|
||||
if options.no_images {
|
||||
// Note: data: is needed for transparent pixels
|
||||
// Note: "data:" is required for transparent pixel images to work
|
||||
string_list.push("img-src data:;");
|
||||
}
|
||||
|
||||
|
@ -127,22 +127,17 @@ pub fn create_metadata_tag(url: &str) -> String {
|
|||
clean_url.set_password(None).unwrap();
|
||||
}
|
||||
|
||||
if is_http_url(url) {
|
||||
format!(
|
||||
"<!-- Saved from {} at {} using {} v{} -->",
|
||||
&clean_url,
|
||||
timestamp,
|
||||
env!("CARGO_PKG_NAME"),
|
||||
env!("CARGO_PKG_VERSION"),
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"<!-- Saved from local source at {} using {} v{} -->",
|
||||
timestamp,
|
||||
env!("CARGO_PKG_NAME"),
|
||||
env!("CARGO_PKG_VERSION"),
|
||||
)
|
||||
}
|
||||
format!(
|
||||
"<!-- Saved from {} at {} using {} v{} -->",
|
||||
if is_http_url(url) {
|
||||
&clean_url.as_str()
|
||||
} else {
|
||||
"local source"
|
||||
},
|
||||
timestamp,
|
||||
env!("CARGO_PKG_NAME"),
|
||||
env!("CARGO_PKG_VERSION"),
|
||||
)
|
||||
}
|
||||
Err(_) => str!(),
|
||||
}
|
||||
|
@ -498,12 +493,12 @@ pub fn walk_and_embed_assets(
|
|||
} => {
|
||||
match name.local.as_ref() {
|
||||
"meta" => {
|
||||
// Remove http-equiv attributes from META nodes if they're able to control the page
|
||||
if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") {
|
||||
let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value;
|
||||
if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh")
|
||||
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
|
||||
{
|
||||
// Remove http-equiv attributes from META nodes if they're able to control the page
|
||||
set_node_attr(
|
||||
&node,
|
||||
"http-equiv",
|
||||
|
@ -512,7 +507,20 @@ pub fn walk_and_embed_assets(
|
|||
meta_attr_http_equiv_value
|
||||
)),
|
||||
);
|
||||
} else if meta_attr_http_equiv_value.eq_ignore_ascii_case("Content-Type") {
|
||||
// Enforce charset to be set to UTF-8
|
||||
if let Some(_attr_value) = get_node_attr(node, "content") {
|
||||
set_node_attr(
|
||||
&node,
|
||||
"content",
|
||||
Some(str!("text/html; charset=utf-8")),
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if let Some(_meta_attr_http_equiv_value) = get_node_attr(node, "charset")
|
||||
{
|
||||
// Enforce charset to be set to UTF-8
|
||||
set_node_attr(&node, "charset", Some(str!("utf-8")));
|
||||
}
|
||||
}
|
||||
"link" => {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
const JS_DOM_EVENT_ATTRS: &[&str] = &[
|
||||
const JS_DOM_EVENT_ATTRS: &'static [&str] = &[
|
||||
// From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects":
|
||||
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
|
||||
// https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes")
|
||||
|
|
|
@ -23,7 +23,7 @@ pub struct Options {
|
|||
pub no_color: bool,
|
||||
}
|
||||
|
||||
const ASCII: &str = " \
|
||||
const ASCII: &'static str = " \
|
||||
_____ ______________ __________ ___________________ ___
|
||||
| \\ / \\ | | | | | |
|
||||
| \\_/ __ \\_| __ | | ___ ___ |__| |
|
||||
|
@ -33,7 +33,7 @@ const ASCII: &str = " \
|
|||
|___| |__________| \\_____________________| |___| |___| |___|
|
||||
";
|
||||
const DEFAULT_NETWORK_TIMEOUT: u64 = 120;
|
||||
const DEFAULT_USER_AGENT: &str =
|
||||
const DEFAULT_USER_AGENT: &'static str =
|
||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0";
|
||||
const ENV_VAR_NO_COLOR: &str = "NO_COLOR";
|
||||
const ENV_VAR_TERM: &str = "TERM";
|
||||
|
|
|
@ -69,8 +69,7 @@ mod passing {
|
|||
#[test]
|
||||
fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
let cwd_normalized: String =
|
||||
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
|
||||
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
|
||||
let out = cmd
|
||||
.arg("-M")
|
||||
|
|
|
@ -2,3 +2,4 @@ mod base_url;
|
|||
mod basic;
|
||||
mod data_url;
|
||||
mod local_files;
|
||||
mod unusual_encodings;
|
||||
|
|
51
src/tests/cli/unusual_encodings.rs
Normal file
51
src/tests/cli/unusual_encodings.rs
Normal file
|
@ -0,0 +1,51 @@
|
|||
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
|
||||
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
|
||||
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
|
||||
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
|
||||
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
|
||||
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||
|
||||
#[cfg(test)]
|
||||
mod passing {
|
||||
use assert_cmd::prelude::*;
|
||||
use std::env;
|
||||
use std::process::Command;
|
||||
|
||||
#[test]
|
||||
fn change_encoding_to_utf_8() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
|
||||
let out = cmd
|
||||
.arg("-M")
|
||||
.arg(if cfg!(windows) {
|
||||
"src\\tests\\data\\unusual_encodings\\iso-8859-1.html"
|
||||
} else {
|
||||
"src/tests/data/unusual_encodings/iso-8859-1.html"
|
||||
})
|
||||
.output()
|
||||
.unwrap();
|
||||
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||
|
||||
// STDOUT should contain the HTML document with its charset changed to UTF-8
|
||||
assert_eq!(
|
||||
std::str::from_utf8(&out.stdout).unwrap(),
|
||||
"<html><head>\n <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n </head>\n <body>\n © Some Company\n \n\n</body></html>\n"
|
||||
);
|
||||
|
||||
// STDERR should contain only the target file
|
||||
assert_eq!(
|
||||
std::str::from_utf8(&out.stderr).unwrap(),
|
||||
format!(
|
||||
"{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
|
||||
file = file_url_protocol,
|
||||
cwd = cwd_normalized,
|
||||
)
|
||||
);
|
||||
|
||||
// The exit code should be 0
|
||||
out.assert().code(0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
8
src/tests/data/unusual_encodings/iso-8859-1.html
Normal file
8
src/tests/data/unusual_encodings/iso-8859-1.html
Normal file
|
@ -0,0 +1,8 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
© Some Company
|
||||
</body>
|
||||
</html>
|
20
src/utils.rs
20
src/utils.rs
|
@ -7,9 +7,10 @@ use std::path::Path;
|
|||
use crate::opts::Options;
|
||||
use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url};
|
||||
|
||||
const ANSI_COLOR_RED: &str = "\x1b[31m";
|
||||
const ANSI_COLOR_RESET: &str = "\x1b[0m";
|
||||
const INDENT: &str = " ";
|
||||
const ANSI_COLOR_RED: &'static str = "\x1b[31m";
|
||||
const ANSI_COLOR_RESET: &'static str = "\x1b[0m";
|
||||
const INDENT: &'static str = " ";
|
||||
|
||||
const MAGIC: [[&[u8]; 2]; 18] = [
|
||||
// Image
|
||||
[b"GIF87a", b"image/gif"],
|
||||
|
@ -34,11 +35,13 @@ const MAGIC: [[&[u8]; 2]; 18] = [
|
|||
[b"\x1A\x45\xDF\xA3", b"video/webm"],
|
||||
];
|
||||
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
|
||||
"application/javascript",
|
||||
"image/svg+xml",
|
||||
"text/css",
|
||||
"text/html",
|
||||
"text/javascript",
|
||||
"text/plain",
|
||||
// "text/css",
|
||||
// "text/csv",
|
||||
// "text/html",
|
||||
// "text/javascript",
|
||||
// "text/plain",
|
||||
];
|
||||
|
||||
pub fn detect_media_type(data: &[u8], url: &str) -> String {
|
||||
|
@ -56,7 +59,8 @@ pub fn detect_media_type(data: &[u8], url: &str) -> String {
|
|||
}
|
||||
|
||||
pub fn is_plaintext_media_type(media_type: &str) -> bool {
|
||||
PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
|
||||
media_type.to_lowercase().as_str().starts_with("text/")
|
||||
|| PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
|
||||
}
|
||||
|
||||
pub fn indent(level: u32) -> String {
|
||||
|
|
Loading…
Reference in a new issue