forcefully set document's charset to UTF-8

This commit is contained in:
Sunshine 2021-02-23 23:33:45 -10:00
parent 4ad07c0519
commit e0273c664a
No known key found for this signature in database
GPG key ID: B80CA68703CD8AB1
11 changed files with 106 additions and 33 deletions

View file

@ -10,7 +10,7 @@ build:
test: build test: build
@cargo test --locked @cargo test --locked
@cargo fmt --all -- --check @cargo fmt --all -- --check
.PHONY: test_code_formatting .PHONY: test
lint: lint:
@cargo fmt --all -- @cargo fmt --all --

View file

@ -26,7 +26,7 @@ const CSS_PROPS_WITH_IMAGE_URLS: &[&str] = &[
"suffix", "suffix",
"symbols", "symbols",
]; ];
const CSS_SPECIAL_CHARS: &str = "~!@$%^&*()+=,./'\";:?><[]{}|`#"; const CSS_SPECIAL_CHARS: &'static str = "~!@$%^&*()+=,./'\";:?><[]{}|`#";
pub fn is_image_url_prop(prop_name: &str) -> bool { pub fn is_image_url_prop(prop_name: &str) -> bool {
CSS_PROPS_WITH_IMAGE_URLS CSS_PROPS_WITH_IMAGE_URLS

View file

@ -28,7 +28,7 @@ struct SrcSetItem<'a> {
descriptor: &'a str, descriptor: &'a str,
} }
const ICON_VALUES: &[&str] = &["icon", "shortcut icon"]; const ICON_VALUES: &'static [&str] = &["icon", "shortcut icon"];
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom { pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
@ -105,7 +105,7 @@ pub fn compose_csp(options: &Options) -> String {
} }
if options.no_images { if options.no_images {
// Note: data: is needed for transparent pixels // Note: "data:" is required for transparent pixel images to work
string_list.push("img-src data:;"); string_list.push("img-src data:;");
} }
@ -127,22 +127,17 @@ pub fn create_metadata_tag(url: &str) -> String {
clean_url.set_password(None).unwrap(); clean_url.set_password(None).unwrap();
} }
if is_http_url(url) { format!(
format!( "<!-- Saved from {} at {} using {} v{} -->",
"<!-- Saved from {} at {} using {} v{} -->", if is_http_url(url) {
&clean_url, &clean_url.as_str()
timestamp, } else {
env!("CARGO_PKG_NAME"), "local source"
env!("CARGO_PKG_VERSION"), },
) timestamp,
} else { env!("CARGO_PKG_NAME"),
format!( env!("CARGO_PKG_VERSION"),
"<!-- Saved from local source at {} using {} v{} -->", )
timestamp,
env!("CARGO_PKG_NAME"),
env!("CARGO_PKG_VERSION"),
)
}
} }
Err(_) => str!(), Err(_) => str!(),
} }
@ -498,12 +493,12 @@ pub fn walk_and_embed_assets(
} => { } => {
match name.local.as_ref() { match name.local.as_ref() {
"meta" => { "meta" => {
// Remove http-equiv attributes from META nodes if they're able to control the page
if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") { if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") {
let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value; let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value;
if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh") if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh")
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location") || meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
{ {
// Remove http-equiv attributes from META nodes if they're able to control the page
set_node_attr( set_node_attr(
&node, &node,
"http-equiv", "http-equiv",
@ -512,7 +507,20 @@ pub fn walk_and_embed_assets(
meta_attr_http_equiv_value meta_attr_http_equiv_value
)), )),
); );
} else if meta_attr_http_equiv_value.eq_ignore_ascii_case("Content-Type") {
// Enforce charset to be set to UTF-8
if let Some(_attr_value) = get_node_attr(node, "content") {
set_node_attr(
&node,
"content",
Some(str!("text/html; charset=utf-8")),
);
}
} }
} else if let Some(_meta_attr_http_equiv_value) = get_node_attr(node, "charset")
{
// Enforce charset to be set to UTF-8
set_node_attr(&node, "charset", Some(str!("utf-8")));
} }
} }
"link" => { "link" => {

View file

@ -1,4 +1,4 @@
const JS_DOM_EVENT_ATTRS: &[&str] = &[ const JS_DOM_EVENT_ATTRS: &'static [&str] = &[
// From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects": // From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects":
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects // https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
// https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes") // https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes")

View file

@ -212,6 +212,9 @@ fn main() {
} }
} }
// Remove charset meta-tag
// set_charset_meta_to_utf8(&dom.document);
// Serialize DOM tree // Serialize DOM tree
let mut result: String = stringify_document(&dom.document, &options); let mut result: String = stringify_document(&dom.document, &options);

View file

@ -21,7 +21,7 @@ pub struct Options {
pub target: String, pub target: String,
} }
const ASCII: &str = " \ const ASCII: &'static str = " \
_____ ______________ __________ ___________________ ___ _____ ______________ __________ ___________________ ___
| \\ / \\ | | | | | | | \\ / \\ | | | | | |
| \\_/ __ \\_| __ | | ___ ___ |__| | | \\_/ __ \\_| __ | | ___ ___ |__| |
@ -31,7 +31,7 @@ const ASCII: &str = " \
|___| |__________| \\_____________________| |___| |___| |___| |___| |__________| \\_____________________| |___| |___| |___|
"; ";
const DEFAULT_NETWORK_TIMEOUT: u64 = 120; const DEFAULT_NETWORK_TIMEOUT: u64 = 120;
const DEFAULT_USER_AGENT: &str = const DEFAULT_USER_AGENT: &'static str =
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0"; "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0";
impl Options { impl Options {

View file

@ -69,8 +69,7 @@ mod passing {
#[test] #[test]
fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> { fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> {
let cwd = env::current_dir().unwrap(); let cwd = env::current_dir().unwrap();
let cwd_normalized: String = let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
let out = cmd let out = cmd
.arg("-M") .arg("-M")

View file

@ -2,3 +2,4 @@ mod base_url;
mod basic; mod basic;
mod data_url; mod data_url;
mod local_files; mod local_files;
mod unusual_encodings;

View file

@ -0,0 +1,51 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use assert_cmd::prelude::*;
    use std::env;
    use std::process::Command;

    /// Runs the binary against a fixture that declares `charset=iso-8859-1` and checks
    /// that the emitted document declares `charset=utf-8` instead.
    ///
    /// Also verifies that STDERR consists of exactly the target's file URL and that the
    /// process exits with code 0.
    ///
    /// NOTE(review): `-M` presumably suppresses the "Saved from ..." metadata comment,
    /// which the expected output below does not contain; confirm against the CLI options.
    #[test]
    fn change_encoding_to_utf_8() -> Result<(), Box<dyn std::error::Error>> {
        // The test already returns `Result`, so environment, spawn and decoding
        // failures are propagated with `?` rather than panicking via `unwrap()`
        let cwd = env::current_dir()?;
        // Backslashes are replaced so the path matches the forward-slash form that the
        // expected file URL below is built from
        let cwd_normalized: String = str!(cwd
            .to_str()
            .expect("current directory path should be valid UTF-8"))
        .replace("\\", "/");
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
        let out = cmd
            .arg("-M")
            .arg(if cfg!(windows) {
                "src\\tests\\data\\unusual_encodings\\iso-8859-1.html"
            } else {
                "src/tests/data/unusual_encodings/iso-8859-1.html"
            })
            .output()?;
        let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };

        // STDOUT should contain the document with its charset declaration rewritten to UTF-8
        assert_eq!(
            std::str::from_utf8(&out.stdout)?,
            "<html><head>\n <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n </head>\n <body>\n © Some Company\n \n\n</body></html>\n"
        );

        // STDERR should contain only the target file
        assert_eq!(
            std::str::from_utf8(&out.stderr)?,
            format!(
                "{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
                file = file_url_protocol,
                cwd = cwd_normalized,
            )
        );

        // The exit code should be 0
        out.assert().code(0);

        Ok(())
    }
}

View file

@ -0,0 +1,8 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
&copy; Some Company
</body>
</html>

View file

@ -7,7 +7,7 @@ use std::path::Path;
use crate::opts::Options; use crate::opts::Options;
use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url}; use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url};
const INDENT: &str = " "; const INDENT: &'static str = " ";
const MAGIC: [[&[u8]; 2]; 18] = [ const MAGIC: [[&[u8]; 2]; 18] = [
// Image // Image
@ -34,11 +34,13 @@ const MAGIC: [[&[u8]; 2]; 18] = [
]; ];
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[ const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
"application/javascript",
"image/svg+xml", "image/svg+xml",
"text/css", // "text/css",
"text/html", // "text/csv",
"text/javascript", // "text/html",
"text/plain", // "text/javascript",
// "text/plain",
]; ];
pub fn detect_media_type(data: &[u8], url: &str) -> String { pub fn detect_media_type(data: &[u8], url: &str) -> String {
@ -56,7 +58,8 @@ pub fn detect_media_type(data: &[u8], url: &str) -> String {
} }
pub fn is_plaintext_media_type(media_type: &str) -> bool { pub fn is_plaintext_media_type(media_type: &str) -> bool {
PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str()) media_type.to_lowercase().as_str().starts_with("text/")
|| PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
} }
pub fn indent(level: u32) -> String { pub fn indent(level: u32) -> String {
@ -125,7 +128,7 @@ pub fn retrieve_asset(
Ok(mut response) => { Ok(mut response) => {
if !options.ignore_errors && response.status() != 200 { if !options.ignore_errors && response.status() != 200 {
if !options.silent { if !options.silent {
eprintln!("Unable to retrieve {} ({})", &url, response.status()); eprintln!("Unable to retrieve {} (error: {})", &url, response.status());
} }
// Provoke error // Provoke error
return Err(client.get("").send().unwrap_err()); return Err(client.get("").send().unwrap_err());