forcefully set document's charset to UTF-8

This commit is contained in:
Sunshine 2021-02-23 23:33:45 -10:00
parent 4ad07c0519
commit e0273c664a
No known key found for this signature in database
GPG key ID: B80CA68703CD8AB1
11 changed files with 106 additions and 33 deletions

View file

@ -10,7 +10,7 @@ build:
test: build
@cargo test --locked
@cargo fmt --all -- --check
.PHONY: test_code_formatting
.PHONY: test
lint:
@cargo fmt --all --

View file

@ -26,7 +26,7 @@ const CSS_PROPS_WITH_IMAGE_URLS: &[&str] = &[
"suffix",
"symbols",
];
const CSS_SPECIAL_CHARS: &str = "~!@$%^&*()+=,./'\";:?><[]{}|`#";
const CSS_SPECIAL_CHARS: &'static str = "~!@$%^&*()+=,./'\";:?><[]{}|`#";
pub fn is_image_url_prop(prop_name: &str) -> bool {
CSS_PROPS_WITH_IMAGE_URLS

View file

@ -28,7 +28,7 @@ struct SrcSetItem<'a> {
descriptor: &'a str,
}
const ICON_VALUES: &[&str] = &["icon", "shortcut icon"];
const ICON_VALUES: &'static [&str] = &["icon", "shortcut icon"];
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
let mut buf: Vec<u8> = Vec::new();
@ -105,7 +105,7 @@ pub fn compose_csp(options: &Options) -> String {
}
if options.no_images {
// Note: data: is needed for transparent pixels
// Note: "data:" is required for transparent pixel images to work
string_list.push("img-src data:;");
}
@ -127,23 +127,18 @@ pub fn create_metadata_tag(url: &str) -> String {
clean_url.set_password(None).unwrap();
}
if is_http_url(url) {
format!(
"<!-- Saved from {} at {} using {} v{} -->",
&clean_url,
timestamp,
env!("CARGO_PKG_NAME"),
env!("CARGO_PKG_VERSION"),
)
if is_http_url(url) {
&clean_url.as_str()
} else {
format!(
"<!-- Saved from local source at {} using {} v{} -->",
"local source"
},
timestamp,
env!("CARGO_PKG_NAME"),
env!("CARGO_PKG_VERSION"),
)
}
}
Err(_) => str!(),
}
}
@ -498,12 +493,12 @@ pub fn walk_and_embed_assets(
} => {
match name.local.as_ref() {
"meta" => {
// Remove http-equiv attributes from META nodes if they're able to control the page
if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") {
let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value;
if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh")
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
{
// Remove http-equiv attributes from META nodes if they're able to control the page
set_node_attr(
&node,
"http-equiv",
@ -512,8 +507,21 @@ pub fn walk_and_embed_assets(
meta_attr_http_equiv_value
)),
);
} else if meta_attr_http_equiv_value.eq_ignore_ascii_case("Content-Type") {
// Enforce charset to be set to UTF-8
if let Some(_attr_value) = get_node_attr(node, "content") {
set_node_attr(
&node,
"content",
Some(str!("text/html; charset=utf-8")),
);
}
}
} else if let Some(_meta_attr_http_equiv_value) = get_node_attr(node, "charset")
{
// Enforce charset to be set to UTF-8
set_node_attr(&node, "charset", Some(str!("utf-8")));
}
}
"link" => {
// Read and remember integrity attribute value of this LINK node

View file

@ -1,4 +1,4 @@
const JS_DOM_EVENT_ATTRS: &[&str] = &[
const JS_DOM_EVENT_ATTRS: &'static [&str] = &[
// From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects":
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
// https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes")

View file

@ -212,6 +212,9 @@ fn main() {
}
}
// Force charset meta-tag to UTF-8 (call currently disabled)
// set_charset_meta_to_utf8(&dom.document);
// Serialize DOM tree
let mut result: String = stringify_document(&dom.document, &options);

View file

@ -21,7 +21,7 @@ pub struct Options {
pub target: String,
}
const ASCII: &str = " \
const ASCII: &'static str = " \
_____ ______________ __________ ___________________ ___
| \\ / \\ | | | | | |
| \\_/ __ \\_| __ | | ___ ___ |__| |
@ -31,7 +31,7 @@ const ASCII: &str = " \
|___| |__________| \\_____________________| |___| |___| |___|
";
const DEFAULT_NETWORK_TIMEOUT: u64 = 120;
const DEFAULT_USER_AGENT: &str =
const DEFAULT_USER_AGENT: &'static str =
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0";
impl Options {

View file

@ -69,8 +69,7 @@ mod passing {
#[test]
fn local_file_target_input_absolute_target_path() -> Result<(), Box<dyn std::error::Error>> {
let cwd = env::current_dir().unwrap();
let cwd_normalized: String =
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
let out = cmd
.arg("-M")

View file

@ -2,3 +2,4 @@ mod base_url;
mod basic;
mod data_url;
mod local_files;
mod unusual_encodings;

View file

@ -0,0 +1,51 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use assert_cmd::prelude::*;
    use std::env;
    use std::process::Command;

    /// Saves a local document that declares `charset=iso-8859-1` in its
    /// `Content-Type` META tag and checks that the emitted document declares
    /// `charset=utf-8` instead.
    ///
    /// Any I/O or decoding failure is propagated through the returned
    /// `Result`, so the test reports the underlying error rather than
    /// panicking on an `unwrap`.
    #[test]
    fn change_encoding_to_utf_8() -> Result<(), Box<dyn std::error::Error>> {
        // Expected file URLs always use forward slashes, even on Windows
        let cwd = env::current_dir()?;
        let cwd_normalized: String =
            str!(cwd.to_str().ok_or("current directory is not valid UTF-8")?).replace("\\", "/");

        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?;
        let out = cmd
            .arg("-M")
            .arg(if cfg!(windows) {
                "src\\tests\\data\\unusual_encodings\\iso-8859-1.html"
            } else {
                "src/tests/data/unusual_encodings/iso-8859-1.html"
            })
            .output()?;
        let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };

        // STDOUT should contain the document with its charset changed to UTF-8
        assert_eq!(
            std::str::from_utf8(&out.stdout)?,
            "<html><head>\n <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n </head>\n <body>\n © Some Company\n \n\n</body></html>\n"
        );

        // STDERR should contain only the target file
        assert_eq!(
            std::str::from_utf8(&out.stderr)?,
            format!(
                "{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
                file = file_url_protocol,
                cwd = cwd_normalized,
            )
        );

        // The exit code should be 0
        out.assert().code(0);

        Ok(())
    }
}

View file

@ -0,0 +1,8 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
&copy; Some Company
</body>
</html>

View file

@ -7,7 +7,7 @@ use std::path::Path;
use crate::opts::Options;
use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url};
const INDENT: &str = " ";
const INDENT: &'static str = " ";
const MAGIC: [[&[u8]; 2]; 18] = [
// Image
@ -34,11 +34,13 @@ const MAGIC: [[&[u8]; 2]; 18] = [
];
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
"application/javascript",
"image/svg+xml",
"text/css",
"text/html",
"text/javascript",
"text/plain",
// "text/css",
// "text/csv",
// "text/html",
// "text/javascript",
// "text/plain",
];
pub fn detect_media_type(data: &[u8], url: &str) -> String {
@ -56,7 +58,8 @@ pub fn detect_media_type(data: &[u8], url: &str) -> String {
}
pub fn is_plaintext_media_type(media_type: &str) -> bool {
PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
media_type.to_lowercase().as_str().starts_with("text/")
|| PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
}
pub fn indent(level: u32) -> String {
@ -125,7 +128,7 @@ pub fn retrieve_asset(
Ok(mut response) => {
if !options.ignore_errors && response.status() != 200 {
if !options.silent {
eprintln!("Unable to retrieve {} ({})", &url, response.status());
eprintln!("Unable to retrieve {} (error: {})", &url, response.status());
}
// Provoke error
return Err(client.get("").send().unwrap_err());