add option for saving document using custom encoding
This commit is contained in:
parent
cbda57cfa8
commit
b29b9a6a7c
6 changed files with 168 additions and 49 deletions
|
@ -71,6 +71,7 @@ or
|
|||
- `-a`: Exclude audio sources
|
||||
- `-b`: Use custom `base URL`
|
||||
- `-c`: Exclude CSS
|
||||
- `-C`: Save document using custom `charset`
|
||||
- `-e`: Ignore network errors
|
||||
- `-f`: Omit frames
|
||||
- `-F`: Exclude web fonts
|
||||
|
@ -80,7 +81,7 @@ or
|
|||
- `-k`: Accept invalid X.509 (TLS) certificates
|
||||
- `-M`: Don't add timestamp and URL information
|
||||
- `-n`: Extract contents of NOSCRIPT elements
|
||||
- `-o`: Write output to `file`
|
||||
- `-o`: Write output to `file`, use “-” for STDOUT
|
||||
- `-s`: Be quiet
|
||||
- `-t`: Adjust `network request timeout`
|
||||
- `-u`: Provide custom `User-Agent`
|
||||
|
|
16
src/main.rs
16
src/main.rs
|
@ -1,3 +1,4 @@
|
|||
use encoding_rs::Encoding;
|
||||
use html5ever::rcdom::RcDom;
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
|
@ -76,6 +77,14 @@ fn main() {
|
|||
process::exit(1);
|
||||
}
|
||||
|
||||
// Check if custom charset is valid
|
||||
if let Some(custom_charset) = options.charset.clone() {
|
||||
if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() {
|
||||
eprintln!("Unknown encoding: {}", &custom_charset);
|
||||
process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
let target_url: Url;
|
||||
let mut base_url: Url;
|
||||
let mut use_stdin: bool = false;
|
||||
|
@ -296,10 +305,9 @@ fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
// Enforce UTF-8 encoding for documents that may end up having garbled html entities
|
||||
// due to html5ever forcefully converting them into UTF-8 byte sequences.
|
||||
if document_encoding.eq_ignore_ascii_case("iso-8859-1") {
|
||||
document_encoding = str!("utf-8");
|
||||
// Save using specified charset, if given
|
||||
if let Some(custom_charset) = options.charset.clone() {
|
||||
document_encoding = custom_charset;
|
||||
dom = set_charset(dom, document_encoding.clone());
|
||||
}
|
||||
|
||||
|
|
11
src/opts.rs
11
src/opts.rs
|
@ -6,6 +6,7 @@ pub struct Options {
|
|||
pub no_audio: bool,
|
||||
pub base_url: Option<String>,
|
||||
pub no_css: bool,
|
||||
pub charset: Option<String>,
|
||||
pub ignore_errors: bool,
|
||||
pub no_frames: bool,
|
||||
pub no_fonts: bool,
|
||||
|
@ -48,6 +49,7 @@ impl Options {
|
|||
.args_from_usage("-a, --no-audio 'Removes audio sources'")
|
||||
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
|
||||
.args_from_usage("-c, --no-css 'Removes CSS'")
|
||||
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
|
||||
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
|
||||
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
||||
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
||||
|
@ -59,7 +61,9 @@ impl Options {
|
|||
.args_from_usage(
|
||||
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
|
||||
)
|
||||
.args_from_usage("-o, --output=[document.html] 'Writes output to <file>'")
|
||||
.args_from_usage(
|
||||
"-o, --output=[document.html] 'Writes output to <file>, use - for STDOUT'",
|
||||
)
|
||||
.args_from_usage("-s, --silent 'Suppresses verbosity'")
|
||||
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
|
||||
.args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'")
|
||||
|
@ -69,7 +73,7 @@ impl Options {
|
|||
.required(true)
|
||||
.takes_value(true)
|
||||
.index(1)
|
||||
.help("URL or file path, use - for stdin"),
|
||||
.help("URL or file path, use - for STDIN"),
|
||||
)
|
||||
.get_matches();
|
||||
let mut options: Options = Options::default();
|
||||
|
@ -84,6 +88,9 @@ impl Options {
|
|||
options.base_url = Some(str!(base_url));
|
||||
}
|
||||
options.no_css = app.is_present("no-css");
|
||||
if let Some(charset) = app.value_of("charset") {
|
||||
options.charset = Some(str!(charset));
|
||||
}
|
||||
options.ignore_errors = app.is_present("ignore-errors");
|
||||
options.no_frames = app.is_present("no-frames");
|
||||
options.no_fonts = app.is_present("no-fonts");
|
||||
|
|
|
@ -14,6 +14,21 @@ mod passing {
|
|||
use std::process::{Command, Stdio};
|
||||
use url::Url;
|
||||
|
||||
#[test]
|
||||
fn print_help_information() {
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||
let out = cmd.arg("-h").output().unwrap();
|
||||
|
||||
// STDERR should be empty
|
||||
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
|
||||
|
||||
// STDOUT should contain program name, version, and usage information
|
||||
// TODO
|
||||
|
||||
// Exit code should be 0
|
||||
out.assert().code(0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn print_version() {
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||
|
|
|
@ -13,48 +13,6 @@ mod passing {
|
|||
use std::path::MAIN_SEPARATOR;
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
#[test]
|
||||
fn change_iso88591_to_utf8_to_properly_display_html_entities() {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||
let out = cmd
|
||||
.arg("-M")
|
||||
.arg(format!(
|
||||
"src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html",
|
||||
s = MAIN_SEPARATOR
|
||||
))
|
||||
.output()
|
||||
.unwrap();
|
||||
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||
|
||||
// STDERR should contain only the target file
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(&out.stderr),
|
||||
format!(
|
||||
"{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
|
||||
file = file_url_protocol,
|
||||
cwd = cwd_normalized,
|
||||
)
|
||||
);
|
||||
|
||||
// STDOUT should contain original document but with UTF-8 charset
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(&out.stdout),
|
||||
"<html>\
|
||||
<head>\n \
|
||||
<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">\n \
|
||||
</head>\n \
|
||||
<body>\n \
|
||||
© Some Company\n \
|
||||
\n\n</body>\
|
||||
</html>\n"
|
||||
);
|
||||
|
||||
// Exit code should be 0
|
||||
out.assert().code(0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn properly_save_document_with_gb2312() {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
|
@ -149,4 +107,133 @@ mod passing {
|
|||
// Exit code should be 0
|
||||
out.assert().code(0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn properly_save_document_with_gb2312_custom_charset() {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||
let out = cmd
|
||||
.arg("-M")
|
||||
.arg("-C")
|
||||
.arg("utf8")
|
||||
.arg(format!(
|
||||
"src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
|
||||
s = MAIN_SEPARATOR
|
||||
))
|
||||
.output()
|
||||
.unwrap();
|
||||
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||
|
||||
// STDERR should contain only the target file
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(&out.stderr),
|
||||
format!(
|
||||
"{file}{cwd}/src/tests/data/unusual_encodings/gb2312.html\n",
|
||||
file = file_url_protocol,
|
||||
cwd = cwd_normalized,
|
||||
)
|
||||
);
|
||||
|
||||
// STDOUT should contain the original document with its charset replaced by the custom one
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(&out.stdout).to_string(),
|
||||
"<html>\
|
||||
<head>\n \
|
||||
<meta http-equiv=\"content-type\" content=\"text/html;charset=utf8\">\n \
|
||||
<title>近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 </title>\n\
|
||||
</head>\n\
|
||||
<body>\n \
|
||||
<h1>近七成人减少线下需求\u{3000}银行数字化转型提速</h1>\n\n\n\
|
||||
</body>\
|
||||
</html>\n"
|
||||
);
|
||||
|
||||
// Exit code should be 0
|
||||
out.assert().code(0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn properly_save_document_with_gb2312_custom_charset_bad() {
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||
let out = cmd
|
||||
.arg("-M")
|
||||
.arg("-C")
|
||||
.arg("utf0")
|
||||
.arg(format!(
|
||||
"src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
|
||||
s = MAIN_SEPARATOR
|
||||
))
|
||||
.output()
|
||||
.unwrap();
|
||||
|
||||
// STDERR should contain error message
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(&out.stderr),
|
||||
"Unknown encoding: utf0\n"
|
||||
);
|
||||
|
||||
// STDOUT should be empty
|
||||
assert_eq!(String::from_utf8_lossy(&out.stdout).to_string(), "");
|
||||
|
||||
// Exit code should be 1
|
||||
out.assert().code(1);
|
||||
}
|
||||
}
|
||||
|
||||
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
|
||||
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
|
||||
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
|
||||
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
|
||||
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
|
||||
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||
|
||||
#[cfg(test)]
|
||||
mod failing {
|
||||
use assert_cmd::prelude::*;
|
||||
use std::env;
|
||||
use std::path::MAIN_SEPARATOR;
|
||||
use std::process::Command;
|
||||
|
||||
#[test]
|
||||
fn change_iso88591_to_utf8_to_properly_display_html_entities() {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||
let out = cmd
|
||||
.arg("-M")
|
||||
.arg(format!(
|
||||
"src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html",
|
||||
s = MAIN_SEPARATOR
|
||||
))
|
||||
.output()
|
||||
.unwrap();
|
||||
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||
|
||||
// STDERR should contain only the target file
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(&out.stderr),
|
||||
format!(
|
||||
"{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
|
||||
file = file_url_protocol,
|
||||
cwd = cwd_normalized,
|
||||
)
|
||||
);
|
||||
|
||||
// STDOUT should contain original document with its ISO-8859-1 charset left unchanged (and a garbled character)
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(&out.stdout),
|
||||
"<html>\
|
||||
<head>\n \
|
||||
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">\n \
|
||||
</head>\n \
|
||||
<body>\n \
|
||||
<EFBFBD> Some Company\n \
|
||||
\n\n</body>\
|
||||
</html>\n"
|
||||
);
|
||||
|
||||
// Exit code should be 0
|
||||
out.assert().code(0);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@ mod passing {
|
|||
assert_eq!(options.no_audio, false);
|
||||
assert_eq!(options.base_url, None);
|
||||
assert_eq!(options.no_css, false);
|
||||
assert_eq!(options.charset, None);
|
||||
assert_eq!(options.no_frames, false);
|
||||
assert_eq!(options.no_fonts, false);
|
||||
assert_eq!(options.no_images, false);
|
||||
|
|
Loading…
Reference in a new issue