Merge pull request #263 from snshn/save-with-custom-charset
Add option for saving document using custom encoding
This commit is contained in:
commit
f354affc36
|
@ -71,6 +71,7 @@ or
|
||||||
- `-a`: Exclude audio sources
|
- `-a`: Exclude audio sources
|
||||||
- `-b`: Use custom `base URL`
|
- `-b`: Use custom `base URL`
|
||||||
- `-c`: Exclude CSS
|
- `-c`: Exclude CSS
|
||||||
|
- `-C`: Save document using custom `charset`
|
||||||
- `-e`: Ignore network errors
|
- `-e`: Ignore network errors
|
||||||
- `-f`: Omit frames
|
- `-f`: Omit frames
|
||||||
- `-F`: Exclude web fonts
|
- `-F`: Exclude web fonts
|
||||||
|
@ -80,7 +81,7 @@ or
|
||||||
- `-k`: Accept invalid X.509 (TLS) certificates
|
- `-k`: Accept invalid X.509 (TLS) certificates
|
||||||
- `-M`: Don't add timestamp and URL information
|
- `-M`: Don't add timestamp and URL information
|
||||||
- `-n`: Extract contents of NOSCRIPT elements
|
- `-n`: Extract contents of NOSCRIPT elements
|
||||||
- `-o`: Write output to `file`
|
- `-o`: Write output to `file`, use “-” for STDOUT
|
||||||
- `-s`: Be quiet
|
- `-s`: Be quiet
|
||||||
- `-t`: Adjust `network request timeout`
|
- `-t`: Adjust `network request timeout`
|
||||||
- `-u`: Provide custom `User-Agent`
|
- `-u`: Provide custom `User-Agent`
|
||||||
|
|
32
src/main.rs
32
src/main.rs
|
@ -1,3 +1,4 @@
|
||||||
|
use encoding_rs::Encoding;
|
||||||
use html5ever::rcdom::RcDom;
|
use html5ever::rcdom::RcDom;
|
||||||
use reqwest::blocking::Client;
|
use reqwest::blocking::Client;
|
||||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||||
|
@ -76,6 +77,14 @@ fn main() {
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if custom charset is valid
|
||||||
|
if let Some(custom_charset) = options.charset.clone() {
|
||||||
|
if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() {
|
||||||
|
eprintln!("Unknown encoding: {}", &custom_charset);
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let target_url: Url;
|
let target_url: Url;
|
||||||
let mut base_url: Url;
|
let mut base_url: Url;
|
||||||
let mut use_stdin: bool = false;
|
let mut use_stdin: bool = false;
|
||||||
|
@ -201,15 +210,23 @@ fn main() {
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initial parse to read document's charset from META tag
|
// Initial parse
|
||||||
dom = html_to_dom(&data, document_encoding.clone());
|
dom = html_to_dom(&data, document_encoding.clone());
|
||||||
|
|
||||||
|
// TODO: investigate if charset from filesystem/data URL/HTTP headers
|
||||||
|
// has power over what's specified in HTML
|
||||||
|
|
||||||
// Attempt to determine document's charset
|
// Attempt to determine document's charset
|
||||||
if let Some(charset) = get_charset(&dom.document) {
|
if let Some(charset) = get_charset(&dom.document) {
|
||||||
if !charset.is_empty() {
|
if !charset.is_empty() {
|
||||||
// TODO && label(charset) != UTF_8
|
// Check if the charset specified inside HTML is valid
|
||||||
document_encoding = charset;
|
if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
|
||||||
dom = html_to_dom(&data, document_encoding.clone());
|
// No point in parsing HTML again with the same encoding as before
|
||||||
|
if encoding.name() != "UTF-8" {
|
||||||
|
document_encoding = charset;
|
||||||
|
dom = html_to_dom(&data, document_encoding.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -296,10 +313,9 @@ fn main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enforce UTF-8 encoding for documents that may end up having garbled html entities
|
// Save using specified charset, if given
|
||||||
// due to html5ever forcefully converting them into UTF-8 byte sequences.
|
if let Some(custom_charset) = options.charset.clone() {
|
||||||
if document_encoding.eq_ignore_ascii_case("iso-8859-1") {
|
document_encoding = custom_charset;
|
||||||
document_encoding = str!("utf-8");
|
|
||||||
dom = set_charset(dom, document_encoding.clone());
|
dom = set_charset(dom, document_encoding.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
11
src/opts.rs
11
src/opts.rs
|
@ -6,6 +6,7 @@ pub struct Options {
|
||||||
pub no_audio: bool,
|
pub no_audio: bool,
|
||||||
pub base_url: Option<String>,
|
pub base_url: Option<String>,
|
||||||
pub no_css: bool,
|
pub no_css: bool,
|
||||||
|
pub charset: Option<String>,
|
||||||
pub ignore_errors: bool,
|
pub ignore_errors: bool,
|
||||||
pub no_frames: bool,
|
pub no_frames: bool,
|
||||||
pub no_fonts: bool,
|
pub no_fonts: bool,
|
||||||
|
@ -48,6 +49,7 @@ impl Options {
|
||||||
.args_from_usage("-a, --no-audio 'Removes audio sources'")
|
.args_from_usage("-a, --no-audio 'Removes audio sources'")
|
||||||
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
|
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
|
||||||
.args_from_usage("-c, --no-css 'Removes CSS'")
|
.args_from_usage("-c, --no-css 'Removes CSS'")
|
||||||
|
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
|
||||||
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
|
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
|
||||||
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
||||||
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
||||||
|
@ -59,7 +61,9 @@ impl Options {
|
||||||
.args_from_usage(
|
.args_from_usage(
|
||||||
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
|
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
|
||||||
)
|
)
|
||||||
.args_from_usage("-o, --output=[document.html] 'Writes output to <file>'")
|
.args_from_usage(
|
||||||
|
"-o, --output=[document.html] 'Writes output to <file>, use - for STDOUT'",
|
||||||
|
)
|
||||||
.args_from_usage("-s, --silent 'Suppresses verbosity'")
|
.args_from_usage("-s, --silent 'Suppresses verbosity'")
|
||||||
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
|
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
|
||||||
.args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'")
|
.args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'")
|
||||||
|
@ -69,7 +73,7 @@ impl Options {
|
||||||
.required(true)
|
.required(true)
|
||||||
.takes_value(true)
|
.takes_value(true)
|
||||||
.index(1)
|
.index(1)
|
||||||
.help("URL or file path, use - for stdin"),
|
.help("URL or file path, use - for STDIN"),
|
||||||
)
|
)
|
||||||
.get_matches();
|
.get_matches();
|
||||||
let mut options: Options = Options::default();
|
let mut options: Options = Options::default();
|
||||||
|
@ -84,6 +88,9 @@ impl Options {
|
||||||
options.base_url = Some(str!(base_url));
|
options.base_url = Some(str!(base_url));
|
||||||
}
|
}
|
||||||
options.no_css = app.is_present("no-css");
|
options.no_css = app.is_present("no-css");
|
||||||
|
if let Some(charset) = app.value_of("charset") {
|
||||||
|
options.charset = Some(str!(charset));
|
||||||
|
}
|
||||||
options.ignore_errors = app.is_present("ignore-errors");
|
options.ignore_errors = app.is_present("ignore-errors");
|
||||||
options.no_frames = app.is_present("no-frames");
|
options.no_frames = app.is_present("no-frames");
|
||||||
options.no_fonts = app.is_present("no-fonts");
|
options.no_fonts = app.is_present("no-fonts");
|
||||||
|
|
|
@ -14,6 +14,21 @@ mod passing {
|
||||||
use std::process::{Command, Stdio};
|
use std::process::{Command, Stdio};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn print_help_information() {
|
||||||
|
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||||
|
let out = cmd.arg("-h").output().unwrap();
|
||||||
|
|
||||||
|
// STDERR should be empty
|
||||||
|
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
|
||||||
|
|
||||||
|
// STDOUT should contain program name, version, and usage information
|
||||||
|
// TODO
|
||||||
|
|
||||||
|
// Exit code should be 0
|
||||||
|
out.assert().code(0);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn print_version() {
|
fn print_version() {
|
||||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||||
|
|
|
@ -13,48 +13,6 @@ mod passing {
|
||||||
use std::path::MAIN_SEPARATOR;
|
use std::path::MAIN_SEPARATOR;
|
||||||
use std::process::{Command, Stdio};
|
use std::process::{Command, Stdio};
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn change_iso88591_to_utf8_to_properly_display_html_entities() {
|
|
||||||
let cwd = env::current_dir().unwrap();
|
|
||||||
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
|
||||||
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
|
||||||
let out = cmd
|
|
||||||
.arg("-M")
|
|
||||||
.arg(format!(
|
|
||||||
"src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html",
|
|
||||||
s = MAIN_SEPARATOR
|
|
||||||
))
|
|
||||||
.output()
|
|
||||||
.unwrap();
|
|
||||||
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
|
||||||
|
|
||||||
// STDERR should contain only the target file
|
|
||||||
assert_eq!(
|
|
||||||
String::from_utf8_lossy(&out.stderr),
|
|
||||||
format!(
|
|
||||||
"{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
|
|
||||||
file = file_url_protocol,
|
|
||||||
cwd = cwd_normalized,
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
// STDOUT should contain original document but with UTF-8 charset
|
|
||||||
assert_eq!(
|
|
||||||
String::from_utf8_lossy(&out.stdout),
|
|
||||||
"<html>\
|
|
||||||
<head>\n \
|
|
||||||
<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">\n \
|
|
||||||
</head>\n \
|
|
||||||
<body>\n \
|
|
||||||
© Some Company\n \
|
|
||||||
\n\n</body>\
|
|
||||||
</html>\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
// Exit code should be 0
|
|
||||||
out.assert().code(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn properly_save_document_with_gb2312() {
|
fn properly_save_document_with_gb2312() {
|
||||||
let cwd = env::current_dir().unwrap();
|
let cwd = env::current_dir().unwrap();
|
||||||
|
@ -149,4 +107,133 @@ mod passing {
|
||||||
// Exit code should be 0
|
// Exit code should be 0
|
||||||
out.assert().code(0);
|
out.assert().code(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn properly_save_document_with_gb2312_custom_charset() {
|
||||||
|
let cwd = env::current_dir().unwrap();
|
||||||
|
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||||
|
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||||
|
let out = cmd
|
||||||
|
.arg("-M")
|
||||||
|
.arg("-C")
|
||||||
|
.arg("utf8")
|
||||||
|
.arg(format!(
|
||||||
|
"src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
|
||||||
|
s = MAIN_SEPARATOR
|
||||||
|
))
|
||||||
|
.output()
|
||||||
|
.unwrap();
|
||||||
|
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||||
|
|
||||||
|
// STDERR should contain only the target file
|
||||||
|
assert_eq!(
|
||||||
|
String::from_utf8_lossy(&out.stderr),
|
||||||
|
format!(
|
||||||
|
"{file}{cwd}/src/tests/data/unusual_encodings/gb2312.html\n",
|
||||||
|
file = file_url_protocol,
|
||||||
|
cwd = cwd_normalized,
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
// STDOUT should contain original document without any modifications
|
||||||
|
assert_eq!(
|
||||||
|
String::from_utf8_lossy(&out.stdout).to_string(),
|
||||||
|
"<html>\
|
||||||
|
<head>\n \
|
||||||
|
<meta http-equiv=\"content-type\" content=\"text/html;charset=utf8\">\n \
|
||||||
|
<title>近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 </title>\n\
|
||||||
|
</head>\n\
|
||||||
|
<body>\n \
|
||||||
|
<h1>近七成人减少线下需求\u{3000}银行数字化转型提速</h1>\n\n\n\
|
||||||
|
</body>\
|
||||||
|
</html>\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Exit code should be 0
|
||||||
|
out.assert().code(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn properly_save_document_with_gb2312_custom_charset_bad() {
|
||||||
|
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||||
|
let out = cmd
|
||||||
|
.arg("-M")
|
||||||
|
.arg("-C")
|
||||||
|
.arg("utf0")
|
||||||
|
.arg(format!(
|
||||||
|
"src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
|
||||||
|
s = MAIN_SEPARATOR
|
||||||
|
))
|
||||||
|
.output()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// STDERR should contain error message
|
||||||
|
assert_eq!(
|
||||||
|
String::from_utf8_lossy(&out.stderr),
|
||||||
|
"Unknown encoding: utf0\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
// STDOUT should be empty
|
||||||
|
assert_eq!(String::from_utf8_lossy(&out.stdout).to_string(), "");
|
||||||
|
|
||||||
|
// Exit code should be 1
|
||||||
|
out.assert().code(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
|
||||||
|
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
|
||||||
|
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
|
||||||
|
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
|
||||||
|
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
|
||||||
|
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod failing {
|
||||||
|
use assert_cmd::prelude::*;
|
||||||
|
use std::env;
|
||||||
|
use std::path::MAIN_SEPARATOR;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn change_iso88591_to_utf8_to_properly_display_html_entities() {
|
||||||
|
let cwd = env::current_dir().unwrap();
|
||||||
|
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
|
||||||
|
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
|
||||||
|
let out = cmd
|
||||||
|
.arg("-M")
|
||||||
|
.arg(format!(
|
||||||
|
"src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html",
|
||||||
|
s = MAIN_SEPARATOR
|
||||||
|
))
|
||||||
|
.output()
|
||||||
|
.unwrap();
|
||||||
|
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||||
|
|
||||||
|
// STDERR should contain only the target file
|
||||||
|
assert_eq!(
|
||||||
|
String::from_utf8_lossy(&out.stderr),
|
||||||
|
format!(
|
||||||
|
"{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
|
||||||
|
file = file_url_protocol,
|
||||||
|
cwd = cwd_normalized,
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
// STDOUT should contain original document but with UTF-8 charset
|
||||||
|
assert_eq!(
|
||||||
|
String::from_utf8_lossy(&out.stdout),
|
||||||
|
"<html>\
|
||||||
|
<head>\n \
|
||||||
|
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">\n \
|
||||||
|
</head>\n \
|
||||||
|
<body>\n \
|
||||||
|
� Some Company\n \
|
||||||
|
\n\n</body>\
|
||||||
|
</html>\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Exit code should be 0
|
||||||
|
out.assert().code(0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,7 @@ mod passing {
|
||||||
assert_eq!(options.no_audio, false);
|
assert_eq!(options.no_audio, false);
|
||||||
assert_eq!(options.base_url, None);
|
assert_eq!(options.base_url, None);
|
||||||
assert_eq!(options.no_css, false);
|
assert_eq!(options.no_css, false);
|
||||||
|
assert_eq!(options.charset, None);
|
||||||
assert_eq!(options.no_frames, false);
|
assert_eq!(options.no_frames, false);
|
||||||
assert_eq!(options.no_fonts, false);
|
assert_eq!(options.no_fonts, false);
|
||||||
assert_eq!(options.no_images, false);
|
assert_eq!(options.no_images, false);
|
||||||
|
|
Loading…
Reference in New Issue