improve validation of charset found in HTML, use genuinely infinite timeout
This commit is contained in:
parent
c938ba6a2f
commit
125aeeec3b
1 changed files with 20 additions and 23 deletions
43
src/main.rs
43
src/main.rs
|
@ -86,7 +86,6 @@ fn main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
let target_url: Url;
|
let target_url: Url;
|
||||||
let mut base_url: Url;
|
|
||||||
let mut use_stdin: bool = false;
|
let mut use_stdin: bool = false;
|
||||||
|
|
||||||
// Determine exact target URL
|
// Determine exact target URL
|
||||||
|
@ -156,20 +155,19 @@ fn main() {
|
||||||
HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"),
|
HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
let timeout: u64 = if options.timeout > 0 {
|
let client = if options.timeout > 0 {
|
||||||
options.timeout
|
Client::builder().timeout(Duration::from_secs(options.timeout))
|
||||||
} else {
|
} else {
|
||||||
std::u64::MAX / 4 // This is pretty close to infinity
|
// No timeout is default
|
||||||
};
|
Client::builder()
|
||||||
let client = Client::builder()
|
}
|
||||||
.timeout(Duration::from_secs(timeout))
|
.danger_accept_invalid_certs(options.insecure)
|
||||||
.danger_accept_invalid_certs(options.insecure)
|
.default_headers(header_map)
|
||||||
.default_headers(header_map)
|
.build()
|
||||||
.build()
|
.expect("Failed to initialize HTTP client");
|
||||||
.expect("Failed to initialize HTTP client");
|
|
||||||
|
|
||||||
// At this stage we assume that the base URL is the same as the target URL
|
// At first we assume that base URL is the same as target URL
|
||||||
base_url = target_url.clone();
|
let mut base_url: Url = target_url.clone();
|
||||||
|
|
||||||
let data: Vec<u8>;
|
let data: Vec<u8>;
|
||||||
let mut document_encoding: String = str!();
|
let mut document_encoding: String = str!();
|
||||||
|
@ -214,16 +212,16 @@ fn main() {
|
||||||
dom = html_to_dom(&data, document_encoding.clone());
|
dom = html_to_dom(&data, document_encoding.clone());
|
||||||
|
|
||||||
// TODO: investigate if charset from filesystem/data URL/HTTP headers
|
// TODO: investigate if charset from filesystem/data URL/HTTP headers
|
||||||
// has power over what's specified in HTML
|
// has say over what's specified in HTML
|
||||||
|
|
||||||
// Attempt to determine document's charset
|
// Attempt to determine document's charset
|
||||||
if let Some(charset) = get_charset(&dom.document) {
|
if let Some(html_charset) = get_charset(&dom.document) {
|
||||||
if !charset.is_empty() {
|
if !html_charset.is_empty() {
|
||||||
// Check if the charset specified inside HTML is valid
|
// Check if the charset specified inside HTML is valid
|
||||||
if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
|
if let Some(encoding) = Encoding::for_label_no_replacement(html_charset.as_bytes()) {
|
||||||
// No point in parsing HTML again with the same encoding as before
|
// No point in parsing HTML again with the same encoding as before
|
||||||
if encoding.name() != "UTF-8" {
|
if encoding.name() != "UTF-8" {
|
||||||
document_encoding = charset;
|
document_encoding = html_charset;
|
||||||
dom = html_to_dom(&data, document_encoding.clone());
|
dom = html_to_dom(&data, document_encoding.clone());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -233,8 +231,8 @@ fn main() {
|
||||||
// Use custom base URL if specified, read and use what's in the DOM otherwise
|
// Use custom base URL if specified, read and use what's in the DOM otherwise
|
||||||
let custom_base_url: String = options.base_url.clone().unwrap_or(str!());
|
let custom_base_url: String = options.base_url.clone().unwrap_or(str!());
|
||||||
if custom_base_url.is_empty() {
|
if custom_base_url.is_empty() {
|
||||||
// No custom base URL is specified,
|
// No custom base URL is specified
|
||||||
// try to see if the document has BASE tag
|
// Try to see if document has BASE element
|
||||||
if let Some(existing_base_url) = get_base_url(&dom.document) {
|
if let Some(existing_base_url) = get_base_url(&dom.document) {
|
||||||
base_url = resolve_url(&target_url, &existing_base_url);
|
base_url = resolve_url(&target_url, &existing_base_url);
|
||||||
}
|
}
|
||||||
|
@ -253,8 +251,7 @@ fn main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
// Failed to parse given base URL,
|
// Failed to parse given base URL, perhaps it's a filesystem path?
|
||||||
// perhaps it's a filesystem path?
|
|
||||||
if target_url.scheme() == "file" {
|
if target_url.scheme() == "file" {
|
||||||
// Relative paths could work for documents saved from filesystem
|
// Relative paths could work for documents saved from filesystem
|
||||||
let path: &Path = Path::new(&custom_base_url);
|
let path: &Path = Path::new(&custom_base_url);
|
||||||
|
@ -322,7 +319,7 @@ fn main() {
|
||||||
// Serialize DOM tree
|
// Serialize DOM tree
|
||||||
let mut result: Vec<u8> = serialize_document(dom, document_encoding, &options);
|
let mut result: Vec<u8> = serialize_document(dom, document_encoding, &options);
|
||||||
|
|
||||||
// Add metadata tag
|
// Prepend metadata comment tag
|
||||||
if !options.no_metadata {
|
if !options.no_metadata {
|
||||||
let mut metadata_comment: String = create_metadata_tag(&target_url);
|
let mut metadata_comment: String = create_metadata_tag(&target_url);
|
||||||
metadata_comment += "\n";
|
metadata_comment += "\n";
|
||||||
|
|
Loading…
Reference in a new issue