avoid excessive parsing of HTML into DOM

This commit is contained in:
Sunshine 2021-06-08 03:57:28 -10:00
parent b29b9a6a7c
commit 7686b2ea64
No known key found for this signature in database
GPG key ID: B80CA68703CD8AB1

View file

@@ -210,15 +210,23 @@ fn main() {
process::exit(1);
}
// Initial parse to read document's charset from META tag
// Initial parse
dom = html_to_dom(&data, document_encoding.clone());
// TODO: investigate if charset from filesystem/data URL/HTTP headers
// has power over what's specified in HTML
// Attempt to determine document's charset
if let Some(charset) = get_charset(&dom.document) {
if !charset.is_empty() {
// TODO && label(charset) != UTF_8
document_encoding = charset;
dom = html_to_dom(&data, document_encoding.clone());
// Check if the charset specified inside HTML is valid
if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
// No point in parsing HTML again with the same encoding as before
if encoding.name() != "UTF-8" {
document_encoding = charset;
dom = html_to_dom(&data, document_encoding.clone());
}
}
}
}