// monolith/src/html.rs

use crate::http::retrieve_asset;
use crate::js::attr_is_event_handler;
use crate::utils::{
data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol,
};
use html5ever::interface::QualName;
2019-08-24 20:48:10 +02:00
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use reqwest::blocking::Client;
use std::collections::HashMap;
2019-08-23 05:17:15 +02:00
use std::default::Default;
/// `rel` attribute values that mark a `<link>` element as pointing to an icon.
///
/// Compared ASCII-case-insensitively against the whole attribute value by `is_icon`.
const ICON_VALUES: &[&str] = &[
    "icon",
    "shortcut icon",
    "mask-icon",
    "apple-touch-icon",
    "fluid-icon",
];
/// PNG data URL substituted for image sources when images are disabled
/// (presumably a single transparent pixel, as the name says).
const TRANSPARENT_PIXEL: &str = concat!(
    "data:image/png;base64,",
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=",
);
2019-08-23 10:33:30 +02:00
2019-09-29 23:15:49 +02:00
/// Returns a strong handle to the parent of `node`.
///
/// The parent pointer can only be inspected by moving it out of its cell, so it
/// is taken, upgraded, and then put back. Without the final `set` the node would
/// be left without a parent pointer, and a second call on it would panic.
///
/// # Panics
///
/// Panics if `node` has no parent, or if the parent has already been dropped.
pub fn get_parent_node(node: &Handle) -> Handle {
    let parent = node.parent.take();
    let upgraded = parent.as_ref().and_then(|weak| weak.upgrade());
    // Restore the pointer before any panic so the tree is left intact
    node.parent.set(parent);
    upgraded.expect("node has no live parent")
}
/// Returns the local tag name of `node`, or an empty string when `node` is not an element.
pub fn get_node_name(node: &Handle) -> &'_ str {
    if let NodeData::Element { ref name, .. } = node.data {
        name.local.as_ref()
    } else {
        ""
    }
}
2019-09-29 23:15:49 +02:00
pub fn is_icon(attr_value: &str) -> bool {
ICON_VALUES
.iter()
.find(|a| attr_value.eq_ignore_ascii_case(a))
.is_some()
2019-09-29 23:15:49 +02:00
}
2019-08-23 20:33:18 +02:00
pub fn walk_and_embed_assets(
cache: &mut HashMap<String, String>,
client: &Client,
2019-08-23 20:33:18 +02:00
url: &str,
node: &Handle,
opt_no_css: bool,
2019-08-23 20:33:18 +02:00
opt_no_js: bool,
opt_no_images: bool,
2019-08-25 17:41:30 +02:00
opt_silent: bool,
opt_no_frames: bool,
2019-08-23 20:33:18 +02:00
) {
2019-08-23 05:17:15 +02:00
match node.data {
NodeData::Document => {
// Dig deeper
for child in node.children.borrow().iter() {
2019-08-23 20:33:18 +02:00
walk_and_embed_assets(
cache,
client,
&url,
child,
opt_no_css,
opt_no_js,
opt_no_images,
opt_silent,
opt_no_frames,
);
2019-08-23 05:17:15 +02:00
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
NodeData::Element {
ref name,
ref attrs,
..
} => {
2019-08-23 20:24:45 +02:00
let attrs_mut = &mut attrs.borrow_mut();
2019-08-23 05:17:15 +02:00
2019-08-24 02:16:16 +02:00
match name.local.as_ref() {
"link" => {
2019-12-26 15:44:01 +01:00
// Remove integrity attributes
let mut i = 0;
while i < attrs_mut.len() {
let attr_name = attrs_mut[i].name.local.as_ref();
if attr_name.eq_ignore_ascii_case("integrity") {
attrs_mut.remove(i);
} else {
i += 1;
}
}
enum LinkType {
Icon,
Stylesheet,
Preload,
DnsPrefetch,
Unknown,
}
let mut link_type = LinkType::Unknown;
2019-08-24 02:16:16 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(attr.value.as_ref()) {
link_type = LinkType::Icon;
2019-08-24 02:16:16 +02:00
break;
} else if attr.value.as_ref() == "stylesheet" {
link_type = LinkType::Stylesheet;
2019-08-24 02:16:16 +02:00
break;
}
2019-08-23 05:17:15 +02:00
}
}
let link_type = link_type;
2019-08-23 05:17:15 +02:00
match link_type {
LinkType::Icon => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_images {
attr.value.clear();
} else {
let href_full_url = resolve_url(&url, attr.value.as_ref())
.unwrap_or_default();
let (favicon_dataurl, _) = retrieve_asset(
cache,
client,
&href_full_url,
true,
"",
opt_silent,
)
.unwrap_or_default();
attr.value.clear();
attr.value.push_slice(favicon_dataurl.as_str());
}
2019-08-25 05:06:40 +02:00
}
}
}
LinkType::Stylesheet => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_css {
attr.value.clear();
} else {
let href_full_url = resolve_url(&url, &attr.value.as_ref())
.unwrap_or_default();
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
false,
"text/css",
opt_silent,
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_silent,
),
// If a network error occured, warn
Err(e) => {
eprintln!("Warning: {}", e);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(&replacement_text);
}
}
2019-08-25 05:06:40 +02:00
}
}
LinkType::Unknown => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url =
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
}
2019-08-24 20:22:34 +02:00
}
2019-08-23 05:17:15 +02:00
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"img" => {
// Find source tags
let mut found_src: Option<Attribute> = None;
let mut found_datasrc: Option<Attribute> = None;
let mut i = 0;
while i < attrs_mut.len() {
let name = attrs_mut[i].name.local.as_ref();
if name.eq_ignore_ascii_case("src") {
found_src = Some(attrs_mut.remove(i));
} else if name.eq_ignore_ascii_case("data-src") {
found_datasrc = Some(attrs_mut.remove(i));
} else {
i += 1;
2019-08-23 05:17:15 +02:00
}
}
// If images are disabled, clear both sources
if opt_no_images {
attrs_mut.push(Attribute {
name: QualName::new(None, ns!(), local_name!("src")),
value: Tendril::from_slice(TRANSPARENT_PIXEL),
});
} else if let Some((dataurl, _)) = (&found_datasrc)
.into_iter()
.chain(&found_src) // Give dataurl priority
.map(|attr| &attr.value)
.filter(|src| !src.is_empty()) // Ignore empty srcs
.next()
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
.and_then(|abs_src| // Download and convert to dataurl
retrieve_asset(
cache,
client,
&abs_src,
true,
"",
opt_silent,
).ok())
{
// Add the new dataurl src attribute
attrs_mut.push(Attribute {
name: QualName::new(None, ns!(), local_name!("src")),
value: Tendril::from_slice(dataurl.as_ref()),
});
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"source" => {
2019-08-24 17:21:29 +02:00
for attr in attrs_mut.iter_mut() {
2019-09-22 18:57:50 +02:00
let attr_name: &str = &attr.name.local;
if attr_name == "src" {
2020-01-04 08:33:11 +01:00
let src_full_url = resolve_url(&url, attr.value.as_ref())
.unwrap_or_else(|_| attr.value.to_string());
2019-09-22 18:57:50 +02:00
attr.value.clear();
attr.value.push_slice(src_full_url.as_str());
} else if attr_name == "srcset" {
if get_node_name(&get_parent_node(&node)) == "picture" {
2019-08-24 17:21:29 +02:00
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
2020-01-04 08:33:11 +01:00
let srcset_full_url =
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
2019-10-01 05:58:09 +02:00
let (source_dataurl, _) = retrieve_asset(
cache,
client,
&srcset_full_url,
true,
"",
opt_silent,
)
2019-12-11 07:12:57 +01:00
.unwrap_or((str!(), str!()));
2019-08-24 17:21:29 +02:00
attr.value.clear();
2019-10-01 05:58:09 +02:00
attr.value.push_slice(source_dataurl.as_str());
2019-08-24 17:21:29 +02:00
}
}
}
}
}
2019-08-25 05:06:40 +02:00
"a" => {
2019-08-23 05:17:15 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
// Don't touch email links or hrefs which begin with a hash sign
2019-09-29 23:15:49 +02:00
if attr.value.starts_with('#') || url_has_protocol(&attr.value) {
2019-08-23 05:17:15 +02:00
continue;
}
2020-01-04 08:33:11 +01:00
let href_full_url =
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
2019-08-23 05:17:15 +02:00
attr.value.clear();
2019-08-25 05:06:40 +02:00
attr.value.push_slice(href_full_url.as_str());
2019-08-23 05:17:15 +02:00
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"script" => {
2019-12-26 15:44:01 +01:00
// Remove integrity attributes
let mut i = 0;
while i < attrs_mut.len() {
let attr_name = attrs_mut[i].name.local.as_ref();
if attr_name.eq_ignore_ascii_case("integrity") {
attrs_mut.remove(i);
} else {
i += 1;
}
}
2019-08-23 05:17:15 +02:00
if opt_no_js {
// Empty src and inner content of SCRIPT tags
2019-08-23 05:17:15 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
attr.value.clear();
}
}
node.children.borrow_mut().clear();
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let src_full_url =
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
2019-10-01 05:58:09 +02:00
let (js_dataurl, _) = retrieve_asset(
cache,
client,
&src_full_url,
true,
"application/javascript",
opt_silent,
)
2019-12-11 07:12:57 +01:00
.unwrap_or((str!(), str!()));
2019-08-23 05:17:15 +02:00
attr.value.clear();
2019-10-01 05:58:09 +02:00
attr.value.push_slice(js_dataurl.as_str());
2019-08-23 05:17:15 +02:00
}
}
}
2019-08-23 20:24:45 +02:00
}
"style" => {
if opt_no_css {
2019-09-29 23:15:49 +02:00
// Empty inner content of STYLE tags
node.children.borrow_mut().clear();
2019-12-06 02:05:52 +01:00
} else {
for node in node.children.borrow_mut().iter_mut() {
2019-12-06 02:41:43 +01:00
if let NodeData::Text { ref contents } = node.data {
2019-12-06 02:20:09 +01:00
let mut tendril = contents.borrow_mut();
let replacement = resolve_css_imports(
cache,
client,
2019-12-06 16:52:20 +01:00
tendril.as_ref(),
2019-12-06 02:20:09 +01:00
false,
&url,
opt_no_images,
2019-12-06 02:20:09 +01:00
opt_silent,
);
tendril.clear();
2019-12-09 18:41:21 +01:00
tendril.push_slice(&replacement);
2019-12-06 02:05:52 +01:00
}
}
}
}
2019-08-25 05:06:40 +02:00
"form" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" {
2019-09-29 23:15:49 +02:00
// Modify action to be a full URL
if !is_valid_url(&attr.value) {
let href_full_url =
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
2019-09-29 23:15:49 +02:00
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
}
}
2019-08-23 20:24:45 +02:00
}
2019-08-25 05:06:40 +02:00
"iframe" => {
2019-09-29 23:15:49 +02:00
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
if opt_no_frames {
// Empty the src attribute
attr.value.clear();
2019-09-29 23:15:49 +02:00
continue;
2019-08-27 04:57:10 +02:00
}
let iframe_src = attr.value.as_ref();
2019-09-29 23:15:49 +02:00
// Ignore iframes with empty source (they cause infinite loops)
if iframe_src.is_empty() {
2019-09-29 23:15:49 +02:00
continue;
}
2019-09-29 23:15:49 +02:00
let src_full_url = resolve_url(&url, iframe_src).unwrap_or_default();
2019-10-01 05:58:09 +02:00
let (iframe_data, iframe_final_url) = retrieve_asset(
cache,
client,
2019-09-29 23:15:49 +02:00
&src_full_url,
false,
"text/html",
opt_silent,
)
2019-12-11 07:12:57 +01:00
.unwrap_or((str!(), src_full_url));
2019-09-29 23:15:49 +02:00
let dom = html_to_dom(&iframe_data);
walk_and_embed_assets(
cache,
client,
2019-10-01 05:58:09 +02:00
&iframe_final_url,
2019-09-29 23:15:49 +02:00
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
opt_silent,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
2019-10-01 05:58:09 +02:00
let iframe_dataurl = data_to_dataurl("text/html", &buf);
2019-09-29 23:15:49 +02:00
attr.value.clear();
2019-10-01 05:58:09 +02:00
attr.value.push_slice(iframe_dataurl.as_str());
2019-08-24 02:16:16 +02:00
}
}
}
2019-09-22 18:57:50 +02:00
"video" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "poster" {
let video_poster = attr.value.as_ref();
2019-09-22 18:57:50 +02:00
2019-09-29 23:15:49 +02:00
// Skip posters with empty source
if video_poster.is_empty() {
2019-09-22 18:57:50 +02:00
continue;
}
if opt_no_images {
attr.value.clear();
} else {
2020-01-03 23:58:29 +01:00
let poster_full_url =
resolve_url(&url, video_poster).unwrap_or_default();
2019-10-01 05:58:09 +02:00
let (poster_dataurl, _) = retrieve_asset(
cache,
client,
2019-09-22 18:57:50 +02:00
&poster_full_url,
true,
"",
opt_silent,
)
2019-12-11 07:12:57 +01:00
.unwrap_or((poster_full_url, str!()));
2019-09-22 18:57:50 +02:00
attr.value.clear();
2019-10-01 05:58:09 +02:00
attr.value.push_slice(poster_dataurl.as_str());
2019-09-22 18:57:50 +02:00
}
}
}
}
2019-08-25 05:06:40 +02:00
_ => {}
2019-08-23 05:17:15 +02:00
}
2019-12-06 21:28:08 +01:00
// Process style attributes
if opt_no_css {
// Get rid of style attributes
2019-09-29 23:15:49 +02:00
let mut style_attr_indexes = Vec::new();
for (i, attr) in attrs_mut.iter_mut().enumerate() {
if attr.name.local.to_lowercase() == "style" {
2019-09-29 23:15:49 +02:00
style_attr_indexes.push(i);
}
}
2019-09-29 23:15:49 +02:00
style_attr_indexes.reverse();
for attr_index in style_attr_indexes {
attrs_mut.remove(attr_index);
}
2019-12-06 21:28:08 +01:00
} else {
// Otherwise, parse any links found in the attributes
for attribute in attrs_mut
.iter_mut()
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
{
let replacement = resolve_css_imports(
cache,
client,
2019-12-06 21:28:08 +01:00
attribute.value.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
attribute.value.clear();
2019-12-09 18:41:21 +01:00
attribute.value.push_slice(&replacement);
2019-12-06 21:28:08 +01:00
}
}
2019-08-23 05:17:15 +02:00
if opt_no_js {
// Get rid of JS event attributes
2019-09-29 23:15:49 +02:00
let mut js_attr_indexes = Vec::new();
for (i, attr) in attrs_mut.iter_mut().enumerate() {
if attr_is_event_handler(&attr.name.local) {
js_attr_indexes.push(i);
2019-08-23 05:17:15 +02:00
}
}
2019-09-29 23:15:49 +02:00
js_attr_indexes.reverse();
for attr_index in js_attr_indexes {
attrs_mut.remove(attr_index);
}
2019-08-23 20:24:45 +02:00
}
2019-08-23 05:17:15 +02:00
// Dig deeper
for child in node.children.borrow().iter() {
2019-08-23 20:33:18 +02:00
walk_and_embed_assets(
cache,
client,
&url,
child,
opt_no_css,
opt_no_js,
opt_no_images,
opt_silent,
opt_no_frames,
);
2019-08-23 05:17:15 +02:00
}
2019-08-23 20:24:45 +02:00
}
_ => {
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9]
}
2019-08-23 05:17:15 +02:00
}
}
/// Parses `data` as a UTF-8 HTML document and returns the resulting DOM.
///
/// # Panics
///
/// Panics if reading from the in-memory byte slice reports an error.
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
    let sink = RcDom::default();
    let parser = parse_document(sink, Default::default()).from_utf8();
    let mut input = data.as_bytes();
    parser.read_from(&mut input).unwrap()
}
/// Returns the first direct child element of `handle` whose local name equals `node_name`.
///
/// When no child matches, a clone of `handle` itself is returned instead.
fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
    for child in handle.children.borrow().iter() {
        if let NodeData::Element { ref name, .. } = child.data {
            if &*name.local == node_name {
                return child.clone();
            }
        }
    }

    // No matching element: fall back to the node that was searched
    handle.clone()
}
pub fn stringify_document(
handle: &Handle,
opt_no_css: bool,
opt_no_frames: bool,
opt_no_js: bool,
opt_no_images: bool,
opt_isolate: bool,
) -> String {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, handle, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
2020-01-04 08:33:11 +01:00
let mut result = String::from_utf8(buf).unwrap();
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
let mut buf: Vec<u8> = Vec::new();
let mut dom = html_to_dom(&result);
let doc = dom.get_document();
let html = get_child_node_by_name(&doc, "html");
let head = get_child_node_by_name(&html, "head");
2019-12-11 07:12:57 +01:00
let mut content_attr = str!();
2019-09-29 23:15:49 +02:00
if opt_isolate {
content_attr += " default-src 'unsafe-inline' data:;";
}
if opt_no_css {
content_attr += " style-src 'none';";
}
if opt_no_frames {
content_attr += " frame-src 'none';child-src 'none';";
}
2019-09-29 23:15:49 +02:00
if opt_no_js {
content_attr += " script-src 'none';";
}
if opt_no_images {
content_attr += " img-src data:;";
}
let meta = dom.create_element(
QualName::new(None, ns!(), local_name!("meta")),
vec![
Attribute {
name: QualName::new(None, ns!(), local_name!("http-equiv")),
value: format_tendril!("Content-Security-Policy"),
},
Attribute {
name: QualName::new(None, ns!(), local_name!("content")),
value: format_tendril!("{}", content_attr.trim()),
2019-09-29 23:15:49 +02:00
},
],
Default::default(),
);
head.children.borrow_mut().reverse();
head.children.borrow_mut().push(meta.clone());
head.children.borrow_mut().reverse();
// Note: the CSP meta-tag has to be prepended, never appended,
// since there already may be one defined in the document,
// and browsers don't allow re-defining them (for obvious reasons)
serialize(&mut buf, &doc, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
result = String::from_utf8(buf).unwrap();
// Note: we can't make it isolate the page right away since it may have no HEAD element,
// ergo we have to serialize, parse DOM again, and then finally serialize the result
}
result
2019-08-23 05:17:15 +02:00
}