2020-03-29 09:54:20 +02:00
|
|
|
use crate::css::embed_css;
|
2020-01-02 16:31:55 +01:00
|
|
|
use crate::js::attr_is_event_handler;
|
2020-03-29 09:54:20 +02:00
|
|
|
use crate::utils::{data_to_data_url, is_http_url, resolve_url, retrieve_asset, url_has_protocol};
|
2019-09-22 02:06:00 +02:00
|
|
|
use html5ever::interface::QualName;
|
2019-08-24 20:48:10 +02:00
|
|
|
use html5ever::parse_document;
|
|
|
|
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
|
|
|
use html5ever::serialize::{serialize, SerializeOpts};
|
2019-12-10 01:40:29 +01:00
|
|
|
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
|
2019-09-22 02:06:00 +02:00
|
|
|
use html5ever::tree_builder::{Attribute, TreeSink};
|
|
|
|
use html5ever::{local_name, namespace_url, ns};
|
2020-01-07 05:22:28 +01:00
|
|
|
use reqwest::blocking::Client;
|
2019-10-23 00:33:22 +02:00
|
|
|
use std::collections::HashMap;
|
2019-08-23 05:17:15 +02:00
|
|
|
use std::default::Default;
|
|
|
|
|
2020-01-03 23:49:39 +01:00
|
|
|
/// Lower-case `rel` attribute values of a `<link>` element that `is_icon` accepts
/// as denoting an icon resource.
const ICON_VALUES: &[&str] = &[
    "icon",
    "shortcut icon",
    "mask-icon",
    "apple-touch-icon",
    "fluid-icon",
];
|
|
|
|
|
|
|
|
/// Returns a strong handle to the parent of `node`.
///
/// The parent link is stored as a weak pointer inside a `Cell`, so it has to be
/// moved out to be inspected. It is put back afterwards, which means the node
/// stays attached to its parent and this function can be called repeatedly.
///
/// # Panics
///
/// Panics if `node` has no parent, or if the parent has already been dropped.
pub fn get_parent_node(node: &Handle) -> Handle {
    // Temporarily move the weak pointer out of the cell (this leaves `None` behind)
    let weak_parent = node.parent.take();
    let parent = weak_parent.as_ref().and_then(|weak| weak.upgrade());
    // Restore the link; without this the node would silently lose its parent
    node.parent.set(weak_parent);

    parent.expect("node has no (live) parent")
}
|
|
|
|
|
2020-04-03 06:00:08 +02:00
|
|
|
pub fn get_node_name(node: &Handle) -> Option<&'_ str> {
|
2019-09-22 02:06:00 +02:00
|
|
|
match &node.data {
|
2020-04-03 06:00:08 +02:00
|
|
|
NodeData::Element { ref name, .. } => Some(name.local.as_ref()),
|
|
|
|
_ => None,
|
2019-08-24 17:21:29 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-09-29 23:15:49 +02:00
|
|
|
pub fn is_icon(attr_value: &str) -> bool {
|
2020-04-03 06:00:08 +02:00
|
|
|
ICON_VALUES.contains(&attr_value.to_lowercase().as_str())
|
2019-09-29 23:15:49 +02:00
|
|
|
}
|
|
|
|
|
2019-08-23 20:33:18 +02:00
|
|
|
pub fn walk_and_embed_assets(
|
2020-04-11 02:43:29 +02:00
|
|
|
cache: &mut HashMap<String, Vec<u8>>,
|
2019-12-10 03:13:25 +01:00
|
|
|
client: &Client,
|
2019-08-23 20:33:18 +02:00
|
|
|
url: &str,
|
|
|
|
node: &Handle,
|
2019-09-22 02:06:00 +02:00
|
|
|
opt_no_css: bool,
|
2020-04-22 09:37:02 +02:00
|
|
|
opt_no_fonts: bool,
|
|
|
|
opt_no_frames: bool,
|
2019-08-23 20:33:18 +02:00
|
|
|
opt_no_js: bool,
|
|
|
|
opt_no_images: bool,
|
2019-08-25 17:41:30 +02:00
|
|
|
opt_silent: bool,
|
2019-08-23 20:33:18 +02:00
|
|
|
) {
|
2019-08-23 05:17:15 +02:00
|
|
|
match node.data {
|
|
|
|
NodeData::Document => {
|
|
|
|
// Dig deeper
|
|
|
|
for child in node.children.borrow().iter() {
|
2019-08-23 20:33:18 +02:00
|
|
|
walk_and_embed_assets(
|
2019-10-23 00:33:22 +02:00
|
|
|
cache,
|
2019-12-10 03:13:25 +01:00
|
|
|
client,
|
2019-09-22 02:06:00 +02:00
|
|
|
&url,
|
|
|
|
child,
|
|
|
|
opt_no_css,
|
2020-04-22 09:37:02 +02:00
|
|
|
opt_no_fonts,
|
|
|
|
opt_no_frames,
|
2019-09-22 02:06:00 +02:00
|
|
|
opt_no_js,
|
|
|
|
opt_no_images,
|
|
|
|
opt_silent,
|
|
|
|
);
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
NodeData::Element {
|
|
|
|
ref name,
|
|
|
|
ref attrs,
|
|
|
|
..
|
|
|
|
} => {
|
2019-08-23 20:24:45 +02:00
|
|
|
let attrs_mut = &mut attrs.borrow_mut();
|
2019-08-23 05:17:15 +02:00
|
|
|
|
2019-08-24 02:16:16 +02:00
|
|
|
match name.local.as_ref() {
|
|
|
|
"link" => {
|
2019-12-26 15:44:01 +01:00
|
|
|
// Remove integrity attributes
|
|
|
|
let mut i = 0;
|
|
|
|
while i < attrs_mut.len() {
|
|
|
|
let attr_name = attrs_mut[i].name.local.as_ref();
|
|
|
|
if attr_name.eq_ignore_ascii_case("integrity") {
|
|
|
|
attrs_mut.remove(i);
|
|
|
|
} else {
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-09 07:34:30 +01:00
|
|
|
enum LinkType {
|
|
|
|
Icon,
|
|
|
|
Stylesheet,
|
|
|
|
Preload,
|
|
|
|
DnsPrefetch,
|
|
|
|
Unknown,
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut link_type = LinkType::Unknown;
|
2019-08-24 02:16:16 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "rel" {
|
2020-01-10 06:27:15 +01:00
|
|
|
let value = attr.value.trim();
|
2020-01-09 10:18:21 +01:00
|
|
|
if is_icon(value) {
|
2020-01-09 07:34:30 +01:00
|
|
|
link_type = LinkType::Icon;
|
2019-08-24 02:16:16 +02:00
|
|
|
break;
|
2020-01-10 05:52:31 +01:00
|
|
|
} else if value.eq_ignore_ascii_case("stylesheet") {
|
2020-01-09 07:34:30 +01:00
|
|
|
link_type = LinkType::Stylesheet;
|
2019-08-24 02:16:16 +02:00
|
|
|
break;
|
2020-01-10 05:52:31 +01:00
|
|
|
} else if value.eq_ignore_ascii_case("preload") {
|
2020-01-09 10:18:21 +01:00
|
|
|
link_type = LinkType::Preload;
|
|
|
|
break;
|
2020-01-10 05:52:31 +01:00
|
|
|
} else if value.eq_ignore_ascii_case("dns-prefetch") {
|
2020-01-09 10:18:21 +01:00
|
|
|
link_type = LinkType::DnsPrefetch;
|
|
|
|
break;
|
2019-08-24 02:16:16 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2020-01-09 07:34:30 +01:00
|
|
|
let link_type = link_type;
|
2019-08-23 05:17:15 +02:00
|
|
|
|
2020-01-09 07:34:30 +01:00
|
|
|
match link_type {
|
|
|
|
LinkType::Icon => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
if opt_no_images {
|
|
|
|
attr.value.clear();
|
|
|
|
} else {
|
|
|
|
let href_full_url = resolve_url(&url, attr.value.as_ref())
|
|
|
|
.unwrap_or_default();
|
2020-02-12 07:51:26 +01:00
|
|
|
let (favicon_data_url, _) = retrieve_asset(
|
2020-01-09 07:34:30 +01:00
|
|
|
cache,
|
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2020-01-09 07:34:30 +01:00
|
|
|
&href_full_url,
|
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_silent,
|
|
|
|
)
|
|
|
|
.unwrap_or_default();
|
|
|
|
attr.value.clear();
|
2020-02-12 07:51:26 +01:00
|
|
|
attr.value.push_slice(favicon_data_url.as_str());
|
2020-01-09 07:34:30 +01:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-01-09 07:34:30 +01:00
|
|
|
LinkType::Stylesheet => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
if opt_no_css {
|
|
|
|
attr.value.clear();
|
|
|
|
} else {
|
|
|
|
let href_full_url = resolve_url(&url, &attr.value.as_ref())
|
|
|
|
.unwrap_or_default();
|
|
|
|
let replacement_text = match retrieve_asset(
|
2019-12-06 01:10:47 +01:00
|
|
|
cache,
|
2019-12-13 01:29:21 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2019-12-06 01:10:47 +01:00
|
|
|
&href_full_url,
|
2020-01-09 07:34:30 +01:00
|
|
|
false,
|
|
|
|
"text/css",
|
2019-12-06 01:10:47 +01:00
|
|
|
opt_silent,
|
2020-01-09 07:34:30 +01:00
|
|
|
) {
|
|
|
|
// On successful retrieval, traverse CSS
|
2020-03-29 09:54:20 +02:00
|
|
|
Ok((css_data, final_url)) => {
|
2020-04-22 09:37:02 +02:00
|
|
|
let css: String = embed_css(
|
2020-03-29 09:54:20 +02:00
|
|
|
cache,
|
|
|
|
client,
|
|
|
|
&final_url,
|
|
|
|
&css_data,
|
2020-04-22 09:37:02 +02:00
|
|
|
opt_no_fonts,
|
2020-03-29 09:54:20 +02:00
|
|
|
opt_no_images,
|
|
|
|
opt_silent,
|
|
|
|
);
|
|
|
|
data_to_data_url(
|
|
|
|
"text/css",
|
2020-04-22 09:37:02 +02:00
|
|
|
css.as_bytes(),
|
2020-03-29 09:54:20 +02:00
|
|
|
&final_url,
|
|
|
|
"",
|
|
|
|
)
|
|
|
|
}
|
2019-12-06 01:10:47 +01:00
|
|
|
|
2020-01-09 07:34:30 +01:00
|
|
|
// If a network error occured, warn
|
|
|
|
Err(e) => {
|
|
|
|
eprintln!("Warning: {}", e);
|
2019-10-12 11:32:59 +02:00
|
|
|
|
2020-01-09 07:34:30 +01:00
|
|
|
// If failed to resolve, replace with absolute URL
|
|
|
|
href_full_url
|
|
|
|
}
|
|
|
|
};
|
2019-12-06 01:10:47 +01:00
|
|
|
|
2020-01-09 07:34:30 +01:00
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(&replacement_text);
|
|
|
|
}
|
2019-09-22 02:06:00 +02:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
}
|
|
|
|
}
|
2020-01-09 10:18:21 +01:00
|
|
|
LinkType::Preload | LinkType::DnsPrefetch => {
|
2020-01-13 15:47:07 +01:00
|
|
|
// Since all resources are embedded as data URL, preloading and prefetching are unnecessary
|
2020-01-09 10:18:21 +01:00
|
|
|
if let Some(attr) =
|
|
|
|
attrs_mut.iter_mut().find(|a| &a.name.local == "href")
|
|
|
|
{
|
|
|
|
attr.value.clear();
|
|
|
|
}
|
|
|
|
}
|
2020-01-09 07:34:30 +01:00
|
|
|
LinkType::Unknown => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
|
|
|
let href_full_url =
|
|
|
|
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
|
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(&href_full_url.as_str());
|
|
|
|
}
|
2019-08-24 20:22:34 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2020-03-05 10:56:09 +01:00
|
|
|
"body" => {
|
|
|
|
// Find and remove background attribute(s), keep reference to the last one
|
|
|
|
let mut found_background: Option<Attribute> = None;
|
|
|
|
let mut i = 0;
|
|
|
|
while i < attrs_mut.len() {
|
|
|
|
let attr_name = attrs_mut[i].name.local.as_ref();
|
|
|
|
if attr_name.eq_ignore_ascii_case("background") {
|
|
|
|
found_background = Some(attrs_mut.remove(i));
|
|
|
|
} else {
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if !opt_no_images {
|
|
|
|
if let Some((data_url, _)) = found_background
|
|
|
|
.iter()
|
|
|
|
.map(|attr| attr.value.trim())
|
|
|
|
.filter(|background| !background.is_empty()) // Skip if empty
|
|
|
|
.next()
|
|
|
|
.and_then(|background| resolve_url(&url, background).ok()) // Make absolute
|
|
|
|
.and_then(|abs_src| // Download and convert to data_url
|
|
|
|
retrieve_asset(
|
|
|
|
cache,
|
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2020-03-05 10:56:09 +01:00
|
|
|
&abs_src,
|
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_silent,
|
|
|
|
).ok())
|
|
|
|
{
|
|
|
|
// Add new data_url background attribute
|
|
|
|
attrs_mut.push(Attribute {
|
|
|
|
name: QualName::new(None, ns!(), local_name!("background")),
|
|
|
|
value: Tendril::from_slice(data_url.as_ref()),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"img" => {
|
2020-03-05 10:56:09 +01:00
|
|
|
// Find source attribute(s)
|
2019-12-10 01:40:29 +01:00
|
|
|
let mut found_src: Option<Attribute> = None;
|
|
|
|
let mut found_datasrc: Option<Attribute> = None;
|
|
|
|
let mut i = 0;
|
|
|
|
while i < attrs_mut.len() {
|
2020-03-05 10:56:09 +01:00
|
|
|
let attr_name = attrs_mut[i].name.local.as_ref();
|
|
|
|
if attr_name.eq_ignore_ascii_case("src") {
|
2019-12-10 01:40:29 +01:00
|
|
|
found_src = Some(attrs_mut.remove(i));
|
2020-03-05 10:56:09 +01:00
|
|
|
} else if attr_name.eq_ignore_ascii_case("data-src") {
|
2019-12-10 01:40:29 +01:00
|
|
|
found_datasrc = Some(attrs_mut.remove(i));
|
|
|
|
} else {
|
|
|
|
i += 1;
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2019-12-10 01:40:29 +01:00
|
|
|
|
|
|
|
// If images are disabled, clear both sources
|
|
|
|
if opt_no_images {
|
|
|
|
attrs_mut.push(Attribute {
|
|
|
|
name: QualName::new(None, ns!(), local_name!("src")),
|
2020-04-03 09:30:52 +02:00
|
|
|
value: Tendril::from_slice(empty_image!()),
|
2019-12-10 01:40:29 +01:00
|
|
|
});
|
2020-02-12 07:51:26 +01:00
|
|
|
} else if let Some((data_url, _)) = found_datasrc
|
2020-01-10 06:45:02 +01:00
|
|
|
.iter()
|
2020-02-12 07:51:26 +01:00
|
|
|
.chain(&found_src) // Give data_url priority
|
2020-01-10 06:27:15 +01:00
|
|
|
.map(|attr| attr.value.trim())
|
2020-03-05 10:56:09 +01:00
|
|
|
.filter(|src| !src.is_empty()) // Skip if empty
|
2019-12-10 01:40:29 +01:00
|
|
|
.next()
|
2019-12-24 16:07:56 +01:00
|
|
|
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
|
2020-02-12 07:51:26 +01:00
|
|
|
.and_then(|abs_src| // Download and convert to data_url
|
2019-12-10 01:40:29 +01:00
|
|
|
retrieve_asset(
|
|
|
|
cache,
|
2019-12-24 16:07:56 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2019-12-10 01:40:29 +01:00
|
|
|
&abs_src,
|
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_silent,
|
|
|
|
).ok())
|
|
|
|
{
|
2020-03-05 10:56:09 +01:00
|
|
|
// Add new data_url src attribute
|
2019-12-10 01:40:29 +01:00
|
|
|
attrs_mut.push(Attribute {
|
|
|
|
name: QualName::new(None, ns!(), local_name!("src")),
|
2020-02-12 07:51:26 +01:00
|
|
|
value: Tendril::from_slice(data_url.as_ref()),
|
2019-12-10 01:40:29 +01:00
|
|
|
});
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2020-04-03 09:30:52 +02:00
|
|
|
"input" => {
|
|
|
|
let mut is_image: bool = false;
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
let attr_name: &str = &attr.name.local;
|
|
|
|
if attr_name == "type" {
|
|
|
|
is_image = attr.value.to_string().eq_ignore_ascii_case("image");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if is_image {
|
|
|
|
let mut found_src: Option<Attribute> = None;
|
|
|
|
let mut i = 0;
|
|
|
|
while i < attrs_mut.len() {
|
|
|
|
let attr_name = attrs_mut[i].name.local.as_ref();
|
|
|
|
if attr_name.eq_ignore_ascii_case("src") {
|
|
|
|
found_src = Some(attrs_mut.remove(i));
|
|
|
|
} else {
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If images are disabled, clear both sources
|
|
|
|
if opt_no_images {
|
|
|
|
attrs_mut.push(Attribute {
|
|
|
|
name: QualName::new(None, ns!(), local_name!("src")),
|
|
|
|
value: Tendril::from_slice(empty_image!()),
|
|
|
|
});
|
|
|
|
} else if let Some((data_url, _)) = found_src
|
|
|
|
.iter()
|
|
|
|
.map(|attr| attr.value.trim())
|
|
|
|
.filter(|src| !src.is_empty()) // Skip if empty
|
|
|
|
.next()
|
|
|
|
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
|
|
|
|
.and_then(|abs_src| // Download and convert to data_url
|
|
|
|
retrieve_asset(
|
|
|
|
cache,
|
|
|
|
client,
|
|
|
|
&url,
|
|
|
|
&abs_src,
|
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_silent,
|
|
|
|
).ok())
|
|
|
|
{
|
|
|
|
// Add new data_url src attribute
|
|
|
|
attrs_mut.push(Attribute {
|
|
|
|
name: QualName::new(None, ns!(), local_name!("src")),
|
|
|
|
value: Tendril::from_slice(data_url.as_ref()),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-04-04 03:56:46 +02:00
|
|
|
"svg" => {
|
|
|
|
if opt_no_images {
|
|
|
|
node.children.borrow_mut().clear();
|
|
|
|
}
|
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"source" => {
|
2019-08-24 17:21:29 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
2019-09-22 18:57:50 +02:00
|
|
|
let attr_name: &str = &attr.name.local;
|
|
|
|
|
|
|
|
if attr_name == "src" {
|
2020-01-10 06:27:15 +01:00
|
|
|
let src_full_url = resolve_url(&url, attr.value.trim())
|
2020-01-04 00:05:02 +01:00
|
|
|
.unwrap_or_else(|_| attr.value.to_string());
|
2019-09-22 18:57:50 +02:00
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(src_full_url.as_str());
|
|
|
|
} else if attr_name == "srcset" {
|
2020-04-03 06:00:08 +02:00
|
|
|
if get_node_name(&get_parent_node(&node)) == Some("picture") {
|
2019-08-24 17:21:29 +02:00
|
|
|
if opt_no_images {
|
|
|
|
attr.value.clear();
|
2020-04-03 09:30:52 +02:00
|
|
|
attr.value.push_slice(empty_image!());
|
2019-08-24 17:21:29 +02:00
|
|
|
} else {
|
2020-01-04 08:33:11 +01:00
|
|
|
let srcset_full_url =
|
2020-01-10 06:27:15 +01:00
|
|
|
resolve_url(&url, attr.value.trim()).unwrap_or_default();
|
2020-02-12 07:51:26 +01:00
|
|
|
let (source_data_url, _) = retrieve_asset(
|
2019-10-23 00:33:22 +02:00
|
|
|
cache,
|
2019-12-10 03:13:25 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2019-09-22 02:06:00 +02:00
|
|
|
&srcset_full_url,
|
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_silent,
|
|
|
|
)
|
2019-12-11 07:12:57 +01:00
|
|
|
.unwrap_or((str!(), str!()));
|
2019-08-24 17:21:29 +02:00
|
|
|
attr.value.clear();
|
2020-02-12 07:51:26 +01:00
|
|
|
attr.value.push_slice(source_data_url.as_str());
|
2019-08-24 17:21:29 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-04-04 20:55:45 +02:00
|
|
|
"a" | "area" => {
|
2019-08-23 05:17:15 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "href" {
|
2020-01-10 06:27:15 +01:00
|
|
|
let attr_value = attr.value.trim();
|
2020-02-03 07:42:46 +01:00
|
|
|
|
|
|
|
if opt_no_js && attr_value.starts_with("javascript:") {
|
|
|
|
attr.value.clear();
|
|
|
|
// Replace with empty JS call to preserve original behavior
|
|
|
|
attr.value.push_slice("javascript:;");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2019-08-23 22:00:05 +02:00
|
|
|
// Don't touch email links or hrefs which begin with a hash sign
|
2020-01-10 06:27:15 +01:00
|
|
|
if attr_value.starts_with('#') || url_has_protocol(attr_value) {
|
2019-08-23 05:17:15 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-01-10 06:27:15 +01:00
|
|
|
let href_full_url = resolve_url(&url, attr_value).unwrap_or_default();
|
2019-08-23 05:17:15 +02:00
|
|
|
attr.value.clear();
|
2019-08-25 05:06:40 +02:00
|
|
|
attr.value.push_slice(href_full_url.as_str());
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"script" => {
|
2019-12-26 15:44:01 +01:00
|
|
|
// Remove integrity attributes
|
|
|
|
let mut i = 0;
|
|
|
|
while i < attrs_mut.len() {
|
|
|
|
let attr_name = attrs_mut[i].name.local.as_ref();
|
|
|
|
if attr_name.eq_ignore_ascii_case("integrity") {
|
|
|
|
attrs_mut.remove(i);
|
|
|
|
} else {
|
|
|
|
i += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-23 05:17:15 +02:00
|
|
|
if opt_no_js {
|
2019-09-22 02:06:00 +02:00
|
|
|
// Empty src and inner content of SCRIPT tags
|
2019-08-23 05:17:15 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
|
|
|
attr.value.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
node.children.borrow_mut().clear();
|
|
|
|
} else {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
2020-01-04 00:05:02 +01:00
|
|
|
let src_full_url =
|
2020-01-10 06:27:15 +01:00
|
|
|
resolve_url(&url, attr.value.trim()).unwrap_or_default();
|
2020-02-12 07:51:26 +01:00
|
|
|
let (js_data_url, _) = retrieve_asset(
|
2019-10-23 00:33:22 +02:00
|
|
|
cache,
|
2019-12-10 03:13:25 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2019-09-22 02:06:00 +02:00
|
|
|
&src_full_url,
|
|
|
|
true,
|
|
|
|
"application/javascript",
|
|
|
|
opt_silent,
|
|
|
|
)
|
2019-12-11 07:12:57 +01:00
|
|
|
.unwrap_or((str!(), str!()));
|
2019-08-23 05:17:15 +02:00
|
|
|
attr.value.clear();
|
2020-02-12 07:51:26 +01:00
|
|
|
attr.value.push_slice(js_data_url.as_str());
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-09-22 02:06:00 +02:00
|
|
|
"style" => {
|
|
|
|
if opt_no_css {
|
2019-09-29 23:15:49 +02:00
|
|
|
// Empty inner content of STYLE tags
|
2019-09-22 02:06:00 +02:00
|
|
|
node.children.borrow_mut().clear();
|
2019-12-06 02:05:52 +01:00
|
|
|
} else {
|
|
|
|
for node in node.children.borrow_mut().iter_mut() {
|
2019-12-06 02:41:43 +01:00
|
|
|
if let NodeData::Text { ref contents } = node.data {
|
2019-12-06 02:20:09 +01:00
|
|
|
let mut tendril = contents.borrow_mut();
|
2020-03-29 09:54:20 +02:00
|
|
|
let replacement = embed_css(
|
2019-12-06 02:20:09 +01:00
|
|
|
cache,
|
2019-12-13 01:29:21 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2020-03-29 09:54:20 +02:00
|
|
|
tendril.as_ref(),
|
2020-04-22 09:37:02 +02:00
|
|
|
opt_no_fonts,
|
2019-12-06 20:59:13 +01:00
|
|
|
opt_no_images,
|
2019-12-06 02:20:09 +01:00
|
|
|
opt_silent,
|
|
|
|
);
|
|
|
|
tendril.clear();
|
2019-12-09 18:41:21 +01:00
|
|
|
tendril.push_slice(&replacement);
|
2019-12-06 02:05:52 +01:00
|
|
|
}
|
|
|
|
}
|
2019-09-22 02:06:00 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
"form" => {
|
2019-08-23 09:26:05 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "action" {
|
2020-01-10 06:27:15 +01:00
|
|
|
let attr_value = attr.value.trim();
|
2019-09-29 23:15:49 +02:00
|
|
|
// Modify action to be a full URL
|
2020-02-12 07:59:21 +01:00
|
|
|
if !is_http_url(attr_value) {
|
2020-01-04 00:05:02 +01:00
|
|
|
let href_full_url =
|
2020-01-10 06:27:15 +01:00
|
|
|
resolve_url(&url, attr_value).unwrap_or_default();
|
2019-09-29 23:15:49 +02:00
|
|
|
attr.value.clear();
|
|
|
|
attr.value.push_slice(href_full_url.as_str());
|
2019-08-23 09:26:05 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2020-02-24 06:06:31 +01:00
|
|
|
"frame" | "iframe" => {
|
2019-09-29 23:15:49 +02:00
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "src" {
|
|
|
|
if opt_no_frames {
|
|
|
|
// Empty the src attribute
|
2019-09-22 02:06:00 +02:00
|
|
|
attr.value.clear();
|
2019-09-29 23:15:49 +02:00
|
|
|
continue;
|
2019-08-27 04:57:10 +02:00
|
|
|
}
|
|
|
|
|
2020-02-24 06:06:31 +01:00
|
|
|
let frame_src = attr.value.trim();
|
2019-09-22 02:06:00 +02:00
|
|
|
|
2020-02-24 06:06:31 +01:00
|
|
|
// Ignore (i)frames with empty source — they cause infinite loops
|
|
|
|
if frame_src.is_empty() {
|
2019-09-29 23:15:49 +02:00
|
|
|
continue;
|
2019-09-22 02:06:00 +02:00
|
|
|
}
|
2019-09-29 23:15:49 +02:00
|
|
|
|
2020-02-24 06:06:31 +01:00
|
|
|
let src_full_url = resolve_url(&url, frame_src).unwrap_or_default();
|
|
|
|
let (frame_data, frame_final_url) = retrieve_asset(
|
2019-10-23 00:33:22 +02:00
|
|
|
cache,
|
2019-12-10 03:13:25 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2019-09-29 23:15:49 +02:00
|
|
|
&src_full_url,
|
|
|
|
false,
|
|
|
|
"text/html",
|
|
|
|
opt_silent,
|
|
|
|
)
|
2019-12-11 07:12:57 +01:00
|
|
|
.unwrap_or((str!(), src_full_url));
|
2020-02-24 06:06:31 +01:00
|
|
|
let dom = html_to_dom(&frame_data);
|
2019-09-29 23:15:49 +02:00
|
|
|
walk_and_embed_assets(
|
2019-10-23 00:33:22 +02:00
|
|
|
cache,
|
2019-12-10 03:13:25 +01:00
|
|
|
client,
|
2020-02-24 06:06:31 +01:00
|
|
|
&frame_final_url,
|
2019-09-29 23:15:49 +02:00
|
|
|
&dom.document,
|
|
|
|
opt_no_css,
|
2020-04-22 09:37:02 +02:00
|
|
|
opt_no_fonts,
|
|
|
|
opt_no_frames,
|
2019-09-29 23:15:49 +02:00
|
|
|
opt_no_js,
|
|
|
|
opt_no_images,
|
|
|
|
opt_silent,
|
|
|
|
);
|
|
|
|
let mut buf: Vec<u8> = Vec::new();
|
|
|
|
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
2020-03-29 09:54:20 +02:00
|
|
|
let iframe_data_url = data_to_data_url("text/html", &buf, "", "");
|
2019-09-29 23:15:49 +02:00
|
|
|
attr.value.clear();
|
2020-02-12 07:51:26 +01:00
|
|
|
attr.value.push_slice(iframe_data_url.as_str());
|
2019-08-24 02:16:16 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-09-22 18:57:50 +02:00
|
|
|
"video" => {
|
|
|
|
for attr in attrs_mut.iter_mut() {
|
|
|
|
if &attr.name.local == "poster" {
|
2020-01-10 06:27:15 +01:00
|
|
|
let video_poster = attr.value.trim();
|
2019-09-22 18:57:50 +02:00
|
|
|
|
2019-09-29 23:15:49 +02:00
|
|
|
// Skip posters with empty source
|
2020-01-04 00:05:02 +01:00
|
|
|
if video_poster.is_empty() {
|
2019-09-22 18:57:50 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if opt_no_images {
|
|
|
|
attr.value.clear();
|
|
|
|
} else {
|
2020-01-03 23:58:29 +01:00
|
|
|
let poster_full_url =
|
2020-01-04 00:05:02 +01:00
|
|
|
resolve_url(&url, video_poster).unwrap_or_default();
|
2020-02-12 07:51:26 +01:00
|
|
|
let (poster_data_url, _) = retrieve_asset(
|
2019-10-23 00:33:22 +02:00
|
|
|
cache,
|
2019-12-10 03:13:25 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2019-09-22 18:57:50 +02:00
|
|
|
&poster_full_url,
|
|
|
|
true,
|
|
|
|
"",
|
|
|
|
opt_silent,
|
|
|
|
)
|
2019-12-11 07:12:57 +01:00
|
|
|
.unwrap_or((poster_full_url, str!()));
|
2019-09-22 18:57:50 +02:00
|
|
|
attr.value.clear();
|
2020-02-12 07:51:26 +01:00
|
|
|
attr.value.push_slice(poster_data_url.as_str());
|
2019-09-22 18:57:50 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-08-25 05:06:40 +02:00
|
|
|
_ => {}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
|
2019-12-06 21:28:08 +01:00
|
|
|
// Process style attributes
|
2019-09-22 02:06:00 +02:00
|
|
|
if opt_no_css {
|
|
|
|
// Get rid of style attributes
|
2019-09-29 23:15:49 +02:00
|
|
|
let mut style_attr_indexes = Vec::new();
|
|
|
|
for (i, attr) in attrs_mut.iter_mut().enumerate() {
|
2020-03-29 09:54:20 +02:00
|
|
|
if attr.name.local.as_ref().eq_ignore_ascii_case("style") {
|
2019-09-29 23:15:49 +02:00
|
|
|
style_attr_indexes.push(i);
|
2019-09-22 02:06:00 +02:00
|
|
|
}
|
|
|
|
}
|
2019-09-29 23:15:49 +02:00
|
|
|
style_attr_indexes.reverse();
|
|
|
|
for attr_index in style_attr_indexes {
|
|
|
|
attrs_mut.remove(attr_index);
|
|
|
|
}
|
2019-12-06 21:28:08 +01:00
|
|
|
} else {
|
|
|
|
// Otherwise, parse any links found in the attributes
|
|
|
|
for attribute in attrs_mut
|
|
|
|
.iter_mut()
|
|
|
|
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
|
|
|
|
{
|
2020-03-29 09:54:20 +02:00
|
|
|
let replacement = embed_css(
|
2019-12-06 21:28:08 +01:00
|
|
|
cache,
|
2019-12-13 01:29:21 +01:00
|
|
|
client,
|
2020-03-08 20:31:42 +01:00
|
|
|
&url,
|
2020-03-29 09:54:20 +02:00
|
|
|
attribute.value.as_ref(),
|
2020-04-22 09:37:02 +02:00
|
|
|
opt_no_fonts,
|
2019-12-06 21:28:08 +01:00
|
|
|
opt_no_images,
|
|
|
|
opt_silent,
|
|
|
|
);
|
2020-03-29 09:54:20 +02:00
|
|
|
// let replacement = str!();
|
2019-12-06 21:28:08 +01:00
|
|
|
attribute.value.clear();
|
2019-12-09 18:41:21 +01:00
|
|
|
attribute.value.push_slice(&replacement);
|
2019-12-06 21:28:08 +01:00
|
|
|
}
|
2019-09-22 02:06:00 +02:00
|
|
|
}
|
|
|
|
|
2019-08-23 05:17:15 +02:00
|
|
|
if opt_no_js {
|
|
|
|
// Get rid of JS event attributes
|
2019-09-29 23:15:49 +02:00
|
|
|
let mut js_attr_indexes = Vec::new();
|
|
|
|
for (i, attr) in attrs_mut.iter_mut().enumerate() {
|
|
|
|
if attr_is_event_handler(&attr.name.local) {
|
|
|
|
js_attr_indexes.push(i);
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
2019-09-29 23:15:49 +02:00
|
|
|
js_attr_indexes.reverse();
|
|
|
|
for attr_index in js_attr_indexes {
|
|
|
|
attrs_mut.remove(attr_index);
|
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
|
|
|
|
// Dig deeper
|
|
|
|
for child in node.children.borrow().iter() {
|
2019-08-23 20:33:18 +02:00
|
|
|
walk_and_embed_assets(
|
2019-10-23 00:33:22 +02:00
|
|
|
cache,
|
2019-12-10 03:13:25 +01:00
|
|
|
client,
|
2019-09-22 02:06:00 +02:00
|
|
|
&url,
|
|
|
|
child,
|
|
|
|
opt_no_css,
|
2020-04-22 09:37:02 +02:00
|
|
|
opt_no_fonts,
|
|
|
|
opt_no_frames,
|
2019-09-22 02:06:00 +02:00
|
|
|
opt_no_js,
|
|
|
|
opt_no_images,
|
|
|
|
opt_silent,
|
|
|
|
);
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
2019-08-23 20:24:45 +02:00
|
|
|
}
|
2019-09-22 02:06:00 +02:00
|
|
|
_ => {
|
|
|
|
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
|
|
|
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
|
|
|
// since that's not part of W3C standard and therefore gets ignored
|
|
|
|
// by browsers other than IE [5, 9]
|
|
|
|
}
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
|
|
|
parse_document(RcDom::default(), Default::default())
|
|
|
|
.from_utf8()
|
|
|
|
.read_from(&mut data.as_bytes())
|
|
|
|
.unwrap()
|
|
|
|
}
|
|
|
|
|
2019-09-22 02:06:00 +02:00
|
|
|
fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
|
|
|
|
let children = handle.children.borrow();
|
|
|
|
let matching_children = children.iter().find(|child| match child.data {
|
|
|
|
NodeData::Element { ref name, .. } => &*name.local == node_name,
|
|
|
|
_ => false,
|
|
|
|
});
|
|
|
|
match matching_children {
|
|
|
|
Some(node) => node.clone(),
|
2020-01-03 23:49:26 +01:00
|
|
|
_ => handle.clone(),
|
2019-09-22 02:06:00 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn stringify_document(
|
|
|
|
handle: &Handle,
|
|
|
|
opt_no_css: bool,
|
|
|
|
opt_no_frames: bool,
|
|
|
|
opt_no_js: bool,
|
|
|
|
opt_no_images: bool,
|
|
|
|
opt_isolate: bool,
|
|
|
|
) -> String {
|
|
|
|
let mut buf: Vec<u8> = Vec::new();
|
|
|
|
serialize(&mut buf, handle, SerializeOpts::default())
|
|
|
|
.expect("unable to serialize DOM into buffer");
|
|
|
|
|
2020-01-04 08:33:11 +01:00
|
|
|
let mut result = String::from_utf8(buf).unwrap();
|
2019-09-22 02:06:00 +02:00
|
|
|
|
|
|
|
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
|
|
|
|
let mut buf: Vec<u8> = Vec::new();
|
|
|
|
let mut dom = html_to_dom(&result);
|
|
|
|
let doc = dom.get_document();
|
|
|
|
let html = get_child_node_by_name(&doc, "html");
|
|
|
|
let head = get_child_node_by_name(&html, "head");
|
2019-12-11 07:12:57 +01:00
|
|
|
let mut content_attr = str!();
|
2019-09-29 23:15:49 +02:00
|
|
|
if opt_isolate {
|
|
|
|
content_attr += " default-src 'unsafe-inline' data:;";
|
|
|
|
}
|
|
|
|
if opt_no_css {
|
|
|
|
content_attr += " style-src 'none';";
|
|
|
|
}
|
|
|
|
if opt_no_frames {
|
|
|
|
content_attr += " frame-src 'none';child-src 'none';";
|
2019-09-22 02:06:00 +02:00
|
|
|
}
|
2019-09-29 23:15:49 +02:00
|
|
|
if opt_no_js {
|
|
|
|
content_attr += " script-src 'none';";
|
|
|
|
}
|
|
|
|
if opt_no_images {
|
|
|
|
content_attr += " img-src data:;";
|
|
|
|
}
|
|
|
|
|
|
|
|
let meta = dom.create_element(
|
|
|
|
QualName::new(None, ns!(), local_name!("meta")),
|
|
|
|
vec![
|
|
|
|
Attribute {
|
|
|
|
name: QualName::new(None, ns!(), local_name!("http-equiv")),
|
|
|
|
value: format_tendril!("Content-Security-Policy"),
|
|
|
|
},
|
|
|
|
Attribute {
|
|
|
|
name: QualName::new(None, ns!(), local_name!("content")),
|
2020-01-04 00:05:02 +01:00
|
|
|
value: format_tendril!("{}", content_attr.trim()),
|
2019-09-29 23:15:49 +02:00
|
|
|
},
|
|
|
|
],
|
|
|
|
Default::default(),
|
|
|
|
);
|
|
|
|
head.children.borrow_mut().reverse();
|
|
|
|
head.children.borrow_mut().push(meta.clone());
|
|
|
|
head.children.borrow_mut().reverse();
|
|
|
|
// Note: the CSP meta-tag has to be prepended, never appended,
|
|
|
|
// since there already may be one defined in the document,
|
|
|
|
// and browsers don't allow re-defining them (for obvious reasons)
|
|
|
|
|
2019-09-22 02:06:00 +02:00
|
|
|
serialize(&mut buf, &doc, SerializeOpts::default())
|
|
|
|
.expect("unable to serialize DOM into buffer");
|
|
|
|
result = String::from_utf8(buf).unwrap();
|
|
|
|
// Note: we can't make it isolate the page right away since it may have no HEAD element,
|
|
|
|
// ergo we have to serialize, parse DOM again, and then finally serialize the result
|
|
|
|
}
|
|
|
|
|
|
|
|
result
|
2019-08-23 05:17:15 +02:00
|
|
|
}
|