Add CSP isolation, no CSS, and no iframe options

This commit is contained in:
Vincent Flyson 2019-09-21 20:06:00 -04:00
parent ac79a52da0
commit 88a230872c
8 changed files with 571 additions and 183 deletions

View file

@ -12,6 +12,10 @@ rust:
- beta - beta
- nightly - nightly
before_script:
- rustup component add rustfmt
script: script:
- cargo build --verbose - cargo build --verbose
- cargo test --verbose - cargo test --verbose
- cargo fmt --all -- --check

View file

@ -1,6 +1,6 @@
[package] [package]
name = "monolith" name = "monolith"
version = "2.0.17" version = "2.0.18"
authors = [ authors = [
"Sunshine <sunshine@uberspace.net>", "Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>", "Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",

View file

@ -1,5 +1,6 @@
[![Travis CI Build Status](https://travis-ci.org/Y2Z/monolith.svg?branch=master)](https://travis-ci.org/Y2Z/monolith) [![Travis CI Build Status](https://travis-ci.org/Y2Z/monolith.svg?branch=master)](https://travis-ci.org/Y2Z/monolith)
[![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/j1v1d96sw952b1ch?svg=true)](https://ci.appveyor.com/project/vflyson/monolith) [![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/ae7soyjih8jg2bv7/branch/master?svg=true)](https://ci.appveyor.com/project/snshn/monolith/branch/master)
# monolith # monolith
@ -21,7 +22,10 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html $ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
### Options ### Options
- `-c`: Ignore styles
- `-f`: Exclude iframes
- `-i`: Remove images - `-i`: Remove images
- `-I`: Isolate document (cut it off from the Internet via a Content-Security-Policy)
- `-j`: Exclude JavaScript - `-j`: Exclude JavaScript
- `-k`: Accept invalid X.509 (TLS) certificates - `-k`: Accept invalid X.509 (TLS) certificates
- `-s`: Silent mode - `-s`: Silent mode

View file

@ -114,6 +114,7 @@ install:
- if defined MINGW_PATH set PATH=%PATH%;%MINGW_PATH% - if defined MINGW_PATH set PATH=%PATH%;%MINGW_PATH%
- rustc -vV - rustc -vV
- cargo -vV - cargo -vV
- rustup component add rustfmt
## Build Script ## ## Build Script ##
@ -125,4 +126,5 @@ build: false
#directly or perform other testing commands. Rust will automatically be placed in the PATH #directly or perform other testing commands. Rust will automatically be placed in the PATH
# environment variable. # environment variable.
test_script: test_script:
- cargo test --verbose %cargoflags% - cargo test --verbose %cargoflags%
- cargo fmt --all -- --check

View file

@ -1,23 +1,25 @@
use html5ever::interface::QualName;
use html5ever::parse_document; use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::TendrilSink; use html5ever::tendril::{format_tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::{is_valid_url, resolve_url, retrieve_asset}; use http::{is_valid_url, resolve_url, retrieve_asset};
use regex::Regex; use regex::Regex;
use std::default::Default; use std::default::Default;
use std::io;
use utils::data_to_dataurl; use utils::data_to_dataurl;
lazy_static! { lazy_static! {
static ref EMPTY_STRING: String = String::new(); static ref EMPTY_STRING: String = String::new();
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
static ref ICON_VALUES: Regex = Regex::new( static ref ICON_VALUES: Regex =
r"^icon|shortcut icon|mask-icon|apple-touch-icon|fluid-icon$" Regex::new(r"^icon|shortcut icon|mask-icon|apple-touch-icon|fluid-icon$").unwrap();
).unwrap();
} }
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\ const TRANSPARENT_PIXEL: &str =
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="; "data:image/png;base64,\
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
const JS_DOM_EVENT_ATTRS: [&str; 21] = [ const JS_DOM_EVENT_ATTRS: [&str; 21] = [
// Input // Input
@ -47,53 +49,46 @@ const JS_DOM_EVENT_ATTRS: [&str; 21] = [
"onresize", "onresize",
]; ];
fn get_parent_node_name(node: &Handle) -> String { fn get_parent_node(node: &Handle) -> Handle {
let parent = node.parent.take().clone(); let parent = node.parent.take().clone();
let parent_node = parent.and_then(|node| node.upgrade()).unwrap(); parent.and_then(|node| node.upgrade()).unwrap()
}
match &parent_node.data { fn get_node_name(node: &Handle) -> String {
NodeData::Document => { EMPTY_STRING.clone() } match &node.data {
NodeData::Doctype { .. } => { EMPTY_STRING.clone() } NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
NodeData::Text { .. } => { EMPTY_STRING.clone() } _ => EMPTY_STRING.clone(),
NodeData::Comment { .. } => { EMPTY_STRING.clone() }
NodeData::Element { ref name, attrs: _, .. } => {
name.local.as_ref().to_string()
}
NodeData::ProcessingInstruction { .. } => unreachable!()
} }
} }
pub fn walk_and_embed_assets( pub fn walk_and_embed_assets(
url: &str, url: &str,
node: &Handle, node: &Handle,
opt_no_css: bool,
opt_no_js: bool, opt_no_js: bool,
opt_no_images: bool, opt_no_images: bool,
opt_user_agent: &str, opt_user_agent: &str,
opt_silent: bool, opt_silent: bool,
opt_insecure: bool, opt_insecure: bool,
opt_no_frames: bool,
) { ) {
match node.data { match node.data {
NodeData::Document => { NodeData::Document => {
// Dig deeper // Dig deeper
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets( walk_and_embed_assets(
&url, child, &url,
opt_no_js, child,
opt_no_images, opt_no_css,
opt_user_agent, opt_no_js,
opt_silent, opt_no_images,
opt_insecure, opt_user_agent,
); opt_silent,
opt_insecure,
opt_no_frames,
);
} }
} }
NodeData::Doctype { .. } => {}
NodeData::Text { .. } => {}
NodeData::Comment { .. } => {
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9]
}
NodeData::Element { NodeData::Element {
ref name, ref name,
ref attrs, ref attrs,
@ -103,7 +98,7 @@ pub fn walk_and_embed_assets(
match name.local.as_ref() { match name.local.as_ref() {
"link" => { "link" => {
let mut link_type = ""; let mut link_type: &str = "";
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" { if &attr.name.local == "rel" {
@ -122,22 +117,19 @@ pub fn walk_and_embed_assets(
if &attr.name.local == "href" { if &attr.name.local == "href" {
if opt_no_images { if opt_no_images {
attr.value.clear(); attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else { } else {
let href_full_url: String = resolve_url( let href_full_url: String =
&url, resolve_url(&url, &attr.value.to_string())
&attr.value.to_string() .unwrap_or(EMPTY_STRING.clone());
)
.unwrap_or(EMPTY_STRING.clone());
let favicon_datauri = retrieve_asset( let favicon_datauri = retrieve_asset(
&href_full_url, &href_full_url,
true, true,
"", "",
opt_user_agent, opt_user_agent,
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(favicon_datauri.as_str()); attr.value.push_slice(favicon_datauri.as_str());
} }
@ -146,12 +138,13 @@ pub fn walk_and_embed_assets(
} else if link_type == "stylesheet" { } else if link_type == "stylesheet" {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" { if &attr.name.local == "href" {
let href_full_url: String = resolve_url( if opt_no_css {
&url, attr.value.clear();
&attr.value.to_string(), } else {
) let href_full_url: String =
.unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &attr.value.to_string())
let css_datauri = retrieve_asset( .unwrap_or(EMPTY_STRING.clone());
let css_datauri = retrieve_asset(
&href_full_url, &href_full_url,
true, true,
"text/css", "text/css",
@ -160,18 +153,17 @@ pub fn walk_and_embed_assets(
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(css_datauri.as_str()); attr.value.push_slice(css_datauri.as_str());
}
} }
} }
} else { } else {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" { if &attr.name.local == "href" {
let href_full_url: String = resolve_url( let href_full_url: String =
&url, resolve_url(&url, &attr.value.to_string())
&attr.value.to_string(), .unwrap_or(EMPTY_STRING.clone());
)
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(&href_full_url.as_str()); attr.value.push_slice(&href_full_url.as_str());
} }
@ -192,20 +184,17 @@ pub fn walk_and_embed_assets(
attr.value.clear(); attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL); attr.value.push_slice(TRANSPARENT_PIXEL);
} else { } else {
let src_full_url: String = resolve_url( let src_full_url: String =
&url, resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
&value,
)
.unwrap_or(EMPTY_STRING.clone());
let img_datauri = retrieve_asset( let img_datauri = retrieve_asset(
&src_full_url, &src_full_url,
true, true,
"", "",
opt_user_agent, opt_user_agent,
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(img_datauri.as_str()); attr.value.push_slice(img_datauri.as_str());
} }
@ -215,25 +204,23 @@ pub fn walk_and_embed_assets(
"source" => { "source" => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "srcset" { if &attr.name.local == "srcset" {
if get_parent_node_name(&node) == "picture" { if get_node_name(&get_parent_node(&node)) == "picture" {
if opt_no_images { if opt_no_images {
attr.value.clear(); attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL); attr.value.push_slice(TRANSPARENT_PIXEL);
} else { } else {
let srcset_full_url: String = resolve_url( let srcset_full_url: String =
&url, resolve_url(&url, &attr.value.to_string())
&attr.value.to_string(), .unwrap_or(EMPTY_STRING.clone());
)
.unwrap_or(EMPTY_STRING.clone());
let source_datauri = retrieve_asset( let source_datauri = retrieve_asset(
&srcset_full_url, &srcset_full_url,
true, true,
"", "",
opt_user_agent, opt_user_agent,
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(source_datauri.as_str()); attr.value.push_slice(source_datauri.as_str());
} }
@ -258,7 +245,7 @@ pub fn walk_and_embed_assets(
} }
"script" => { "script" => {
if opt_no_js { if opt_no_js {
// Get rid of src and inner content of SCRIPT tags // Empty src and inner content of SCRIPT tags
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" { if &attr.name.local == "src" {
attr.value.clear(); attr.value.clear();
@ -268,26 +255,30 @@ pub fn walk_and_embed_assets(
} else { } else {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" { if &attr.name.local == "src" {
let src_full_url: String = resolve_url( let src_full_url: String =
&url, resolve_url(&url, &attr.value.to_string())
&attr.value.to_string(), .unwrap_or(EMPTY_STRING.clone());
)
.unwrap_or(EMPTY_STRING.clone());
let js_datauri = retrieve_asset( let js_datauri = retrieve_asset(
&src_full_url, &src_full_url,
true, true,
"application/javascript", "application/javascript",
opt_user_agent, opt_user_agent,
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(js_datauri.as_str()); attr.value.push_slice(js_datauri.as_str());
} }
} }
} }
} }
"style" => {
if opt_no_css {
// Empty inner content of STYLE tags
node.children.borrow_mut().clear();
}
}
"form" => { "form" => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" { if &attr.name.local == "action" {
@ -304,18 +295,26 @@ pub fn walk_and_embed_assets(
} }
} }
"iframe" => { "iframe" => {
for attr in attrs_mut.iter_mut() { if opt_no_frames {
if &attr.name.local == "src" { // Empty the src attribute
let value = attr.value.to_string(); for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
// Ignore iframes with empty source (they cause infinite loops) attr.value.clear();
if value == EMPTY_STRING.clone() {
continue;
} }
}
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let iframe_src = attr.value.to_string();
let src_full_url: String = resolve_url(&url, &value) // Ignore iframes with empty source (they cause infinite loops)
.unwrap_or(EMPTY_STRING.clone()); if iframe_src == EMPTY_STRING.clone() {
let iframe_data = retrieve_asset( continue;
}
let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
let iframe_data = retrieve_asset(
&src_full_url, &src_full_url,
false, false,
"text/html", "text/html",
@ -324,27 +323,40 @@ pub fn walk_and_embed_assets(
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let dom = html_to_dom(&iframe_data); let dom = html_to_dom(&iframe_data);
walk_and_embed_assets( walk_and_embed_assets(
&src_full_url, &src_full_url,
&dom.document, &dom.document,
opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent, opt_user_agent,
opt_silent, opt_silent,
opt_insecure, opt_insecure,
opt_no_frames,
); );
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); serialize(&mut buf, &dom.document, SerializeOpts::default())
let iframe_datauri = data_to_dataurl("text/html", &buf); .unwrap();
attr.value.clear(); let iframe_datauri = data_to_dataurl("text/html", &buf);
attr.value.push_slice(iframe_datauri.as_str()); attr.value.clear();
attr.value.push_slice(iframe_datauri.as_str());
}
} }
} }
} }
_ => {} _ => {}
} }
if opt_no_css {
// Get rid of style attributes
for attr in attrs_mut.iter_mut() {
if attr.name.local.to_lowercase() == "style" {
attr.value.clear();
}
}
}
if opt_no_js { if opt_no_js {
// Get rid of JS event attributes // Get rid of JS event attributes
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
@ -357,17 +369,24 @@ pub fn walk_and_embed_assets(
// Dig deeper // Dig deeper
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets( walk_and_embed_assets(
&url, &url,
child, child,
opt_no_js, opt_no_css,
opt_no_images, opt_no_js,
opt_user_agent, opt_no_images,
opt_silent, opt_user_agent,
opt_insecure, opt_silent,
); opt_insecure,
opt_no_frames,
);
} }
} }
NodeData::ProcessingInstruction { .. } => unreachable!() _ => {
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9]
}
} }
} }
@ -382,8 +401,86 @@ pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
.unwrap() .unwrap()
} }
pub fn print_dom(handle: &Handle) { fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap(); let children = handle.children.borrow();
let matching_children = children.iter().find(|child| match child.data {
NodeData::Element { ref name, .. } => &*name.local == node_name,
_ => false,
});
match matching_children {
Some(node) => node.clone(),
_ => {
return handle.clone();
}
}
}
pub fn stringify_document(
handle: &Handle,
opt_no_css: bool,
opt_no_frames: bool,
opt_no_js: bool,
opt_no_images: bool,
opt_isolate: bool,
) -> String {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, handle, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
let mut result: String = String::from_utf8(buf).unwrap();
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
let mut buf: Vec<u8> = Vec::new();
let mut dom = html_to_dom(&result);
let doc = dom.get_document();
let html = get_child_node_by_name(&doc, "html");
let head = get_child_node_by_name(&html, "head");
{
let mut content_attr = EMPTY_STRING.clone();
if opt_isolate {
content_attr += "default-src 'unsafe-inline' data:;"
}
if opt_no_css {
content_attr += "style-src 'none';"
}
if opt_no_frames {
content_attr += "frame-src 'none';child-src 'none';"
}
if opt_no_js {
content_attr += "script-src 'none';"
}
if opt_no_images {
content_attr += "img-src data:;"
}
let meta = dom.create_element(
QualName::new(None, ns!(), local_name!("meta")),
vec![
Attribute {
name: QualName::new(None, ns!(), local_name!("http-equiv")),
value: format_tendril!("Content-Security-Policy"),
},
Attribute {
name: QualName::new(None, ns!(), local_name!("content")),
value: format_tendril!("{}", content_attr),
},
],
Default::default(),
);
head.children.borrow_mut().reverse();
head.children.borrow_mut().push(meta.clone());
head.children.borrow_mut().reverse();
// Note: the CSP meta-tag has to be prepended, never appended,
// since there already may be one defined in the document,
// and browsers don't allow re-defining them (for obvious reasons)
}
serialize(&mut buf, &doc, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
result = String::from_utf8(buf).unwrap();
// Note: we can't make it isolate the page right away since it may have no HEAD element,
// ergo we have to serialize, parse DOM again, and then finally serialize the result
}
result
} }
fn is_icon(attr_value: &str) -> bool { fn is_icon(attr_value: &str) -> bool {
@ -407,7 +504,10 @@ mod tests {
#[test] #[test]
fn test_has_protocol() { fn test_has_protocol() {
assert_eq!(has_protocol("mailto:somebody@somewhere.com?subject=hello"), true); assert_eq!(
has_protocol("mailto:somebody@somewhere.com?subject=hello"),
true
);
assert_eq!(has_protocol("tel:5551234567"), true); assert_eq!(has_protocol("tel:5551234567"), true);
assert_eq!(has_protocol("ftp:user:password@some-ftp-server.com"), true); assert_eq!(has_protocol("ftp:user:password@some-ftp-server.com"), true);
assert_eq!(has_protocol("javascript:void(0)"), true); assert_eq!(has_protocol("javascript:void(0)"), true);
@ -417,7 +517,10 @@ mod tests {
assert_eq!(has_protocol("some-hostname.com/some-file.html"), false); assert_eq!(has_protocol("some-hostname.com/some-file.html"), false);
assert_eq!(has_protocol("/some-file.html"), false); assert_eq!(has_protocol("/some-file.html"), false);
assert_eq!(has_protocol(""), false); assert_eq!(has_protocol(""), false);
assert_eq!(has_protocol("MAILTO:somebody@somewhere.com?subject=hello"), true); assert_eq!(
has_protocol("MAILTO:somebody@somewhere.com?subject=hello"),
true
);
} }
#[test] #[test]
@ -438,9 +541,9 @@ mod tests {
NodeData::Doctype { .. } => (), NodeData::Doctype { .. } => (),
NodeData::Text { .. } => (), NodeData::Text { .. } => (),
NodeData::Comment { .. } => (), NodeData::Comment { .. } => (),
NodeData::Element { ref name, attrs: _, .. } => { NodeData::Element { ref name, .. } => {
let node_name = name.local.as_ref().to_string(); let node_name = name.local.as_ref().to_string();
let parent_node_name = get_parent_node_name(node); let parent_node_name = get_node_name(&get_parent_node(node));
if node_name == "head" || node_name == "body" { if node_name == "head" || node_name == "body" {
assert_eq!(parent_node_name, "html"); assert_eq!(parent_node_name, "html");
} else if node_name == "div" { } else if node_name == "div" {
@ -455,7 +558,7 @@ mod tests {
test_walk(child, &mut *i); test_walk(child, &mut *i);
} }
} }
NodeData::ProcessingInstruction { .. } => unreachable!() NodeData::ProcessingInstruction { .. } => unreachable!(),
}; };
} }
@ -470,7 +573,24 @@ mod tests {
let dom = html_to_dom(&html); let dom = html_to_dom(&html);
let url = "http://localhost"; let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true); let opt_no_css: bool = false;
let opt_no_frames: bool = false;
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
walk_and_embed_assets(
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
@ -482,12 +602,29 @@ mod tests {
} }
#[test] #[test]
fn test_walk_and_embed_assets_iframe() { fn test_walk_and_embed_assets_no_recursive_iframe() {
let html = "<div><P></P><iframe src=\"\"></iframe></div>"; let html = "<div><P></P><iframe src=\"\"></iframe></div>";
let dom = html_to_dom(&html); let dom = html_to_dom(&html);
let url = "http://localhost"; let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true); let opt_no_css: bool = false;
let opt_no_frames: bool = false;
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
walk_and_embed_assets(
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
@ -499,19 +636,74 @@ mod tests {
} }
#[test] #[test]
fn test_walk_and_embed_assets_img() { fn test_walk_and_embed_assets_no_css() {
let html = "<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>"; let html = "<link rel=\"stylesheet\" href=\"main.css\">\
<style>html{background-color: #000;}</style>\
<div style=\"display: none;\"></div>";
let dom = html_to_dom(&html); let dom = html_to_dom(&html);
let url = "http://localhost"; let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true); let opt_no_css: bool = true;
let opt_no_frames: bool = false;
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
walk_and_embed_assets(
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!( assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(), buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div>\ "<html><head><link rel=\"stylesheet\" href=\"\"><style></style></head>\
<body><div style=\"\"></div></body></html>"
);
}
#[test]
fn test_walk_and_embed_assets_no_images() {
let html = "<link rel=\"icon\" href=\"favicon.ico\">\
<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
let dom = html_to_dom(&html);
let url = "http://localhost";
let opt_no_css: bool = false;
let opt_no_frames: bool = false;
let opt_no_js: bool = false;
let opt_no_images: bool = true;
let opt_silent = true;
let opt_insecure = false;
walk_and_embed_assets(
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head><link rel=\"icon\" href=\"\"></head><body><div>\
<img src=\"data:image/png;base64,\ <img src=\"data:image/png;base64,\
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\ iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\ lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
@ -520,21 +712,199 @@ mod tests {
} }
#[test] #[test]
fn test_walk_and_embed_assets_js() { fn test_walk_and_embed_assets_no_frames() {
let html = "<div><script src=\"http://localhost/assets/some.js\"></script>\ let html = "<iframe src=\"http://trackbook.com\"></iframe>";
<script>alert(1)</script></div>";
let dom = html_to_dom(&html); let dom = html_to_dom(&html);
let url = "http://localhost"; let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true); let opt_no_css: bool = false;
let opt_no_frames: bool = true;
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
walk_and_embed_assets(
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!( assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(), buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div><script src=\"\"></script>\ "<html><head></head><body><iframe src=\"\"></iframe></body></html>"
);
}
#[test]
fn test_walk_and_embed_assets_no_js() {
let html =
"<div onClick=\"void(0)\"><script src=\"http://localhost/assets/some.js\"></script>\
<script>alert(1)</script></div>";
let dom = html_to_dom(&html);
let url = "http://localhost";
let opt_no_css: bool = false;
let opt_no_frames: bool = false;
let opt_no_js: bool = true;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
walk_and_embed_assets(
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div onclick=\"\"><script src=\"\"></script>\
<script></script></div></body></html>" <script></script></div></body></html>"
); );
} }
#[test]
fn test_stringify_document() {
    // No option is set, so the output must not contain a CSP META tag
    let dom = html_to_dom("<div><script src=\"some.js\"></script></div>");

    let output = stringify_document(
        &dom.document,
        false, // opt_no_css
        false, // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        false, // opt_isolate
    );

    assert_eq!(
        output,
        "<html><head></head><body><div><script src=\"some.js\"></script></div></body></html>"
    );
}
#[test]
fn test_stringify_document_isolate() {
    // Only isolation is requested: the CSP META tag must lead the HEAD
    let dom = html_to_dom(
        "<title>Isolated document</title><link rel=\"something\"/>\
         <div><script src=\"some.js\"></script></div>",
    );

    let output = stringify_document(
        &dom.document,
        false, // opt_no_css
        false, // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        true,  // opt_isolate
    );

    assert_eq!(
        output,
        "<html>\
         <head>\
         <meta \
         http-equiv=\"Content-Security-Policy\" \
         content=\"default-src 'unsafe-inline' data:;\"></meta>\
         <title>Isolated document</title>\
         <link rel=\"something\">\
         </head>\
         <body><div><script src=\"some.js\"></script></div></body>\
         </html>"
    );
}
#[test]
fn test_stringify_document_no_css() {
    // Only the no-CSS option is set: a style-src policy is expected, while the
    // stylesheet link and the style attribute are left as they were
    let dom = html_to_dom(
        "<!doctype html>\
         <title>Unstyled document</title>\
         <link rel=\"stylesheet\" href=\"main.css\"/>\
         <div style=\"display: none;\"></div>",
    );

    let output = stringify_document(
        &dom.document,
        true,  // opt_no_css
        false, // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        false, // opt_isolate
    );

    assert_eq!(
        output,
        "<!DOCTYPE html>\
         <html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta>\
         <title>Unstyled document</title>\
         <link rel=\"stylesheet\" href=\"main.css\">\
         </head>\
         <body><div style=\"display: none;\"></div></body>\
         </html>"
    );
}
#[test]
fn test_stringify_document_no_frames() {
    // Only the no-frames option is set: frame-src and child-src are expected
    let dom = html_to_dom(
        "<!doctype html><title>Frameless document</title><link rel=\"something\"/>\
         <div><script src=\"some.js\"></script></div>",
    );

    let output = stringify_document(
        &dom.document,
        false, // opt_no_css
        true,  // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        false, // opt_isolate
    );

    assert_eq!(
        output,
        "<!DOCTYPE html>\
         <html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta>\
         <title>Frameless document</title>\
         <link rel=\"something\">\
         </head>\
         <body><div><script src=\"some.js\"></script></div></body>\
         </html>"
    );
}
} }

View file

@ -1,6 +1,6 @@
use regex::Regex; use regex::Regex;
use reqwest::Client;
use reqwest::header::{CONTENT_TYPE, USER_AGENT}; use reqwest::header::{CONTENT_TYPE, USER_AGENT};
use reqwest::Client;
use std::time::Duration; use std::time::Duration;
use url::{ParseError, Url}; use url::{ParseError, Url};
use utils::data_to_dataurl; use utils::data_to_dataurl;
@ -46,10 +46,7 @@ pub fn retrieve_asset(
.timeout(Duration::from_secs(10)) .timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(opt_insecure) .danger_accept_invalid_certs(opt_insecure)
.build()?; .build()?;
let mut response = client let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
.get(url)
.header(USER_AGENT, opt_user_agent)
.send()?;
let final_url = response.url().as_str(); let final_url = response.url().as_str();
if !opt_silent { if !opt_silent {
@ -102,19 +99,13 @@ mod tests {
#[test] #[test]
fn test_resolve_url() -> Result<(), ParseError> { fn test_resolve_url() -> Result<(), ParseError> {
let resolved_url = resolve_url( let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?;
"https://www.kernel.org",
"../category/signatures.html",
)?;
assert_eq!( assert_eq!(
resolved_url.as_str(), resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html" "https://www.kernel.org/category/signatures.html"
); );
let resolved_url = resolve_url( let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?;
"https://www.kernel.org",
"category/signatures.html",
)?;
assert_eq!( assert_eq!(
resolved_url.as_str(), resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html" "https://www.kernel.org/category/signatures.html"

View file

@ -3,7 +3,7 @@ extern crate clap;
extern crate monolith; extern crate monolith;
use clap::{App, Arg}; use clap::{App, Arg};
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets}; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::{is_valid_url, retrieve_asset}; use monolith::http::{is_valid_url, retrieve_asset};
static DEFAULT_USER_AGENT: &str = static DEFAULT_USER_AGENT: &str =
@ -21,43 +21,60 @@ fn main() {
.index(1) .index(1)
.help("URL to download"), .help("URL to download"),
) )
.args_from_usage("-i, --no-images 'Removes images'") .args_from_usage("-c, --no-css 'Ignore styles'")
.args_from_usage("-j, --no-js 'Excludes JavaScript'") .args_from_usage("-f, --no-frames 'Exclude iframes'")
.args_from_usage("-i, --no-images 'Remove images'")
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'") .args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
.args_from_usage("-s, --silent 'Suppress verbosity'") .args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'") .args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
.get_matches(); .get_matches();
// Process the command // Process the command
let arg_target = command.value_of("url").unwrap(); let arg_target: &str = command.value_of("url").unwrap();
let opt_no_images = command.is_present("no-images"); let opt_no_css: bool = command.is_present("no-css");
let opt_no_js = command.is_present("no-js"); let opt_no_frames: bool = command.is_present("no-frames");
let opt_insecure = command.is_present("insecure"); let opt_no_images: bool = command.is_present("no-images");
let opt_silent = command.is_present("silent"); let opt_no_js: bool = command.is_present("no-js");
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT); let opt_insecure: bool = command.is_present("insecure");
let opt_isolate: bool = command.is_present("isolate");
let opt_silent: bool = command.is_present("silent");
let opt_user_agent: &str = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
if is_valid_url(arg_target) { if is_valid_url(arg_target) {
let data = retrieve_asset( let data = retrieve_asset(
&arg_target, &arg_target,
false, false,
"", "",
opt_user_agent, opt_user_agent,
opt_silent, opt_silent,
opt_insecure, opt_insecure,
).unwrap(); )
.unwrap();
let dom = html_to_dom(&data); let dom = html_to_dom(&data);
walk_and_embed_assets( walk_and_embed_assets(
&arg_target, &arg_target,
&dom.document, &dom.document,
opt_no_css,
opt_no_js, opt_no_js,
opt_no_images, opt_no_images,
opt_user_agent, opt_user_agent,
opt_silent, opt_silent,
opt_insecure, opt_insecure,
opt_no_frames,
); );
print_dom(&dom.document); let html: String = stringify_document(
println!(); // Ensure newline at end of output &dom.document,
opt_no_css,
opt_no_frames,
opt_no_js,
opt_no_images,
opt_isolate,
);
println!("{}", html);
} }
} }

View file

@ -2,7 +2,7 @@ extern crate base64;
use self::base64::encode; use self::base64::encode;
static MAGIC: [[&[u8]; 2]; 19] = [ static MAGIC: [[&[u8]; 2]; 19] = [
// Image // Image
[b"GIF87a", b"image/gif"], [b"GIF87a", b"image/gif"],
[b"GIF89a", b"image/gif"], [b"GIF89a", b"image/gif"],
@ -39,7 +39,7 @@ pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
fn detect_mimetype(data: &[u8]) -> String { fn detect_mimetype(data: &[u8]) -> String {
let mut re = String::new(); let mut re = String::new();
for item in MAGIC.iter() { for item in MAGIC.iter() {
if data.starts_with(item[0]) { if data.starts_with(item[0]) {
re = String::from_utf8(item[1].to_vec()).unwrap(); re = String::from_utf8(item[1].to_vec()).unwrap();
break; break;