Treat network errors as empty results

This commit is contained in:
Vincent Flyson 2019-08-24 23:06:40 -04:00
parent 8aa688203c
commit 50f1ba1ce8
3 changed files with 108 additions and 104 deletions

View file

@ -1,6 +1,6 @@
[package] [package]
name = "monolith" name = "monolith"
version = "2.0.12" version = "2.0.13"
authors = [ authors = [
"Sunshine <sunshine@uberspace.net>", "Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>", "Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",

View file

@ -9,19 +9,11 @@ use std::io;
use utils::data_to_dataurl; use utils::data_to_dataurl;
lazy_static! { lazy_static! {
static ref EMPTY_STRING: String = String::new();
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
} static ref ICON_VALUES: Regex = Regex::new(
r"^icon|shortcut icon|mask-icon|apple-touch-icon$"
enum NodeMatch { ).unwrap();
Icon,
Image,
Source,
StyleSheet,
Anchor,
Script,
Form,
IFrame,
Other,
} }
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\ const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
@ -60,15 +52,11 @@ fn get_parent_node_name(node: &Handle) -> String {
let parent_node = parent.and_then(|node| node.upgrade()).unwrap(); let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
match &parent_node.data { match &parent_node.data {
NodeData::Document => {"".to_string()} NodeData::Document => { EMPTY_STRING.clone() }
NodeData::Doctype { .. } => {"".to_string()} NodeData::Doctype { .. } => { EMPTY_STRING.clone() }
NodeData::Text { .. } => {"".to_string()} NodeData::Text { .. } => { EMPTY_STRING.clone() }
NodeData::Comment { .. } => {"".to_string()} NodeData::Comment { .. } => { EMPTY_STRING.clone() }
NodeData::Element { NodeData::Element { ref name, attrs: _, .. } => {
ref name,
attrs: _,
..
} => {
name.local.as_ref().to_string() name.local.as_ref().to_string()
} }
NodeData::ProcessingInstruction { .. } => unreachable!() NodeData::ProcessingInstruction { .. } => unreachable!()
@ -94,91 +82,117 @@ pub fn walk_and_embed_assets(
); );
} }
} }
NodeData::Doctype { .. } => {} NodeData::Doctype { .. } => {}
NodeData::Text { .. } => {} NodeData::Text { .. } => {}
NodeData::Comment { .. } => { NodeData::Comment { .. } => {
// Note: in case of opt_no_js being set to true, there's no need to worry about // Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>... // getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and therefore gets ignored // since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9] // by browsers other than IE [5, 9]
} }
NodeData::Element { NodeData::Element {
ref name, ref name,
ref attrs, ref attrs,
.. ..
} => { } => {
let attrs_mut = &mut attrs.borrow_mut(); let attrs_mut = &mut attrs.borrow_mut();
let mut found = NodeMatch::Other;
match name.local.as_ref() { match name.local.as_ref() {
"link" => { "link" => {
let mut link_type = "";
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" { if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) { if is_icon(&attr.value.to_string()) {
found = NodeMatch::Icon; link_type = "icon";
break; break;
} else if attr.value.to_string() == "stylesheet" { } else if attr.value.to_string() == "stylesheet" {
found = NodeMatch::StyleSheet; link_type = "stylesheet";
break; break;
} }
} }
} }
}
"img" => { found = NodeMatch::Image; }
"source" => { found = NodeMatch::Source; }
"a" => { found = NodeMatch::Anchor; }
"script" => { found = NodeMatch::Script; }
"form" => { found = NodeMatch::Form; }
"iframe" => { found = NodeMatch::IFrame; }
_ => {}
}
match found { if link_type == "icon" {
NodeMatch::Icon => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" { if &attr.name.local == "href" {
if opt_no_images { if opt_no_images {
attr.value.clear(); attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL); attr.value.push_slice(TRANSPARENT_PIXEL);
} else { } else {
let href_full_url = resolve_url(&url, &attr.value.to_string()); let href_full_url: String = resolve_url(
&url,
&attr.value.to_string()
)
.unwrap_or(EMPTY_STRING.clone());
let favicon_datauri = retrieve_asset( let favicon_datauri = retrieve_asset(
&href_full_url.unwrap(), &href_full_url,
true, true,
"", "",
opt_user_agent, opt_user_agent,
); ).unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(favicon_datauri.unwrap().as_str()); attr.value.push_slice(favicon_datauri.as_str());
}
}
}
} else if link_type == "stylesheet" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let css_datauri = retrieve_asset(
&href_full_url,
true,
"text/css",
opt_user_agent,
).unwrap_or(EMPTY_STRING.clone());
attr.value.clear();
attr.value.push_slice(css_datauri.as_str());
}
}
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
} }
} }
} }
} }
NodeMatch::Image => { "img" => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" { if &attr.name.local == "src" {
if opt_no_images { if opt_no_images {
attr.value.clear(); attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL); attr.value.push_slice(TRANSPARENT_PIXEL);
} else { } else {
let src_full_url = resolve_url(&url, &attr.value.to_string()); let src_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let img_datauri = retrieve_asset( let img_datauri = retrieve_asset(
&src_full_url.unwrap(), &src_full_url,
true, true,
"", "",
opt_user_agent, opt_user_agent,
); ).unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(img_datauri.unwrap().as_str()); attr.value.push_slice(img_datauri.as_str());
} }
} }
} }
} }
NodeMatch::Source => { "source" => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "srcset" { if &attr.name.local == "srcset" {
if get_parent_node_name(&node) == "picture" { if get_parent_node_name(&node) == "picture" {
@ -186,21 +200,25 @@ pub fn walk_and_embed_assets(
attr.value.clear(); attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL); attr.value.push_slice(TRANSPARENT_PIXEL);
} else { } else {
let srcset_full_url = resolve_url(&url, &attr.value.to_string()); let srcset_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let source_datauri = retrieve_asset( let source_datauri = retrieve_asset(
&srcset_full_url.unwrap(), &srcset_full_url,
true, true,
"", "",
opt_user_agent, opt_user_agent,
); ).unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(source_datauri.unwrap().as_str()); attr.value.push_slice(source_datauri.as_str());
} }
} }
} }
} }
} }
NodeMatch::Anchor => { "a" => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" { if &attr.name.local == "href" {
// Don't touch email links or hrefs which begin with a hash sign // Don't touch email links or hrefs which begin with a hash sign
@ -208,28 +226,14 @@ pub fn walk_and_embed_assets(
continue; continue;
} }
let href_full_url = resolve_url(&url, &attr.value.to_string()); let href_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(href_full_url.unwrap().as_str()); attr.value.push_slice(href_full_url.as_str());
} }
} }
} }
NodeMatch::StyleSheet => { "script" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url = resolve_url(&url, &attr.value.to_string());
let css_datauri = retrieve_asset(
&href_full_url.unwrap(),
true,
"text/css",
opt_user_agent,
);
attr.value.clear();
attr.value.push_slice(css_datauri.unwrap().as_str());
}
}
}
NodeMatch::Script => {
if opt_no_js { if opt_no_js {
// Get rid of src and inner content of SCRIPT tags // Get rid of src and inner content of SCRIPT tags
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
@ -241,20 +245,24 @@ pub fn walk_and_embed_assets(
} else { } else {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" { if &attr.name.local == "src" {
let src_full_url = resolve_url(&url, &attr.value.to_string()); let src_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let js_datauri = retrieve_asset( let js_datauri = retrieve_asset(
&src_full_url.unwrap(), &src_full_url,
true, true,
"application/javascript", "application/javascript",
opt_user_agent, opt_user_agent,
); ).unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(js_datauri.unwrap().as_str()); attr.value.push_slice(js_datauri.as_str());
} }
} }
} }
} }
NodeMatch::Form => { "form" => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" { if &attr.name.local == "action" {
// Do not touch action props which are set to a URL // Do not touch action props which are set to a URL
@ -262,23 +270,25 @@ pub fn walk_and_embed_assets(
continue; continue;
} }
let href_full_url = resolve_url(&url, &attr.value.to_string()); let href_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear(); attr.value.clear();
attr.value.push_slice(href_full_url.unwrap().as_str()); attr.value.push_slice(href_full_url.as_str());
} }
} }
} }
NodeMatch::IFrame => { "iframe" => {
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" { if &attr.name.local == "src" {
let src_full_url = resolve_url(&url, &attr.value.to_string()).unwrap(); let src_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let iframe_data = retrieve_asset( let iframe_data = retrieve_asset(
&src_full_url, &src_full_url,
false, false,
"text/html", "text/html",
opt_user_agent, opt_user_agent,
); ).unwrap_or(EMPTY_STRING.clone());
let dom = html_to_dom(&iframe_data.unwrap()); let dom = html_to_dom(&iframe_data);
walk_and_embed_assets( walk_and_embed_assets(
&src_full_url, &src_full_url,
&dom.document, &dom.document,
@ -294,7 +304,7 @@ pub fn walk_and_embed_assets(
} }
} }
} }
NodeMatch::Other => {} _ => {}
} }
if opt_no_js { if opt_no_js {
@ -317,7 +327,6 @@ pub fn walk_and_embed_assets(
); );
} }
} }
NodeData::ProcessingInstruction { .. } => unreachable!() NodeData::ProcessingInstruction { .. } => unreachable!()
} }
} }
@ -338,10 +347,7 @@ pub fn print_dom(handle: &Handle) {
} }
fn is_icon(attr_value: &str) -> bool { fn is_icon(attr_value: &str) -> bool {
attr_value == "icon" ICON_VALUES.is_match(&attr_value.to_lowercase())
|| attr_value == "shortcut icon"
|| attr_value == "mask-icon"
|| attr_value == "apple-touch-icon"
} }
#[cfg(test)] #[cfg(test)]

View file

@ -9,10 +9,10 @@ lazy_static! {
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
} }
pub fn is_data_url(url: &str) -> Result<bool, String> { pub fn is_data_url(url: &str) -> Result<bool, ParseError> {
match Url::parse(url) { match Url::parse(url) {
Ok(parsed_url) => Ok(parsed_url.scheme() == "data"), Ok(parsed_url) => Ok(parsed_url.scheme() == "data"),
Err(err) => Err(format!("{}", err)), Err(err) => Err(err),
} }
} }
@ -42,13 +42,11 @@ pub fn retrieve_asset(
} else { } else {
let client = Client::builder() let client = Client::builder()
.timeout(Duration::from_secs(10)) .timeout(Duration::from_secs(10))
.build() .build()?;
.unwrap();
let mut response = client let mut response = client
.get(url) .get(url)
.header(USER_AGENT, opt_user_agent) .header(USER_AGENT, opt_user_agent)
.send() .send()?;
.unwrap();
let final_url = response.url().as_str(); let final_url = response.url().as_str();
if url == final_url { if url == final_url {