Treat network errors as empty results
This commit is contained in:
parent
8aa688203c
commit
50f1ba1ce8
3 changed files with 108 additions and 104 deletions
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "monolith"
|
name = "monolith"
|
||||||
version = "2.0.12"
|
version = "2.0.13"
|
||||||
authors = [
|
authors = [
|
||||||
"Sunshine <sunshine@uberspace.net>",
|
"Sunshine <sunshine@uberspace.net>",
|
||||||
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
||||||
|
|
184
src/html.rs
184
src/html.rs
|
@ -9,19 +9,11 @@ use std::io;
|
||||||
use utils::data_to_dataurl;
|
use utils::data_to_dataurl;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
|
static ref EMPTY_STRING: String = String::new();
|
||||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||||
}
|
static ref ICON_VALUES: Regex = Regex::new(
|
||||||
|
r"^icon|shortcut icon|mask-icon|apple-touch-icon$"
|
||||||
enum NodeMatch {
|
).unwrap();
|
||||||
Icon,
|
|
||||||
Image,
|
|
||||||
Source,
|
|
||||||
StyleSheet,
|
|
||||||
Anchor,
|
|
||||||
Script,
|
|
||||||
Form,
|
|
||||||
IFrame,
|
|
||||||
Other,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
|
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
|
||||||
|
@ -60,15 +52,11 @@ fn get_parent_node_name(node: &Handle) -> String {
|
||||||
let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
|
let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
|
||||||
|
|
||||||
match &parent_node.data {
|
match &parent_node.data {
|
||||||
NodeData::Document => {"".to_string()}
|
NodeData::Document => { EMPTY_STRING.clone() }
|
||||||
NodeData::Doctype { .. } => {"".to_string()}
|
NodeData::Doctype { .. } => { EMPTY_STRING.clone() }
|
||||||
NodeData::Text { .. } => {"".to_string()}
|
NodeData::Text { .. } => { EMPTY_STRING.clone() }
|
||||||
NodeData::Comment { .. } => {"".to_string()}
|
NodeData::Comment { .. } => { EMPTY_STRING.clone() }
|
||||||
NodeData::Element {
|
NodeData::Element { ref name, attrs: _, .. } => {
|
||||||
ref name,
|
|
||||||
attrs: _,
|
|
||||||
..
|
|
||||||
} => {
|
|
||||||
name.local.as_ref().to_string()
|
name.local.as_ref().to_string()
|
||||||
}
|
}
|
||||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||||
|
@ -94,91 +82,117 @@ pub fn walk_and_embed_assets(
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeData::Doctype { .. } => {}
|
NodeData::Doctype { .. } => {}
|
||||||
|
|
||||||
NodeData::Text { .. } => {}
|
NodeData::Text { .. } => {}
|
||||||
|
|
||||||
NodeData::Comment { .. } => {
|
NodeData::Comment { .. } => {
|
||||||
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
||||||
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
||||||
// since that's not part of W3C standard and therefore gets ignored
|
// since that's not part of W3C standard and therefore gets ignored
|
||||||
// by browsers other than IE [5, 9]
|
// by browsers other than IE [5, 9]
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeData::Element {
|
NodeData::Element {
|
||||||
ref name,
|
ref name,
|
||||||
ref attrs,
|
ref attrs,
|
||||||
..
|
..
|
||||||
} => {
|
} => {
|
||||||
let attrs_mut = &mut attrs.borrow_mut();
|
let attrs_mut = &mut attrs.borrow_mut();
|
||||||
let mut found = NodeMatch::Other;
|
|
||||||
|
|
||||||
match name.local.as_ref() {
|
match name.local.as_ref() {
|
||||||
"link" => {
|
"link" => {
|
||||||
|
let mut link_type = "";
|
||||||
|
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "rel" {
|
if &attr.name.local == "rel" {
|
||||||
if is_icon(&attr.value.to_string()) {
|
if is_icon(&attr.value.to_string()) {
|
||||||
found = NodeMatch::Icon;
|
link_type = "icon";
|
||||||
break;
|
break;
|
||||||
} else if attr.value.to_string() == "stylesheet" {
|
} else if attr.value.to_string() == "stylesheet" {
|
||||||
found = NodeMatch::StyleSheet;
|
link_type = "stylesheet";
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
"img" => { found = NodeMatch::Image; }
|
|
||||||
"source" => { found = NodeMatch::Source; }
|
|
||||||
"a" => { found = NodeMatch::Anchor; }
|
|
||||||
"script" => { found = NodeMatch::Script; }
|
|
||||||
"form" => { found = NodeMatch::Form; }
|
|
||||||
"iframe" => { found = NodeMatch::IFrame; }
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
|
|
||||||
match found {
|
if link_type == "icon" {
|
||||||
NodeMatch::Icon => {
|
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "href" {
|
if &attr.name.local == "href" {
|
||||||
if opt_no_images {
|
if opt_no_images {
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||||
} else {
|
} else {
|
||||||
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
let href_full_url: String = resolve_url(
|
||||||
|
&url,
|
||||||
|
&attr.value.to_string()
|
||||||
|
)
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let favicon_datauri = retrieve_asset(
|
let favicon_datauri = retrieve_asset(
|
||||||
&href_full_url.unwrap(),
|
&href_full_url,
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
opt_user_agent,
|
opt_user_agent,
|
||||||
);
|
).unwrap_or(EMPTY_STRING.clone());
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(favicon_datauri.unwrap().as_str());
|
attr.value.push_slice(favicon_datauri.as_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if link_type == "stylesheet" {
|
||||||
|
for attr in attrs_mut.iter_mut() {
|
||||||
|
if &attr.name.local == "href" {
|
||||||
|
let href_full_url: String = resolve_url(
|
||||||
|
&url,
|
||||||
|
&attr.value.to_string(),
|
||||||
|
)
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
|
let css_datauri = retrieve_asset(
|
||||||
|
&href_full_url,
|
||||||
|
true,
|
||||||
|
"text/css",
|
||||||
|
opt_user_agent,
|
||||||
|
).unwrap_or(EMPTY_STRING.clone());
|
||||||
|
attr.value.clear();
|
||||||
|
attr.value.push_slice(css_datauri.as_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for attr in attrs_mut.iter_mut() {
|
||||||
|
if &attr.name.local == "href" {
|
||||||
|
let href_full_url: String = resolve_url(
|
||||||
|
&url,
|
||||||
|
&attr.value.to_string(),
|
||||||
|
)
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
|
attr.value.clear();
|
||||||
|
attr.value.push_slice(&href_full_url.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NodeMatch::Image => {
|
"img" => {
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "src" {
|
if &attr.name.local == "src" {
|
||||||
if opt_no_images {
|
if opt_no_images {
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||||
} else {
|
} else {
|
||||||
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
let src_full_url: String = resolve_url(
|
||||||
|
&url,
|
||||||
|
&attr.value.to_string(),
|
||||||
|
)
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let img_datauri = retrieve_asset(
|
let img_datauri = retrieve_asset(
|
||||||
&src_full_url.unwrap(),
|
&src_full_url,
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
opt_user_agent,
|
opt_user_agent,
|
||||||
);
|
).unwrap_or(EMPTY_STRING.clone());
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(img_datauri.unwrap().as_str());
|
attr.value.push_slice(img_datauri.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NodeMatch::Source => {
|
"source" => {
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "srcset" {
|
if &attr.name.local == "srcset" {
|
||||||
if get_parent_node_name(&node) == "picture" {
|
if get_parent_node_name(&node) == "picture" {
|
||||||
|
@ -186,21 +200,25 @@ pub fn walk_and_embed_assets(
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||||
} else {
|
} else {
|
||||||
let srcset_full_url = resolve_url(&url, &attr.value.to_string());
|
let srcset_full_url: String = resolve_url(
|
||||||
|
&url,
|
||||||
|
&attr.value.to_string(),
|
||||||
|
)
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let source_datauri = retrieve_asset(
|
let source_datauri = retrieve_asset(
|
||||||
&srcset_full_url.unwrap(),
|
&srcset_full_url,
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
opt_user_agent,
|
opt_user_agent,
|
||||||
);
|
).unwrap_or(EMPTY_STRING.clone());
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(source_datauri.unwrap().as_str());
|
attr.value.push_slice(source_datauri.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NodeMatch::Anchor => {
|
"a" => {
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "href" {
|
if &attr.name.local == "href" {
|
||||||
// Don't touch email links or hrefs which begin with a hash sign
|
// Don't touch email links or hrefs which begin with a hash sign
|
||||||
|
@ -208,28 +226,14 @@ pub fn walk_and_embed_assets(
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(href_full_url.unwrap().as_str());
|
attr.value.push_slice(href_full_url.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NodeMatch::StyleSheet => {
|
"script" => {
|
||||||
for attr in attrs_mut.iter_mut() {
|
|
||||||
if &attr.name.local == "href" {
|
|
||||||
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
|
||||||
let css_datauri = retrieve_asset(
|
|
||||||
&href_full_url.unwrap(),
|
|
||||||
true,
|
|
||||||
"text/css",
|
|
||||||
opt_user_agent,
|
|
||||||
);
|
|
||||||
attr.value.clear();
|
|
||||||
attr.value.push_slice(css_datauri.unwrap().as_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NodeMatch::Script => {
|
|
||||||
if opt_no_js {
|
if opt_no_js {
|
||||||
// Get rid of src and inner content of SCRIPT tags
|
// Get rid of src and inner content of SCRIPT tags
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
|
@ -241,20 +245,24 @@ pub fn walk_and_embed_assets(
|
||||||
} else {
|
} else {
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "src" {
|
if &attr.name.local == "src" {
|
||||||
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
let src_full_url: String = resolve_url(
|
||||||
|
&url,
|
||||||
|
&attr.value.to_string(),
|
||||||
|
)
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let js_datauri = retrieve_asset(
|
let js_datauri = retrieve_asset(
|
||||||
&src_full_url.unwrap(),
|
&src_full_url,
|
||||||
true,
|
true,
|
||||||
"application/javascript",
|
"application/javascript",
|
||||||
opt_user_agent,
|
opt_user_agent,
|
||||||
);
|
).unwrap_or(EMPTY_STRING.clone());
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(js_datauri.unwrap().as_str());
|
attr.value.push_slice(js_datauri.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NodeMatch::Form => {
|
"form" => {
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "action" {
|
if &attr.name.local == "action" {
|
||||||
// Do not touch action props which are set to a URL
|
// Do not touch action props which are set to a URL
|
||||||
|
@ -262,23 +270,25 @@ pub fn walk_and_embed_assets(
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(href_full_url.unwrap().as_str());
|
attr.value.push_slice(href_full_url.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NodeMatch::IFrame => {
|
"iframe" => {
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "src" {
|
if &attr.name.local == "src" {
|
||||||
let src_full_url = resolve_url(&url, &attr.value.to_string()).unwrap();
|
let src_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||||
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let iframe_data = retrieve_asset(
|
let iframe_data = retrieve_asset(
|
||||||
&src_full_url,
|
&src_full_url,
|
||||||
false,
|
false,
|
||||||
"text/html",
|
"text/html",
|
||||||
opt_user_agent,
|
opt_user_agent,
|
||||||
);
|
).unwrap_or(EMPTY_STRING.clone());
|
||||||
let dom = html_to_dom(&iframe_data.unwrap());
|
let dom = html_to_dom(&iframe_data);
|
||||||
walk_and_embed_assets(
|
walk_and_embed_assets(
|
||||||
&src_full_url,
|
&src_full_url,
|
||||||
&dom.document,
|
&dom.document,
|
||||||
|
@ -294,7 +304,7 @@ pub fn walk_and_embed_assets(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NodeMatch::Other => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
if opt_no_js {
|
if opt_no_js {
|
||||||
|
@ -317,7 +327,6 @@ pub fn walk_and_embed_assets(
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -338,10 +347,7 @@ pub fn print_dom(handle: &Handle) {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_icon(attr_value: &str) -> bool {
|
fn is_icon(attr_value: &str) -> bool {
|
||||||
attr_value == "icon"
|
ICON_VALUES.is_match(&attr_value.to_lowercase())
|
||||||
|| attr_value == "shortcut icon"
|
|
||||||
|| attr_value == "mask-icon"
|
|
||||||
|| attr_value == "apple-touch-icon"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
10
src/http.rs
10
src/http.rs
|
@ -9,10 +9,10 @@ lazy_static! {
|
||||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_data_url(url: &str) -> Result<bool, String> {
|
pub fn is_data_url(url: &str) -> Result<bool, ParseError> {
|
||||||
match Url::parse(url) {
|
match Url::parse(url) {
|
||||||
Ok(parsed_url) => Ok(parsed_url.scheme() == "data"),
|
Ok(parsed_url) => Ok(parsed_url.scheme() == "data"),
|
||||||
Err(err) => Err(format!("{}", err)),
|
Err(err) => Err(err),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,13 +42,11 @@ pub fn retrieve_asset(
|
||||||
} else {
|
} else {
|
||||||
let client = Client::builder()
|
let client = Client::builder()
|
||||||
.timeout(Duration::from_secs(10))
|
.timeout(Duration::from_secs(10))
|
||||||
.build()
|
.build()?;
|
||||||
.unwrap();
|
|
||||||
let mut response = client
|
let mut response = client
|
||||||
.get(url)
|
.get(url)
|
||||||
.header(USER_AGENT, opt_user_agent)
|
.header(USER_AGENT, opt_user_agent)
|
||||||
.send()
|
.send()?;
|
||||||
.unwrap();
|
|
||||||
let final_url = response.url().as_str();
|
let final_url = response.url().as_str();
|
||||||
|
|
||||||
if url == final_url {
|
if url == final_url {
|
||||||
|
|
Loading…
Reference in a new issue