Properly handle 30x redirects

This commit is contained in:
Sunshine 2019-09-30 23:58:09 -04:00
parent b6ba22513d
commit 0896f2e214
5 changed files with 60 additions and 37 deletions

View file

@ -7,7 +7,6 @@ use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset;
use js::attr_is_event_handler;
use regex::Regex;
use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
@ -103,7 +102,7 @@ pub fn walk_and_embed_assets(
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let favicon_datauri = retrieve_asset(
let (favicon_dataurl, _) = retrieve_asset(
&href_full_url,
true,
"",
@ -111,9 +110,9 @@ pub fn walk_and_embed_assets(
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear();
attr.value.push_slice(favicon_datauri.as_str());
attr.value.push_slice(favicon_dataurl.as_str());
}
}
}
@ -126,7 +125,7 @@ pub fn walk_and_embed_assets(
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let css_datauri = retrieve_asset(
let (css_dataurl, _) = retrieve_asset(
&href_full_url,
true,
"text/css",
@ -134,9 +133,9 @@ pub fn walk_and_embed_assets(
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear();
attr.value.push_slice(css_datauri.as_str());
attr.value.push_slice(css_dataurl.as_str());
}
}
}
@ -168,7 +167,7 @@ pub fn walk_and_embed_assets(
} else {
let src_full_url: String =
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
let img_datauri = retrieve_asset(
let (img_dataurl, _) = retrieve_asset(
&src_full_url,
true,
"",
@ -176,9 +175,9 @@ pub fn walk_and_embed_assets(
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear();
attr.value.push_slice(img_datauri.as_str());
attr.value.push_slice(img_dataurl.as_str());
}
}
}
@ -201,7 +200,7 @@ pub fn walk_and_embed_assets(
let srcset_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let source_datauri = retrieve_asset(
let (source_dataurl, _) = retrieve_asset(
&srcset_full_url,
true,
"",
@ -209,9 +208,9 @@ pub fn walk_and_embed_assets(
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear();
attr.value.push_slice(source_datauri.as_str());
attr.value.push_slice(source_dataurl.as_str());
}
}
}
@ -247,7 +246,7 @@ pub fn walk_and_embed_assets(
let src_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let js_datauri = retrieve_asset(
let (js_dataurl, _) = retrieve_asset(
&src_full_url,
true,
"application/javascript",
@ -255,9 +254,9 @@ pub fn walk_and_embed_assets(
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear();
attr.value.push_slice(js_datauri.as_str());
attr.value.push_slice(js_dataurl.as_str());
}
}
}
@ -300,7 +299,7 @@ pub fn walk_and_embed_assets(
let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
let iframe_data = retrieve_asset(
let (iframe_data, iframe_final_url) = retrieve_asset(
&src_full_url,
false,
"text/html",
@ -308,10 +307,10 @@ pub fn walk_and_embed_assets(
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or((EMPTY_STRING.clone(), src_full_url));
let dom = html_to_dom(&iframe_data);
walk_and_embed_assets(
&src_full_url,
&iframe_final_url,
&dom.document,
opt_no_css,
opt_no_js,
@ -323,9 +322,9 @@ pub fn walk_and_embed_assets(
);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
let iframe_datauri = data_to_dataurl("text/html", &buf);
let iframe_dataurl = data_to_dataurl("text/html", &buf);
attr.value.clear();
attr.value.push_slice(iframe_datauri.as_str());
attr.value.push_slice(iframe_dataurl.as_str());
}
}
}
@ -344,7 +343,7 @@ pub fn walk_and_embed_assets(
} else {
let poster_full_url: String = resolve_url(&url, &video_poster)
.unwrap_or(EMPTY_STRING.clone());
let img_datauri = retrieve_asset(
let (poster_dataurl, _) = retrieve_asset(
&poster_full_url,
true,
"",
@ -352,9 +351,9 @@ pub fn walk_and_embed_assets(
opt_silent,
opt_insecure,
)
.unwrap_or(poster_full_url);
.unwrap_or((poster_full_url, EMPTY_STRING.clone()));
attr.value.clear();
attr.value.push_slice(img_datauri.as_str());
attr.value.push_slice(poster_dataurl.as_str());
}
}
}

View file

@ -6,26 +6,25 @@ use utils::{data_to_dataurl, is_data_url};
pub fn retrieve_asset(
url: &str,
as_dataurl: bool,
as_mime: &str,
mime: &str,
opt_user_agent: &str,
opt_silent: bool,
opt_insecure: bool,
) -> Result<String, reqwest::Error> {
) -> Result<(String, String), reqwest::Error> {
if is_data_url(&url).unwrap() {
Ok(url.to_string())
Ok((url.to_string(), url.to_string()))
} else {
let client = Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(opt_insecure)
.build()?;
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
let final_url = response.url().as_str();
if !opt_silent {
if url == final_url {
if url == response.url().as_str() {
eprintln!("[ {} ]", &url);
} else {
eprintln!("[ {} -> {} ]", &url, &final_url);
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
}
}
@ -35,19 +34,22 @@ pub fn retrieve_asset(
response.copy_to(&mut data)?;
// Attempt to obtain MIME type by reading the Content-Type header
let mimetype = if as_mime == "" {
let mimetype = if mime == "" {
response
.headers()
.get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok())
.unwrap_or(&as_mime)
.unwrap_or(&mime)
} else {
as_mime
mime
};
Ok(data_to_dataurl(&mimetype, &data))
Ok((
data_to_dataurl(&mimetype, &data),
response.url().to_string(),
))
} else {
Ok(response.text().unwrap())
Ok((response.text().unwrap(), response.url().to_string()))
}
}
}

View file

@ -46,7 +46,7 @@ fn main() {
let opt_user_agent: &str = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
if is_valid_url(arg_target) {
let data = retrieve_asset(
let (data, final_url) = retrieve_asset(
&arg_target,
false,
"",
@ -58,7 +58,7 @@ fn main() {
let dom = html_to_dom(&data);
walk_and_embed_assets(
&arg_target,
&final_url,
&dom.document,
opt_no_css,
opt_no_js,

21
src/tests/http.rs Normal file
View file

@ -0,0 +1,21 @@
use crate::http::retrieve_asset;
/// Checks that `retrieve_asset` hands a data URL back unchanged, both as the
/// returned data and as the reported final URL, with and without an explicit
/// MIME type argument.
#[test]
fn test_retrieve_asset() {
    let data_url = "data:text/html;base64,...";

    // First pass supplies no MIME type; second pass supplies one that differs
    // from the type embedded in the data URL. The expected result is the same.
    for mime in ["", "image/png"].iter() {
        let (data, final_url) =
            retrieve_asset(data_url, true, mime, "", true, false).unwrap();

        assert_eq!(data, data_url);
        assert_eq!(final_url, data_url);
    }
}

View file

@ -1,3 +1,4 @@
mod html;
mod http;
mod js;
mod utils;