Properly handle 30x redirects
This commit is contained in:
parent
b6ba22513d
commit
0896f2e214
5 changed files with 60 additions and 37 deletions
47
src/html.rs
47
src/html.rs
|
@ -7,7 +7,6 @@ use html5ever::tree_builder::{Attribute, TreeSink};
|
||||||
use html5ever::{local_name, namespace_url, ns};
|
use html5ever::{local_name, namespace_url, ns};
|
||||||
use http::retrieve_asset;
|
use http::retrieve_asset;
|
||||||
use js::attr_is_event_handler;
|
use js::attr_is_event_handler;
|
||||||
use regex::Regex;
|
|
||||||
use std::default::Default;
|
use std::default::Default;
|
||||||
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
|
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
|
||||||
|
|
||||||
|
@ -103,7 +102,7 @@ pub fn walk_and_embed_assets(
|
||||||
let href_full_url: String =
|
let href_full_url: String =
|
||||||
resolve_url(&url, &attr.value.to_string())
|
resolve_url(&url, &attr.value.to_string())
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let favicon_datauri = retrieve_asset(
|
let (favicon_dataurl, _) = retrieve_asset(
|
||||||
&href_full_url,
|
&href_full_url,
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
|
@ -111,9 +110,9 @@ pub fn walk_and_embed_assets(
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
)
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(favicon_datauri.as_str());
|
attr.value.push_slice(favicon_dataurl.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -126,7 +125,7 @@ pub fn walk_and_embed_assets(
|
||||||
let href_full_url: String =
|
let href_full_url: String =
|
||||||
resolve_url(&url, &attr.value.to_string())
|
resolve_url(&url, &attr.value.to_string())
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let css_datauri = retrieve_asset(
|
let (css_dataurl, _) = retrieve_asset(
|
||||||
&href_full_url,
|
&href_full_url,
|
||||||
true,
|
true,
|
||||||
"text/css",
|
"text/css",
|
||||||
|
@ -134,9 +133,9 @@ pub fn walk_and_embed_assets(
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
)
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(css_datauri.as_str());
|
attr.value.push_slice(css_dataurl.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -168,7 +167,7 @@ pub fn walk_and_embed_assets(
|
||||||
} else {
|
} else {
|
||||||
let src_full_url: String =
|
let src_full_url: String =
|
||||||
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
|
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
|
||||||
let img_datauri = retrieve_asset(
|
let (img_dataurl, _) = retrieve_asset(
|
||||||
&src_full_url,
|
&src_full_url,
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
|
@ -176,9 +175,9 @@ pub fn walk_and_embed_assets(
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
)
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(img_datauri.as_str());
|
attr.value.push_slice(img_dataurl.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -201,7 +200,7 @@ pub fn walk_and_embed_assets(
|
||||||
let srcset_full_url: String =
|
let srcset_full_url: String =
|
||||||
resolve_url(&url, &attr.value.to_string())
|
resolve_url(&url, &attr.value.to_string())
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let source_datauri = retrieve_asset(
|
let (source_dataurl, _) = retrieve_asset(
|
||||||
&srcset_full_url,
|
&srcset_full_url,
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
|
@ -209,9 +208,9 @@ pub fn walk_and_embed_assets(
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
)
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(source_datauri.as_str());
|
attr.value.push_slice(source_dataurl.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -247,7 +246,7 @@ pub fn walk_and_embed_assets(
|
||||||
let src_full_url: String =
|
let src_full_url: String =
|
||||||
resolve_url(&url, &attr.value.to_string())
|
resolve_url(&url, &attr.value.to_string())
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let js_datauri = retrieve_asset(
|
let (js_dataurl, _) = retrieve_asset(
|
||||||
&src_full_url,
|
&src_full_url,
|
||||||
true,
|
true,
|
||||||
"application/javascript",
|
"application/javascript",
|
||||||
|
@ -255,9 +254,9 @@ pub fn walk_and_embed_assets(
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
)
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(js_datauri.as_str());
|
attr.value.push_slice(js_dataurl.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -300,7 +299,7 @@ pub fn walk_and_embed_assets(
|
||||||
|
|
||||||
let src_full_url: String =
|
let src_full_url: String =
|
||||||
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
|
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
|
||||||
let iframe_data = retrieve_asset(
|
let (iframe_data, iframe_final_url) = retrieve_asset(
|
||||||
&src_full_url,
|
&src_full_url,
|
||||||
false,
|
false,
|
||||||
"text/html",
|
"text/html",
|
||||||
|
@ -308,10 +307,10 @@ pub fn walk_and_embed_assets(
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
)
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or((EMPTY_STRING.clone(), src_full_url));
|
||||||
let dom = html_to_dom(&iframe_data);
|
let dom = html_to_dom(&iframe_data);
|
||||||
walk_and_embed_assets(
|
walk_and_embed_assets(
|
||||||
&src_full_url,
|
&iframe_final_url,
|
||||||
&dom.document,
|
&dom.document,
|
||||||
opt_no_css,
|
opt_no_css,
|
||||||
opt_no_js,
|
opt_no_js,
|
||||||
|
@ -323,9 +322,9 @@ pub fn walk_and_embed_assets(
|
||||||
);
|
);
|
||||||
let mut buf: Vec<u8> = Vec::new();
|
let mut buf: Vec<u8> = Vec::new();
|
||||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||||
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
let iframe_dataurl = data_to_dataurl("text/html", &buf);
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(iframe_datauri.as_str());
|
attr.value.push_slice(iframe_dataurl.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -344,7 +343,7 @@ pub fn walk_and_embed_assets(
|
||||||
} else {
|
} else {
|
||||||
let poster_full_url: String = resolve_url(&url, &video_poster)
|
let poster_full_url: String = resolve_url(&url, &video_poster)
|
||||||
.unwrap_or(EMPTY_STRING.clone());
|
.unwrap_or(EMPTY_STRING.clone());
|
||||||
let img_datauri = retrieve_asset(
|
let (poster_dataurl, _) = retrieve_asset(
|
||||||
&poster_full_url,
|
&poster_full_url,
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
|
@ -352,9 +351,9 @@ pub fn walk_and_embed_assets(
|
||||||
opt_silent,
|
opt_silent,
|
||||||
opt_insecure,
|
opt_insecure,
|
||||||
)
|
)
|
||||||
.unwrap_or(poster_full_url);
|
.unwrap_or((poster_full_url, EMPTY_STRING.clone()));
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(img_datauri.as_str());
|
attr.value.push_slice(poster_dataurl.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
24
src/http.rs
24
src/http.rs
|
@ -6,26 +6,25 @@ use utils::{data_to_dataurl, is_data_url};
|
||||||
pub fn retrieve_asset(
|
pub fn retrieve_asset(
|
||||||
url: &str,
|
url: &str,
|
||||||
as_dataurl: bool,
|
as_dataurl: bool,
|
||||||
as_mime: &str,
|
mime: &str,
|
||||||
opt_user_agent: &str,
|
opt_user_agent: &str,
|
||||||
opt_silent: bool,
|
opt_silent: bool,
|
||||||
opt_insecure: bool,
|
opt_insecure: bool,
|
||||||
) -> Result<String, reqwest::Error> {
|
) -> Result<(String, String), reqwest::Error> {
|
||||||
if is_data_url(&url).unwrap() {
|
if is_data_url(&url).unwrap() {
|
||||||
Ok(url.to_string())
|
Ok((url.to_string(), url.to_string()))
|
||||||
} else {
|
} else {
|
||||||
let client = Client::builder()
|
let client = Client::builder()
|
||||||
.timeout(Duration::from_secs(10))
|
.timeout(Duration::from_secs(10))
|
||||||
.danger_accept_invalid_certs(opt_insecure)
|
.danger_accept_invalid_certs(opt_insecure)
|
||||||
.build()?;
|
.build()?;
|
||||||
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
|
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
|
||||||
let final_url = response.url().as_str();
|
|
||||||
|
|
||||||
if !opt_silent {
|
if !opt_silent {
|
||||||
if url == final_url {
|
if url == response.url().as_str() {
|
||||||
eprintln!("[ {} ]", &url);
|
eprintln!("[ {} ]", &url);
|
||||||
} else {
|
} else {
|
||||||
eprintln!("[ {} -> {} ]", &url, &final_url);
|
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -35,19 +34,22 @@ pub fn retrieve_asset(
|
||||||
response.copy_to(&mut data)?;
|
response.copy_to(&mut data)?;
|
||||||
|
|
||||||
// Attempt to obtain MIME type by reading the Content-Type header
|
// Attempt to obtain MIME type by reading the Content-Type header
|
||||||
let mimetype = if as_mime == "" {
|
let mimetype = if mime == "" {
|
||||||
response
|
response
|
||||||
.headers()
|
.headers()
|
||||||
.get(CONTENT_TYPE)
|
.get(CONTENT_TYPE)
|
||||||
.and_then(|header| header.to_str().ok())
|
.and_then(|header| header.to_str().ok())
|
||||||
.unwrap_or(&as_mime)
|
.unwrap_or(&mime)
|
||||||
} else {
|
} else {
|
||||||
as_mime
|
mime
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(data_to_dataurl(&mimetype, &data))
|
Ok((
|
||||||
|
data_to_dataurl(&mimetype, &data),
|
||||||
|
response.url().to_string(),
|
||||||
|
))
|
||||||
} else {
|
} else {
|
||||||
Ok(response.text().unwrap())
|
Ok((response.text().unwrap(), response.url().to_string()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,7 +46,7 @@ fn main() {
|
||||||
let opt_user_agent: &str = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
let opt_user_agent: &str = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
||||||
|
|
||||||
if is_valid_url(arg_target) {
|
if is_valid_url(arg_target) {
|
||||||
let data = retrieve_asset(
|
let (data, final_url) = retrieve_asset(
|
||||||
&arg_target,
|
&arg_target,
|
||||||
false,
|
false,
|
||||||
"",
|
"",
|
||||||
|
@ -58,7 +58,7 @@ fn main() {
|
||||||
let dom = html_to_dom(&data);
|
let dom = html_to_dom(&data);
|
||||||
|
|
||||||
walk_and_embed_assets(
|
walk_and_embed_assets(
|
||||||
&arg_target,
|
&final_url,
|
||||||
&dom.document,
|
&dom.document,
|
||||||
opt_no_css,
|
opt_no_css,
|
||||||
opt_no_js,
|
opt_no_js,
|
||||||
|
|
21
src/tests/http.rs
Normal file
21
src/tests/http.rs
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
use crate::http::retrieve_asset;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_retrieve_asset() {
|
||||||
|
let (data, final_url) =
|
||||||
|
retrieve_asset("data:text/html;base64,...", true, "", "", true, false).unwrap();
|
||||||
|
assert_eq!(&data, "data:text/html;base64,...");
|
||||||
|
assert_eq!(&final_url, "data:text/html;base64,...");
|
||||||
|
|
||||||
|
let (data, final_url) = retrieve_asset(
|
||||||
|
"data:text/html;base64,...",
|
||||||
|
true,
|
||||||
|
"image/png",
|
||||||
|
"",
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(&data, "data:text/html;base64,...");
|
||||||
|
assert_eq!(&final_url, "data:text/html;base64,...");
|
||||||
|
}
|
|
@ -1,3 +1,4 @@
|
||||||
mod html;
|
mod html;
|
||||||
|
mod http;
|
||||||
mod js;
|
mod js;
|
||||||
mod utils;
|
mod utils;
|
||||||
|
|
Loading…
Add table
Reference in a new issue