Properly handle 30x redirects
This commit is contained in:
parent
b6ba22513d
commit
0896f2e214
5 changed files with 60 additions and 37 deletions
47
src/html.rs
47
src/html.rs
|
@ -7,7 +7,6 @@ use html5ever::tree_builder::{Attribute, TreeSink};
|
|||
use html5ever::{local_name, namespace_url, ns};
|
||||
use http::retrieve_asset;
|
||||
use js::attr_is_event_handler;
|
||||
use regex::Regex;
|
||||
use std::default::Default;
|
||||
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
|
||||
|
||||
|
@ -103,7 +102,7 @@ pub fn walk_and_embed_assets(
|
|||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let favicon_datauri = retrieve_asset(
|
||||
let (favicon_dataurl, _) = retrieve_asset(
|
||||
&href_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -111,9 +110,9 @@ pub fn walk_and_embed_assets(
|
|||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(favicon_datauri.as_str());
|
||||
attr.value.push_slice(favicon_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -126,7 +125,7 @@ pub fn walk_and_embed_assets(
|
|||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let css_datauri = retrieve_asset(
|
||||
let (css_dataurl, _) = retrieve_asset(
|
||||
&href_full_url,
|
||||
true,
|
||||
"text/css",
|
||||
|
@ -134,9 +133,9 @@ pub fn walk_and_embed_assets(
|
|||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(css_datauri.as_str());
|
||||
attr.value.push_slice(css_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -168,7 +167,7 @@ pub fn walk_and_embed_assets(
|
|||
} else {
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
|
||||
let img_datauri = retrieve_asset(
|
||||
let (img_dataurl, _) = retrieve_asset(
|
||||
&src_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -176,9 +175,9 @@ pub fn walk_and_embed_assets(
|
|||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(img_datauri.as_str());
|
||||
attr.value.push_slice(img_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -201,7 +200,7 @@ pub fn walk_and_embed_assets(
|
|||
let srcset_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let source_datauri = retrieve_asset(
|
||||
let (source_dataurl, _) = retrieve_asset(
|
||||
&srcset_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -209,9 +208,9 @@ pub fn walk_and_embed_assets(
|
|||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(source_datauri.as_str());
|
||||
attr.value.push_slice(source_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -247,7 +246,7 @@ pub fn walk_and_embed_assets(
|
|||
let src_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let js_datauri = retrieve_asset(
|
||||
let (js_dataurl, _) = retrieve_asset(
|
||||
&src_full_url,
|
||||
true,
|
||||
"application/javascript",
|
||||
|
@ -255,9 +254,9 @@ pub fn walk_and_embed_assets(
|
|||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(js_datauri.as_str());
|
||||
attr.value.push_slice(js_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -300,7 +299,7 @@ pub fn walk_and_embed_assets(
|
|||
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
|
||||
let iframe_data = retrieve_asset(
|
||||
let (iframe_data, iframe_final_url) = retrieve_asset(
|
||||
&src_full_url,
|
||||
false,
|
||||
"text/html",
|
||||
|
@ -308,10 +307,10 @@ pub fn walk_and_embed_assets(
|
|||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
.unwrap_or((EMPTY_STRING.clone(), src_full_url));
|
||||
let dom = html_to_dom(&iframe_data);
|
||||
walk_and_embed_assets(
|
||||
&src_full_url,
|
||||
&iframe_final_url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
|
@ -323,9 +322,9 @@ pub fn walk_and_embed_assets(
|
|||
);
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
||||
let iframe_dataurl = data_to_dataurl("text/html", &buf);
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(iframe_datauri.as_str());
|
||||
attr.value.push_slice(iframe_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -344,7 +343,7 @@ pub fn walk_and_embed_assets(
|
|||
} else {
|
||||
let poster_full_url: String = resolve_url(&url, &video_poster)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let img_datauri = retrieve_asset(
|
||||
let (poster_dataurl, _) = retrieve_asset(
|
||||
&poster_full_url,
|
||||
true,
|
||||
"",
|
||||
|
@ -352,9 +351,9 @@ pub fn walk_and_embed_assets(
|
|||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(poster_full_url);
|
||||
.unwrap_or((poster_full_url, EMPTY_STRING.clone()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(img_datauri.as_str());
|
||||
attr.value.push_slice(poster_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
24
src/http.rs
24
src/http.rs
|
@ -6,26 +6,25 @@ use utils::{data_to_dataurl, is_data_url};
|
|||
pub fn retrieve_asset(
|
||||
url: &str,
|
||||
as_dataurl: bool,
|
||||
as_mime: &str,
|
||||
mime: &str,
|
||||
opt_user_agent: &str,
|
||||
opt_silent: bool,
|
||||
opt_insecure: bool,
|
||||
) -> Result<String, reqwest::Error> {
|
||||
) -> Result<(String, String), reqwest::Error> {
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok(url.to_string())
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
} else {
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(opt_insecure)
|
||||
.build()?;
|
||||
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
|
||||
let final_url = response.url().as_str();
|
||||
|
||||
if !opt_silent {
|
||||
if url == final_url {
|
||||
if url == response.url().as_str() {
|
||||
eprintln!("[ {} ]", &url);
|
||||
} else {
|
||||
eprintln!("[ {} -> {} ]", &url, &final_url);
|
||||
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -35,19 +34,22 @@ pub fn retrieve_asset(
|
|||
response.copy_to(&mut data)?;
|
||||
|
||||
// Attempt to obtain MIME type by reading the Content-Type header
|
||||
let mimetype = if as_mime == "" {
|
||||
let mimetype = if mime == "" {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.unwrap_or(&as_mime)
|
||||
.unwrap_or(&mime)
|
||||
} else {
|
||||
as_mime
|
||||
mime
|
||||
};
|
||||
|
||||
Ok(data_to_dataurl(&mimetype, &data))
|
||||
Ok((
|
||||
data_to_dataurl(&mimetype, &data),
|
||||
response.url().to_string(),
|
||||
))
|
||||
} else {
|
||||
Ok(response.text().unwrap())
|
||||
Ok((response.text().unwrap(), response.url().to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,7 +46,7 @@ fn main() {
|
|||
let opt_user_agent: &str = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
||||
|
||||
if is_valid_url(arg_target) {
|
||||
let data = retrieve_asset(
|
||||
let (data, final_url) = retrieve_asset(
|
||||
&arg_target,
|
||||
false,
|
||||
"",
|
||||
|
@ -58,7 +58,7 @@ fn main() {
|
|||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
&arg_target,
|
||||
&final_url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
|
|
21
src/tests/http.rs
Normal file
21
src/tests/http.rs
Normal file
|
@ -0,0 +1,21 @@
|
|||
use crate::http::retrieve_asset;
|
||||
|
||||
// Checks the data-URL short-circuit of `retrieve_asset`: when the input is
// already a data URL, the function returns that same string for both
// elements of the `(data, final_url)` tuple.
#[test]
|
||||
fn test_retrieve_asset() {
|
||||
// Case 1: empty `mime` argument. Positional arguments are
// (url, as_dataurl, mime, opt_user_agent, opt_silent, opt_insecure).
let (data, final_url) =
|
||||
retrieve_asset("data:text/html;base64,...", true, "", "", true, false).unwrap();
|
||||
assert_eq!(&data, "data:text/html;base64,...");
|
||||
assert_eq!(&final_url, "data:text/html;base64,...");
|
||||
|
||||
// Case 2: an explicit "image/png" `mime` argument is passed, and the data
// URL is still expected back unchanged (its own text/html type is kept).
let (data, final_url) = retrieve_asset(
|
||||
"data:text/html;base64,...",
|
||||
true,
|
||||
"image/png",
|
||||
"",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(&data, "data:text/html;base64,...");
|
||||
assert_eq!(&final_url, "data:text/html;base64,...");
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
mod html;
|
||||
mod http;
|
||||
mod js;
|
||||
mod utils;
|
||||
|
|
Loading…
Reference in a new issue