Merge pull request #65 from Y2Z/redirects

Properly handle 30x redirects
This commit is contained in:
Sunshine 2019-10-01 07:25:35 -04:00 committed by GitHub
commit dbacd76103
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 60 additions and 37 deletions

View File

@ -7,7 +7,6 @@ use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns}; use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset; use http::retrieve_asset;
use js::attr_is_event_handler; use js::attr_is_event_handler;
use regex::Regex;
use std::default::Default; use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol}; use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
@ -103,7 +102,7 @@ pub fn walk_and_embed_assets(
let href_full_url: String = let href_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let favicon_datauri = retrieve_asset( let (favicon_dataurl, _) = retrieve_asset(
&href_full_url, &href_full_url,
true, true,
"", "",
@ -111,9 +110,9 @@ pub fn walk_and_embed_assets(
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(favicon_datauri.as_str()); attr.value.push_slice(favicon_dataurl.as_str());
} }
} }
} }
@ -126,7 +125,7 @@ pub fn walk_and_embed_assets(
let href_full_url: String = let href_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let css_datauri = retrieve_asset( let (css_dataurl, _) = retrieve_asset(
&href_full_url, &href_full_url,
true, true,
"text/css", "text/css",
@ -134,9 +133,9 @@ pub fn walk_and_embed_assets(
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(css_datauri.as_str()); attr.value.push_slice(css_dataurl.as_str());
} }
} }
} }
@ -168,7 +167,7 @@ pub fn walk_and_embed_assets(
} else { } else {
let src_full_url: String = let src_full_url: String =
resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone());
let img_datauri = retrieve_asset( let (img_dataurl, _) = retrieve_asset(
&src_full_url, &src_full_url,
true, true,
"", "",
@ -176,9 +175,9 @@ pub fn walk_and_embed_assets(
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(img_datauri.as_str()); attr.value.push_slice(img_dataurl.as_str());
} }
} }
} }
@ -201,7 +200,7 @@ pub fn walk_and_embed_assets(
let srcset_full_url: String = let srcset_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let source_datauri = retrieve_asset( let (source_dataurl, _) = retrieve_asset(
&srcset_full_url, &srcset_full_url,
true, true,
"", "",
@ -209,9 +208,9 @@ pub fn walk_and_embed_assets(
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(source_datauri.as_str()); attr.value.push_slice(source_dataurl.as_str());
} }
} }
} }
@ -247,7 +246,7 @@ pub fn walk_and_embed_assets(
let src_full_url: String = let src_full_url: String =
resolve_url(&url, &attr.value.to_string()) resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let js_datauri = retrieve_asset( let (js_dataurl, _) = retrieve_asset(
&src_full_url, &src_full_url,
true, true,
"application/javascript", "application/javascript",
@ -255,9 +254,9 @@ pub fn walk_and_embed_assets(
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(js_datauri.as_str()); attr.value.push_slice(js_dataurl.as_str());
} }
} }
} }
@ -300,7 +299,7 @@ pub fn walk_and_embed_assets(
let src_full_url: String = let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
let iframe_data = retrieve_asset( let (iframe_data, iframe_final_url) = retrieve_asset(
&src_full_url, &src_full_url,
false, false,
"text/html", "text/html",
@ -308,10 +307,10 @@ pub fn walk_and_embed_assets(
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or((EMPTY_STRING.clone(), src_full_url));
let dom = html_to_dom(&iframe_data); let dom = html_to_dom(&iframe_data);
walk_and_embed_assets( walk_and_embed_assets(
&src_full_url, &iframe_final_url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,
@ -323,9 +322,9 @@ pub fn walk_and_embed_assets(
); );
let mut buf: Vec<u8> = Vec::new(); let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
let iframe_datauri = data_to_dataurl("text/html", &buf); let iframe_dataurl = data_to_dataurl("text/html", &buf);
attr.value.clear(); attr.value.clear();
attr.value.push_slice(iframe_datauri.as_str()); attr.value.push_slice(iframe_dataurl.as_str());
} }
} }
} }
@ -344,7 +343,7 @@ pub fn walk_and_embed_assets(
} else { } else {
let poster_full_url: String = resolve_url(&url, &video_poster) let poster_full_url: String = resolve_url(&url, &video_poster)
.unwrap_or(EMPTY_STRING.clone()); .unwrap_or(EMPTY_STRING.clone());
let img_datauri = retrieve_asset( let (poster_dataurl, _) = retrieve_asset(
&poster_full_url, &poster_full_url,
true, true,
"", "",
@ -352,9 +351,9 @@ pub fn walk_and_embed_assets(
opt_silent, opt_silent,
opt_insecure, opt_insecure,
) )
.unwrap_or(poster_full_url); .unwrap_or((poster_full_url, EMPTY_STRING.clone()));
attr.value.clear(); attr.value.clear();
attr.value.push_slice(img_datauri.as_str()); attr.value.push_slice(poster_dataurl.as_str());
} }
} }
} }

View File

@ -6,26 +6,25 @@ use utils::{data_to_dataurl, is_data_url};
pub fn retrieve_asset( pub fn retrieve_asset(
url: &str, url: &str,
as_dataurl: bool, as_dataurl: bool,
as_mime: &str, mime: &str,
opt_user_agent: &str, opt_user_agent: &str,
opt_silent: bool, opt_silent: bool,
opt_insecure: bool, opt_insecure: bool,
) -> Result<String, reqwest::Error> { ) -> Result<(String, String), reqwest::Error> {
if is_data_url(&url).unwrap() { if is_data_url(&url).unwrap() {
Ok(url.to_string()) Ok((url.to_string(), url.to_string()))
} else { } else {
let client = Client::builder() let client = Client::builder()
.timeout(Duration::from_secs(10)) .timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(opt_insecure) .danger_accept_invalid_certs(opt_insecure)
.build()?; .build()?;
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?; let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
let final_url = response.url().as_str();
if !opt_silent { if !opt_silent {
if url == final_url { if url == response.url().as_str() {
eprintln!("[ {} ]", &url); eprintln!("[ {} ]", &url);
} else { } else {
eprintln!("[ {} -> {} ]", &url, &final_url); eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
} }
} }
@ -35,19 +34,22 @@ pub fn retrieve_asset(
response.copy_to(&mut data)?; response.copy_to(&mut data)?;
// Attempt to obtain MIME type by reading the Content-Type header // Attempt to obtain MIME type by reading the Content-Type header
let mimetype = if as_mime == "" { let mimetype = if mime == "" {
response response
.headers() .headers()
.get(CONTENT_TYPE) .get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok()) .and_then(|header| header.to_str().ok())
.unwrap_or(&as_mime) .unwrap_or(&mime)
} else { } else {
as_mime mime
}; };
Ok(data_to_dataurl(&mimetype, &data)) Ok((
data_to_dataurl(&mimetype, &data),
response.url().to_string(),
))
} else { } else {
Ok(response.text().unwrap()) Ok((response.text().unwrap(), response.url().to_string()))
} }
} }
} }

View File

@ -46,7 +46,7 @@ fn main() {
let opt_user_agent: &str = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT); let opt_user_agent: &str = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
if is_valid_url(arg_target) { if is_valid_url(arg_target) {
let data = retrieve_asset( let (data, final_url) = retrieve_asset(
&arg_target, &arg_target,
false, false,
"", "",
@ -58,7 +58,7 @@ fn main() {
let dom = html_to_dom(&data); let dom = html_to_dom(&data);
walk_and_embed_assets( walk_and_embed_assets(
&arg_target, &final_url,
&dom.document, &dom.document,
opt_no_css, opt_no_css,
opt_no_js, opt_no_js,

21
src/tests/http.rs Normal file
View File

@ -0,0 +1,21 @@
use crate::http::retrieve_asset;
/// Checks that `retrieve_asset` passes a `data:` URL through untouched.
///
/// For a `data:` URL input, both elements of the returned tuple (the asset
/// data and the final URL) must equal the input string, whether or not a
/// MIME type hint is supplied.
#[test]
fn test_retrieve_asset() {
    const DATA_URL: &str = "data:text/html;base64,...";

    // First pass: no MIME hint; second pass: a hint that must not affect
    // the pass-through result.
    for mime_hint in &["", "image/png"] {
        let (payload, resolved_url) =
            retrieve_asset(DATA_URL, true, *mime_hint, "", true, false).unwrap();

        assert_eq!(payload.as_str(), DATA_URL);
        assert_eq!(resolved_url.as_str(), DATA_URL);
    }
}

View File

@ -1,3 +1,4 @@
mod html; mod html;
mod http;
mod js; mod js;
mod utils; mod utils;