account for network errors, add option to ignore them
This commit is contained in:
parent
052f8f49ec
commit
38a6f963ad
@ -54,9 +54,10 @@ The guide can be found [here](docs/containers.md)
|
|||||||
---------------------------------------------------
|
---------------------------------------------------
|
||||||
|
|
||||||
## Options
|
## Options
|
||||||
- `-c`: Ignore styles
|
- `-c`: Exclude CSS
|
||||||
- `-f`: Exclude frames
|
- `-e`: Ignore network errors
|
||||||
- `-F`: Omit web fonts
|
- `-f`: Omit frames
|
||||||
|
- `-F`: Exclude web fonts
|
||||||
- `-i`: Remove images
|
- `-i`: Remove images
|
||||||
- `-I`: Isolate the document
|
- `-I`: Isolate the document
|
||||||
- `-j`: Exclude JavaScript
|
- `-j`: Exclude JavaScript
|
||||||
|
16
src/css.rs
16
src/css.rs
@ -176,7 +176,7 @@ pub fn process_css<'a>(
|
|||||||
client,
|
client,
|
||||||
&parent_url,
|
&parent_url,
|
||||||
&import_full_url,
|
&import_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((import_contents, import_final_url, _import_media_type)) => {
|
Ok((import_contents, import_final_url, _import_media_type)) => {
|
||||||
@ -227,7 +227,7 @@ pub fn process_css<'a>(
|
|||||||
client,
|
client,
|
||||||
&parent_url,
|
&parent_url,
|
||||||
&resolved_url,
|
&resolved_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((data, final_url, media_type)) => {
|
Ok((data, final_url, media_type)) => {
|
||||||
@ -315,14 +315,8 @@ pub fn process_css<'a>(
|
|||||||
if is_import {
|
if is_import {
|
||||||
let full_url = resolve_url(&parent_url, value).unwrap_or_default();
|
let full_url = resolve_url(&parent_url, value).unwrap_or_default();
|
||||||
let url_fragment = get_url_fragment(full_url.clone());
|
let url_fragment = get_url_fragment(full_url.clone());
|
||||||
match retrieve_asset(
|
match retrieve_asset(cache, client, &parent_url, &full_url, options, depth + 1)
|
||||||
cache,
|
{
|
||||||
client,
|
|
||||||
&parent_url,
|
|
||||||
&full_url,
|
|
||||||
options.silent,
|
|
||||||
depth + 1,
|
|
||||||
) {
|
|
||||||
Ok((css, final_url, _media_type)) => {
|
Ok((css, final_url, _media_type)) => {
|
||||||
let data_url = data_to_data_url(
|
let data_url = data_to_data_url(
|
||||||
"text/css",
|
"text/css",
|
||||||
@ -361,7 +355,7 @@ pub fn process_css<'a>(
|
|||||||
client,
|
client,
|
||||||
&parent_url,
|
&parent_url,
|
||||||
&full_url,
|
&full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((data, final_url, media_type)) => {
|
Ok((data, final_url, media_type)) => {
|
||||||
|
22
src/html.rs
22
src/html.rs
@ -148,7 +148,7 @@ pub fn embed_srcset(
|
|||||||
client,
|
client,
|
||||||
&parent_url,
|
&parent_url,
|
||||||
&image_full_url,
|
&image_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((image_data, image_final_url, image_media_type)) => {
|
Ok((image_data, image_final_url, image_media_type)) => {
|
||||||
@ -539,7 +539,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&link_href_full_url,
|
&link_href_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((
|
Ok((
|
||||||
@ -612,7 +612,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&link_href_full_url,
|
&link_href_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((
|
Ok((
|
||||||
@ -723,7 +723,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&background_full_url,
|
&background_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((background_data, background_final_url, background_media_type)) => {
|
Ok((background_data, background_final_url, background_media_type)) => {
|
||||||
@ -807,7 +807,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&img_full_url,
|
&img_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((img_data, img_final_url, img_media_type)) => {
|
Ok((img_data, img_final_url, img_media_type)) => {
|
||||||
@ -898,7 +898,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&input_image_full_url,
|
&input_image_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((
|
Ok((
|
||||||
@ -961,7 +961,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&image_full_url,
|
&image_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((image_data, image_final_url, image_media_type)) => {
|
Ok((image_data, image_final_url, image_media_type)) => {
|
||||||
@ -1020,7 +1020,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&srcset_full_url,
|
&srcset_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((srcset_data, srcset_final_url, srcset_media_type)) => {
|
Ok((srcset_data, srcset_final_url, srcset_media_type)) => {
|
||||||
@ -1103,7 +1103,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&script_full_url,
|
&script_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((script_data, script_final_url, _script_media_type)) => {
|
Ok((script_data, script_final_url, _script_media_type)) => {
|
||||||
@ -1196,7 +1196,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&frame_full_url,
|
&frame_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((frame_data, frame_final_url, frame_media_type)) => {
|
Ok((frame_data, frame_final_url, frame_media_type)) => {
|
||||||
@ -1269,7 +1269,7 @@ pub fn walk_and_embed_assets(
|
|||||||
client,
|
client,
|
||||||
&url,
|
&url,
|
||||||
&video_poster_full_url,
|
&video_poster_full_url,
|
||||||
options.silent,
|
options,
|
||||||
depth + 1,
|
depth + 1,
|
||||||
) {
|
) {
|
||||||
Ok((
|
Ok((
|
||||||
|
15
src/main.rs
15
src/main.rs
@ -113,20 +113,15 @@ fn main() {
|
|||||||
|
|
||||||
// Retrieve target document
|
// Retrieve target document
|
||||||
if is_file_url(target_url) || is_http_url(target_url) {
|
if is_file_url(target_url) || is_http_url(target_url) {
|
||||||
match retrieve_asset(
|
match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) {
|
||||||
&mut cache,
|
|
||||||
&client,
|
|
||||||
target_url,
|
|
||||||
target_url,
|
|
||||||
options.silent,
|
|
||||||
0,
|
|
||||||
) {
|
|
||||||
Ok((data, final_url, _media_type)) => {
|
Ok((data, final_url, _media_type)) => {
|
||||||
base_url = final_url;
|
base_url = final_url;
|
||||||
dom = html_to_dom(&String::from_utf8_lossy(&data));
|
dom = html_to_dom(&String::from_utf8_lossy(&data));
|
||||||
}
|
}
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
eprintln!("Could not retrieve target document");
|
if !options.silent {
|
||||||
|
eprintln!("Could not retrieve target document");
|
||||||
|
}
|
||||||
process::exit(1);
|
process::exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -159,7 +154,7 @@ fn main() {
|
|||||||
&client,
|
&client,
|
||||||
&base_url,
|
&base_url,
|
||||||
&favicon_ico_url,
|
&favicon_ico_url,
|
||||||
options.silent,
|
&options,
|
||||||
0,
|
0,
|
||||||
) {
|
) {
|
||||||
Ok((data, final_url, media_type)) => {
|
Ok((data, final_url, media_type)) => {
|
||||||
|
@ -4,6 +4,7 @@ use clap::{App, Arg};
|
|||||||
pub struct Options {
|
pub struct Options {
|
||||||
pub target: String,
|
pub target: String,
|
||||||
pub no_css: bool,
|
pub no_css: bool,
|
||||||
|
pub ignore_errors: bool,
|
||||||
pub no_fonts: bool,
|
pub no_fonts: bool,
|
||||||
pub no_frames: bool,
|
pub no_frames: bool,
|
||||||
pub no_images: bool,
|
pub no_images: bool,
|
||||||
@ -45,6 +46,7 @@ impl Options {
|
|||||||
)
|
)
|
||||||
// .args_from_usage("-a, --no-audio 'Removes audio sources'")
|
// .args_from_usage("-a, --no-audio 'Removes audio sources'")
|
||||||
.args_from_usage("-c, --no-css 'Removes CSS'")
|
.args_from_usage("-c, --no-css 'Removes CSS'")
|
||||||
|
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
|
||||||
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
|
||||||
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
.args_from_usage("-F, --no-fonts 'Removes fonts'")
|
||||||
.args_from_usage("-i, --no-images 'Removes images'")
|
.args_from_usage("-i, --no-images 'Removes images'")
|
||||||
@ -66,6 +68,7 @@ impl Options {
|
|||||||
.expect("please set target")
|
.expect("please set target")
|
||||||
.to_string();
|
.to_string();
|
||||||
options.no_css = app.is_present("no-css");
|
options.no_css = app.is_present("no-css");
|
||||||
|
options.ignore_errors = app.is_present("ignore-errors");
|
||||||
options.no_frames = app.is_present("no-frames");
|
options.no_frames = app.is_present("no-frames");
|
||||||
options.no_fonts = app.is_present("no-fonts");
|
options.no_fonts = app.is_present("no-fonts");
|
||||||
options.no_images = app.is_present("no-images");
|
options.no_images = app.is_present("no-images");
|
||||||
|
@ -11,6 +11,7 @@ mod passing {
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::env;
|
use std::env;
|
||||||
|
|
||||||
|
use crate::opts::Options;
|
||||||
use crate::url;
|
use crate::url;
|
||||||
use crate::utils;
|
use crate::utils;
|
||||||
|
|
||||||
@ -19,6 +20,9 @@ mod passing {
|
|||||||
let cache = &mut HashMap::new();
|
let cache = &mut HashMap::new();
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
|
|
||||||
|
let mut options = Options::default();
|
||||||
|
options.silent = true;
|
||||||
|
|
||||||
// If both source and target are data URLs,
|
// If both source and target are data URLs,
|
||||||
// ensure the result contains target data URL
|
// ensure the result contains target data URL
|
||||||
let (data, final_url, media_type) = utils::retrieve_asset(
|
let (data, final_url, media_type) = utils::retrieve_asset(
|
||||||
@ -26,7 +30,7 @@ mod passing {
|
|||||||
&client,
|
&client,
|
||||||
"data:text/html;base64,c291cmNl",
|
"data:text/html;base64,c291cmNl",
|
||||||
"data:text/html;base64,dGFyZ2V0",
|
"data:text/html;base64,dGFyZ2V0",
|
||||||
false,
|
&options,
|
||||||
0,
|
0,
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@ -46,6 +50,9 @@ mod passing {
|
|||||||
let cache = &mut HashMap::new();
|
let cache = &mut HashMap::new();
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
|
|
||||||
|
let mut options = Options::default();
|
||||||
|
options.silent = true;
|
||||||
|
|
||||||
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
|
||||||
|
|
||||||
// Inclusion of local assets from local sources should be allowed
|
// Inclusion of local assets from local sources should be allowed
|
||||||
@ -63,7 +70,7 @@ mod passing {
|
|||||||
file = file_url_protocol,
|
file = file_url_protocol,
|
||||||
cwd = cwd.to_str().unwrap()
|
cwd = cwd.to_str().unwrap()
|
||||||
),
|
),
|
||||||
false,
|
&options,
|
||||||
0,
|
0,
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@ -91,6 +98,7 @@ mod failing {
|
|||||||
use reqwest::blocking::Client;
|
use reqwest::blocking::Client;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use crate::opts::Options;
|
||||||
use crate::utils;
|
use crate::utils;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -98,13 +106,16 @@ mod failing {
|
|||||||
let cache = &mut HashMap::new();
|
let cache = &mut HashMap::new();
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
|
|
||||||
|
let mut options = Options::default();
|
||||||
|
options.silent = true;
|
||||||
|
|
||||||
// Inclusion of local assets from data URL sources should not be allowed
|
// Inclusion of local assets from data URL sources should not be allowed
|
||||||
match utils::retrieve_asset(
|
match utils::retrieve_asset(
|
||||||
cache,
|
cache,
|
||||||
&client,
|
&client,
|
||||||
"data:text/html;base64,SoUrCe",
|
"data:text/html;base64,SoUrCe",
|
||||||
"file:///etc/passwd",
|
"file:///etc/passwd",
|
||||||
false,
|
&options,
|
||||||
0,
|
0,
|
||||||
) {
|
) {
|
||||||
Ok((..)) => {
|
Ok((..)) => {
|
||||||
@ -121,13 +132,16 @@ mod failing {
|
|||||||
let cache = &mut HashMap::new();
|
let cache = &mut HashMap::new();
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
|
|
||||||
|
let mut options = Options::default();
|
||||||
|
options.silent = true;
|
||||||
|
|
||||||
// Inclusion of local assets from remote sources should not be allowed
|
// Inclusion of local assets from remote sources should not be allowed
|
||||||
match utils::retrieve_asset(
|
match utils::retrieve_asset(
|
||||||
cache,
|
cache,
|
||||||
&client,
|
&client,
|
||||||
"https://kernel.org/",
|
"https://kernel.org/",
|
||||||
"file:///etc/passwd",
|
"file:///etc/passwd",
|
||||||
false,
|
&options,
|
||||||
0,
|
0,
|
||||||
) {
|
) {
|
||||||
Ok((..)) => {
|
Ok((..)) => {
|
||||||
|
69
src/utils.rs
69
src/utils.rs
@ -4,6 +4,7 @@ use std::collections::HashMap;
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
|
use crate::opts::Options;
|
||||||
use crate::url::{clean_url, data_url_to_data, file_url_to_fs_path, is_data_url, is_file_url};
|
use crate::url::{clean_url, data_url_to_data, file_url_to_fs_path, is_data_url, is_file_url};
|
||||||
|
|
||||||
const INDENT: &str = " ";
|
const INDENT: &str = " ";
|
||||||
@ -73,7 +74,7 @@ pub fn retrieve_asset(
|
|||||||
client: &Client,
|
client: &Client,
|
||||||
parent_url: &str,
|
parent_url: &str,
|
||||||
url: &str,
|
url: &str,
|
||||||
opt_silent: bool,
|
options: &Options,
|
||||||
depth: u32,
|
depth: u32,
|
||||||
) -> Result<(Vec<u8>, String, String), reqwest::Error> {
|
) -> Result<(Vec<u8>, String, String), reqwest::Error> {
|
||||||
if url.len() == 0 {
|
if url.len() == 0 {
|
||||||
@ -95,7 +96,7 @@ pub fn retrieve_asset(
|
|||||||
let fs_file_path: String = file_url_to_fs_path(url);
|
let fs_file_path: String = file_url_to_fs_path(url);
|
||||||
let path = Path::new(&fs_file_path);
|
let path = Path::new(&fs_file_path);
|
||||||
if path.exists() {
|
if path.exists() {
|
||||||
if !opt_silent {
|
if !options.silent {
|
||||||
eprintln!("{}{}", indent(depth).as_str(), &url);
|
eprintln!("{}{}", indent(depth).as_str(), &url);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,7 +110,7 @@ pub fn retrieve_asset(
|
|||||||
|
|
||||||
if cache.contains_key(&cache_key) {
|
if cache.contains_key(&cache_key) {
|
||||||
// URL is in cache, we get and return it
|
// URL is in cache, we get and return it
|
||||||
if !opt_silent {
|
if !options.silent {
|
||||||
eprintln!("{}{} (from cache)", indent(depth).as_str(), &url);
|
eprintln!("{}{} (from cache)", indent(depth).as_str(), &url);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,34 +121,46 @@ pub fn retrieve_asset(
|
|||||||
))
|
))
|
||||||
} else {
|
} else {
|
||||||
// URL not in cache, we retrieve the file
|
// URL not in cache, we retrieve the file
|
||||||
let mut response = client.get(url).send()?;
|
match client.get(url).send() {
|
||||||
let res_url = response.url().to_string();
|
Ok(mut response) => {
|
||||||
|
if !options.ignore_errors && response.status() != 200 {
|
||||||
|
if !options.silent {
|
||||||
|
eprintln!("Unable to retrieve {} ({})", &url, response.status());
|
||||||
|
}
|
||||||
|
// Provoke error
|
||||||
|
return Err(client.get("").send().unwrap_err());
|
||||||
|
}
|
||||||
|
|
||||||
if !opt_silent {
|
let res_url = response.url().to_string();
|
||||||
if url == res_url {
|
|
||||||
eprintln!("{}{}", indent(depth).as_str(), &url);
|
if !options.silent {
|
||||||
} else {
|
if url == res_url {
|
||||||
eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &res_url);
|
eprintln!("{}{}", indent(depth).as_str(), &url);
|
||||||
|
} else {
|
||||||
|
eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &res_url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let new_cache_key: String = clean_url(&res_url);
|
||||||
|
|
||||||
|
// Convert response into a byte array
|
||||||
|
let mut data: Vec<u8> = vec![];
|
||||||
|
response.copy_to(&mut data)?;
|
||||||
|
|
||||||
|
// Attempt to obtain media type by reading the Content-Type header
|
||||||
|
let media_type = response
|
||||||
|
.headers()
|
||||||
|
.get(CONTENT_TYPE)
|
||||||
|
.and_then(|header| header.to_str().ok())
|
||||||
|
.unwrap_or("");
|
||||||
|
|
||||||
|
// Add retrieved resource to cache
|
||||||
|
cache.insert(new_cache_key, data.clone());
|
||||||
|
|
||||||
|
Ok((data, res_url, media_type.to_string()))
|
||||||
}
|
}
|
||||||
|
Err(error) => Err(error),
|
||||||
}
|
}
|
||||||
|
|
||||||
let new_cache_key: String = clean_url(&res_url);
|
|
||||||
|
|
||||||
// Convert response into a byte array
|
|
||||||
let mut data: Vec<u8> = vec![];
|
|
||||||
response.copy_to(&mut data)?;
|
|
||||||
|
|
||||||
// Attempt to obtain media type by reading the Content-Type header
|
|
||||||
let media_type = response
|
|
||||||
.headers()
|
|
||||||
.get(CONTENT_TYPE)
|
|
||||||
.and_then(|header| header.to_str().ok())
|
|
||||||
.unwrap_or("");
|
|
||||||
|
|
||||||
// Add to cache
|
|
||||||
cache.insert(new_cache_key, data.clone());
|
|
||||||
|
|
||||||
Ok((data, res_url, media_type.to_string()))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user