account for network errors, add option to ignore them

This commit is contained in:
Sunshine 2020-11-22 16:49:26 -10:00
parent 052f8f49ec
commit 38a6f963ad
No known key found for this signature in database
GPG key ID: B80CA68703CD8AB1
7 changed files with 87 additions and 67 deletions

View file

@ -54,9 +54,10 @@ The guide can be found [here](docs/containers.md)
---------------------------------------------------
## Options
- `-c`: Ignore styles
- `-f`: Exclude frames
- `-F`: Omit web fonts
- `-c`: Exclude CSS
- `-e`: Ignore network errors
- `-f`: Omit frames
- `-F`: Exclude web fonts
- `-i`: Remove images
- `-I`: Isolate the document
- `-j`: Exclude JavaScript

View file

@ -176,7 +176,7 @@ pub fn process_css<'a>(
client,
&parent_url,
&import_full_url,
options.silent,
options,
depth + 1,
) {
Ok((import_contents, import_final_url, _import_media_type)) => {
@ -227,7 +227,7 @@ pub fn process_css<'a>(
client,
&parent_url,
&resolved_url,
options.silent,
options,
depth + 1,
) {
Ok((data, final_url, media_type)) => {
@ -315,14 +315,8 @@ pub fn process_css<'a>(
if is_import {
let full_url = resolve_url(&parent_url, value).unwrap_or_default();
let url_fragment = get_url_fragment(full_url.clone());
match retrieve_asset(
cache,
client,
&parent_url,
&full_url,
options.silent,
depth + 1,
) {
match retrieve_asset(cache, client, &parent_url, &full_url, options, depth + 1)
{
Ok((css, final_url, _media_type)) => {
let data_url = data_to_data_url(
"text/css",
@ -361,7 +355,7 @@ pub fn process_css<'a>(
client,
&parent_url,
&full_url,
options.silent,
options,
depth + 1,
) {
Ok((data, final_url, media_type)) => {

View file

@ -148,7 +148,7 @@ pub fn embed_srcset(
client,
&parent_url,
&image_full_url,
options.silent,
options,
depth + 1,
) {
Ok((image_data, image_final_url, image_media_type)) => {
@ -539,7 +539,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&link_href_full_url,
options.silent,
options,
depth + 1,
) {
Ok((
@ -612,7 +612,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&link_href_full_url,
options.silent,
options,
depth + 1,
) {
Ok((
@ -723,7 +723,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&background_full_url,
options.silent,
options,
depth + 1,
) {
Ok((background_data, background_final_url, background_media_type)) => {
@ -807,7 +807,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&img_full_url,
options.silent,
options,
depth + 1,
) {
Ok((img_data, img_final_url, img_media_type)) => {
@ -898,7 +898,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&input_image_full_url,
options.silent,
options,
depth + 1,
) {
Ok((
@ -961,7 +961,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&image_full_url,
options.silent,
options,
depth + 1,
) {
Ok((image_data, image_final_url, image_media_type)) => {
@ -1020,7 +1020,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&srcset_full_url,
options.silent,
options,
depth + 1,
) {
Ok((srcset_data, srcset_final_url, srcset_media_type)) => {
@ -1103,7 +1103,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&script_full_url,
options.silent,
options,
depth + 1,
) {
Ok((script_data, script_final_url, _script_media_type)) => {
@ -1196,7 +1196,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&frame_full_url,
options.silent,
options,
depth + 1,
) {
Ok((frame_data, frame_final_url, frame_media_type)) => {
@ -1269,7 +1269,7 @@ pub fn walk_and_embed_assets(
client,
&url,
&video_poster_full_url,
options.silent,
options,
depth + 1,
) {
Ok((

View file

@ -113,20 +113,15 @@ fn main() {
// Retrieve target document
if is_file_url(target_url) || is_http_url(target_url) {
match retrieve_asset(
&mut cache,
&client,
target_url,
target_url,
options.silent,
0,
) {
match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) {
Ok((data, final_url, _media_type)) => {
base_url = final_url;
dom = html_to_dom(&String::from_utf8_lossy(&data));
}
Err(_) => {
eprintln!("Could not retrieve target document");
if !options.silent {
eprintln!("Could not retrieve target document");
}
process::exit(1);
}
}
@ -159,7 +154,7 @@ fn main() {
&client,
&base_url,
&favicon_ico_url,
options.silent,
&options,
0,
) {
Ok((data, final_url, media_type)) => {

View file

@ -4,6 +4,7 @@ use clap::{App, Arg};
pub struct Options {
pub target: String,
pub no_css: bool,
pub ignore_errors: bool,
pub no_fonts: bool,
pub no_frames: bool,
pub no_images: bool,
@ -45,6 +46,7 @@ impl Options {
)
// .args_from_usage("-a, --no-audio 'Removes audio sources'")
.args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
.args_from_usage("-F, --no-fonts 'Removes fonts'")
.args_from_usage("-i, --no-images 'Removes images'")
@ -66,6 +68,7 @@ impl Options {
.expect("please set target")
.to_string();
options.no_css = app.is_present("no-css");
options.ignore_errors = app.is_present("ignore-errors");
options.no_frames = app.is_present("no-frames");
options.no_fonts = app.is_present("no-fonts");
options.no_images = app.is_present("no-images");

View file

@ -11,6 +11,7 @@ mod passing {
use std::collections::HashMap;
use std::env;
use crate::opts::Options;
use crate::url;
use crate::utils;
@ -19,6 +20,9 @@ mod passing {
let cache = &mut HashMap::new();
let client = Client::new();
let mut options = Options::default();
options.silent = true;
// If both source and target are data URLs,
// ensure the result contains target data URL
let (data, final_url, media_type) = utils::retrieve_asset(
@ -26,7 +30,7 @@ mod passing {
&client,
"data:text/html;base64,c291cmNl",
"data:text/html;base64,dGFyZ2V0",
false,
&options,
0,
)
.unwrap();
@ -46,6 +50,9 @@ mod passing {
let cache = &mut HashMap::new();
let client = Client::new();
let mut options = Options::default();
options.silent = true;
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
// Inclusion of local assets from local sources should be allowed
@ -63,7 +70,7 @@ mod passing {
file = file_url_protocol,
cwd = cwd.to_str().unwrap()
),
false,
&options,
0,
)
.unwrap();
@ -91,6 +98,7 @@ mod failing {
use reqwest::blocking::Client;
use std::collections::HashMap;
use crate::opts::Options;
use crate::utils;
#[test]
@ -98,13 +106,16 @@ mod failing {
let cache = &mut HashMap::new();
let client = Client::new();
let mut options = Options::default();
options.silent = true;
// Inclusion of local assets from data URL sources should not be allowed
match utils::retrieve_asset(
cache,
&client,
"data:text/html;base64,SoUrCe",
"file:///etc/passwd",
false,
&options,
0,
) {
Ok((..)) => {
@ -121,13 +132,16 @@ mod failing {
let cache = &mut HashMap::new();
let client = Client::new();
let mut options = Options::default();
options.silent = true;
// Inclusion of local assets from remote sources should not be allowed
match utils::retrieve_asset(
cache,
&client,
"https://kernel.org/",
"file:///etc/passwd",
false,
&options,
0,
) {
Ok((..)) => {

View file

@ -4,6 +4,7 @@ use std::collections::HashMap;
use std::fs;
use std::path::Path;
use crate::opts::Options;
use crate::url::{clean_url, data_url_to_data, file_url_to_fs_path, is_data_url, is_file_url};
const INDENT: &str = " ";
@ -73,7 +74,7 @@ pub fn retrieve_asset(
client: &Client,
parent_url: &str,
url: &str,
opt_silent: bool,
options: &Options,
depth: u32,
) -> Result<(Vec<u8>, String, String), reqwest::Error> {
if url.len() == 0 {
@ -95,7 +96,7 @@ pub fn retrieve_asset(
let fs_file_path: String = file_url_to_fs_path(url);
let path = Path::new(&fs_file_path);
if path.exists() {
if !opt_silent {
if !options.silent {
eprintln!("{}{}", indent(depth).as_str(), &url);
}
@ -109,7 +110,7 @@ pub fn retrieve_asset(
if cache.contains_key(&cache_key) {
// URL is in cache, we get and return it
if !opt_silent {
if !options.silent {
eprintln!("{}{} (from cache)", indent(depth).as_str(), &url);
}
@ -120,34 +121,46 @@ pub fn retrieve_asset(
))
} else {
// URL not in cache, we retrieve the file
let mut response = client.get(url).send()?;
let res_url = response.url().to_string();
match client.get(url).send() {
Ok(mut response) => {
if !options.ignore_errors && response.status() != 200 {
if !options.silent {
eprintln!("Unable to retrieve {} ({})", &url, response.status());
}
// reqwest::Error has no public constructor, so provoke one by requesting an empty (invalid) URL
return Err(client.get("").send().unwrap_err());
}
if !opt_silent {
if url == res_url {
eprintln!("{}{}", indent(depth).as_str(), &url);
} else {
eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &res_url);
let res_url = response.url().to_string();
if !options.silent {
if url == res_url {
eprintln!("{}{}", indent(depth).as_str(), &url);
} else {
eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &res_url);
}
}
let new_cache_key: String = clean_url(&res_url);
// Convert response into a byte array
let mut data: Vec<u8> = vec![];
response.copy_to(&mut data)?;
// Attempt to obtain media type by reading the Content-Type header
let media_type = response
.headers()
.get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok())
.unwrap_or("");
// Add retrieved resource to cache
cache.insert(new_cache_key, data.clone());
Ok((data, res_url, media_type.to_string()))
}
Err(error) => Err(error),
}
let new_cache_key: String = clean_url(&res_url);
// Convert response into a byte array
let mut data: Vec<u8> = vec![];
response.copy_to(&mut data)?;
// Attempt to obtain media type by reading the Content-Type header
let media_type = response
.headers()
.get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok())
.unwrap_or("");
// Add to cache
cache.insert(new_cache_key, data.clone());
Ok((data, res_url, media_type.to_string()))
}
}
}