Revamp resolve_url() and improve code format
This commit is contained in:
parent
fdbe2499df
commit
c34d77d5d8
4 changed files with 46 additions and 55 deletions
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "monolith"
|
name = "monolith"
|
||||||
version = "2.0.8"
|
version = "2.0.9"
|
||||||
authors = ["Sunshine <sunshine@uberspace.net>"]
|
authors = ["Sunshine <sunshine@uberspace.net>"]
|
||||||
description = "CLI tool for saving web pages as a single HTML file"
|
description = "CLI tool for saving web pages as a single HTML file"
|
||||||
|
|
||||||
|
|
16
src/html.rs
16
src/html.rs
|
@ -19,7 +19,8 @@ enum NodeMatch {
|
||||||
Other,
|
Other,
|
||||||
}
|
}
|
||||||
|
|
||||||
const PNG_PIXEL: &str = "";
|
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
|
||||||
|
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
|
||||||
|
|
||||||
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
|
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
|
||||||
// Input
|
// Input
|
||||||
|
@ -76,7 +77,8 @@ pub fn walk_and_embed_assets(
|
||||||
NodeData::Comment { .. } => {
|
NodeData::Comment { .. } => {
|
||||||
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
||||||
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
||||||
// since that's not part of W3C standard and gets ignored by browsers other than IE [5, 9]
|
// since that's not part of W3C standard and therefore gets ignored
|
||||||
|
// by browsers other than IE [5, 9]
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeData::Element {
|
NodeData::Element {
|
||||||
|
@ -130,7 +132,7 @@ pub fn walk_and_embed_assets(
|
||||||
if &attr.name.local == "src" {
|
if &attr.name.local == "src" {
|
||||||
if opt_no_images {
|
if opt_no_images {
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(PNG_PIXEL);
|
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||||
} else {
|
} else {
|
||||||
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
||||||
let img_datauri = retrieve_asset(
|
let img_datauri = retrieve_asset(
|
||||||
|
@ -224,7 +226,13 @@ pub fn walk_and_embed_assets(
|
||||||
opt_user_agent,
|
opt_user_agent,
|
||||||
);
|
);
|
||||||
let dom = html_to_dom(&iframe_data.unwrap());
|
let dom = html_to_dom(&iframe_data.unwrap());
|
||||||
walk_and_embed_assets(&src_full_url, &dom.document, opt_no_js, opt_no_images, opt_user_agent);
|
walk_and_embed_assets(
|
||||||
|
&src_full_url,
|
||||||
|
&dom.document,
|
||||||
|
opt_no_js,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
let mut buf: Vec<u8> = Vec::new();
|
let mut buf: Vec<u8> = Vec::new();
|
||||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||||
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
||||||
|
|
68
src/http.rs
68
src/http.rs
|
@ -9,6 +9,13 @@ lazy_static! {
|
||||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn is_data_url(url: &str) -> Result<bool, String> {
|
||||||
|
match Url::parse(url) {
|
||||||
|
Ok(parsed_url) => Ok(parsed_url.scheme() == "data"),
|
||||||
|
Err(err) => Err(format!("{}", err)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn is_valid_url(path: &str) -> bool {
|
pub fn is_valid_url(path: &str) -> bool {
|
||||||
REGEX_URL.is_match(path)
|
REGEX_URL.is_match(path)
|
||||||
}
|
}
|
||||||
|
@ -18,59 +25,19 @@ pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
|
||||||
// (anything, http://site.com/css/main.css)
|
// (anything, http://site.com/css/main.css)
|
||||||
to.to_string()
|
to.to_string()
|
||||||
} else {
|
} else {
|
||||||
let mut re = String::new();
|
Url::parse(from)?.join(to)?.to_string()
|
||||||
if is_valid_url(from) {
|
|
||||||
// It's a remote resource (HTTP)
|
|
||||||
if to.starts_with('/') {
|
|
||||||
// (http://site.com/article/1, /...?)
|
|
||||||
let from_url = Url::parse(from)?;
|
|
||||||
|
|
||||||
if to.starts_with("//") {
|
|
||||||
// (http://site.com/article/1, //images/1.png)
|
|
||||||
re.push_str(from_url.scheme());
|
|
||||||
re.push_str(":");
|
|
||||||
re.push_str(to);
|
|
||||||
} else {
|
|
||||||
// (http://site.com/article/1, /css/main.css)
|
|
||||||
re.push_str(from_url.scheme());
|
|
||||||
re.push_str("://");
|
|
||||||
re.push_str(from_url.host_str().unwrap());
|
|
||||||
re.push_str(to);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// (http://site.com, css/main.css)
|
|
||||||
// TODO improve to ensure no // or /// ever happen
|
|
||||||
let base = Url::parse(from)?;
|
|
||||||
re = base.join(to)?.to_string();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// It's a local resource (fs)
|
|
||||||
// TODO improve to ensure no // or /// ever happen
|
|
||||||
// TODO for fs use basepath instead of $from
|
|
||||||
re.push_str(from);
|
|
||||||
re.push_str("/");
|
|
||||||
re.push_str(to);
|
|
||||||
}
|
|
||||||
re
|
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn url_is_data(url: &str) -> Result<bool, String> {
|
|
||||||
match Url::parse(url) {
|
|
||||||
Ok(parsed_url) => Ok(parsed_url.scheme() == "data"),
|
|
||||||
Err(err) => Err(format!("{}", err)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn retrieve_asset(
|
pub fn retrieve_asset(
|
||||||
url: &str,
|
url: &str,
|
||||||
as_dataurl: bool,
|
as_dataurl: bool,
|
||||||
as_mime: &str,
|
as_mime: &str,
|
||||||
opt_user_agent: &str,
|
opt_user_agent: &str,
|
||||||
) -> Result<String, reqwest::Error> {
|
) -> Result<String, reqwest::Error> {
|
||||||
if url_is_data(&url).unwrap() {
|
if is_data_url(&url).unwrap() {
|
||||||
Ok(url.to_string())
|
Ok(url.to_string())
|
||||||
} else {
|
} else {
|
||||||
let client = Client::builder()
|
let client = Client::builder()
|
||||||
|
@ -161,6 +128,15 @@ mod tests {
|
||||||
"https://www.kernel.org/theme/images/logos/tux.png"
|
"https://www.kernel.org/theme/images/logos/tux.png"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let resolved_url = resolve_url(
|
||||||
|
"https://www.kernel.org",
|
||||||
|
"//another-host.org/theme/images/logos/tux.png",
|
||||||
|
)?;
|
||||||
|
assert_eq!(
|
||||||
|
resolved_url.as_str(),
|
||||||
|
"https://another-host.org/theme/images/logos/tux.png"
|
||||||
|
);
|
||||||
|
|
||||||
let resolved_url = resolve_url(
|
let resolved_url = resolve_url(
|
||||||
"https://www.kernel.org/category/signatures.html",
|
"https://www.kernel.org/category/signatures.html",
|
||||||
"/theme/images/logos/tux.png",
|
"/theme/images/logos/tux.png",
|
||||||
|
@ -183,12 +159,12 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_url_is_data() {
|
fn test_is_data_url() {
|
||||||
assert!(
|
assert!(
|
||||||
url_is_data("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
|
is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
|
||||||
.unwrap_or(false)
|
.unwrap_or(false)
|
||||||
);
|
);
|
||||||
assert!(!url_is_data("https://kernel.org").unwrap_or(false));
|
assert!(!is_data_url("https://kernel.org").unwrap_or(false));
|
||||||
assert!(!url_is_data("//kernel.org").unwrap_or(false));
|
assert!(!is_data_url("//kernel.org").unwrap_or(false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
15
src/main.rs
15
src/main.rs
|
@ -6,7 +6,8 @@ use clap::{App, Arg};
|
||||||
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
|
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
|
||||||
use monolith::http::{is_valid_url, retrieve_asset};
|
use monolith::http::{is_valid_url, retrieve_asset};
|
||||||
|
|
||||||
static DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
|
static DEFAULT_USER_AGENT: &str =
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let command = App::new("monolith")
|
let command = App::new("monolith")
|
||||||
|
@ -32,10 +33,16 @@ fn main() {
|
||||||
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
||||||
|
|
||||||
if is_valid_url(arg_target) {
|
if is_valid_url(arg_target) {
|
||||||
let data = retrieve_asset(&arg_target, false, "", opt_user_agent);
|
let data = retrieve_asset(&arg_target, false, "", opt_user_agent).unwrap();
|
||||||
let dom = html_to_dom(&data.unwrap());
|
let dom = html_to_dom(&data);
|
||||||
|
|
||||||
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_images, opt_user_agent);
|
walk_and_embed_assets(
|
||||||
|
&arg_target,
|
||||||
|
&dom.document,
|
||||||
|
opt_no_js,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
|
|
||||||
print_dom(&dom.document);
|
print_dom(&dom.document);
|
||||||
println!(); // Ensure newline at end of output
|
println!(); // Ensure newline at end of output
|
||||||
|
|
Loading…
Reference in a new issue