commit
47abda066a
4 changed files with 64 additions and 28 deletions
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "monolith"
|
name = "monolith"
|
||||||
version = "2.0.7"
|
version = "2.0.8"
|
||||||
authors = ["Sunshine <sunshine@uberspace.net>"]
|
authors = ["Sunshine <sunshine@uberspace.net>"]
|
||||||
description = "CLI tool to save webpages as a single HTML file"
|
description = "CLI tool to save webpages as a single HTML file"
|
||||||
|
|
||||||
|
|
40
src/html.rs
40
src/html.rs
|
@ -1,6 +1,7 @@
|
||||||
use http::{is_valid_url, resolve_url, retrieve_asset};
|
use http::{is_valid_url, resolve_url, retrieve_asset};
|
||||||
use std::default::Default;
|
use std::default::Default;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
use utils::data_to_dataurl;
|
||||||
|
|
||||||
use html5ever::parse_document;
|
use html5ever::parse_document;
|
||||||
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
||||||
|
@ -14,6 +15,7 @@ enum NodeMatch {
|
||||||
Anchor,
|
Anchor,
|
||||||
Script,
|
Script,
|
||||||
Form,
|
Form,
|
||||||
|
IFrame,
|
||||||
Other,
|
Other,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -85,7 +87,8 @@ pub fn walk_and_embed_assets(
|
||||||
let attrs_mut = &mut attrs.borrow_mut();
|
let attrs_mut = &mut attrs.borrow_mut();
|
||||||
let mut found = NodeMatch::Other;
|
let mut found = NodeMatch::Other;
|
||||||
|
|
||||||
if &name.local == "link" {
|
match name.local.as_ref() {
|
||||||
|
"link" => {
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "rel" {
|
if &attr.name.local == "rel" {
|
||||||
if is_icon(&attr.value.to_string()) {
|
if is_icon(&attr.value.to_string()) {
|
||||||
|
@ -97,14 +100,13 @@ pub fn walk_and_embed_assets(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if &name.local == "img" {
|
}
|
||||||
found = NodeMatch::Image;
|
"img" => { found = NodeMatch::Image; }
|
||||||
} else if &name.local == "a" {
|
"a" => { found = NodeMatch::Anchor; }
|
||||||
found = NodeMatch::Anchor;
|
"script" => { found = NodeMatch::Script; }
|
||||||
} else if &name.local == "script" {
|
"form" => { found = NodeMatch::Form; }
|
||||||
found = NodeMatch::Script;
|
"iframe" => { found = NodeMatch::IFrame; }
|
||||||
} else if &name.local == "form" {
|
_ => {}
|
||||||
found = NodeMatch::Form;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
match found {
|
match found {
|
||||||
|
@ -211,6 +213,26 @@ pub fn walk_and_embed_assets(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
NodeMatch::IFrame => {
|
||||||
|
for attr in attrs_mut.iter_mut() {
|
||||||
|
if &attr.name.local == "src" {
|
||||||
|
let src_full_url = resolve_url(&url, &attr.value.to_string()).unwrap();
|
||||||
|
let iframe_data = retrieve_asset(
|
||||||
|
&src_full_url,
|
||||||
|
false,
|
||||||
|
"text/html",
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
|
let dom = html_to_dom(&iframe_data.unwrap());
|
||||||
|
walk_and_embed_assets(&src_full_url, &dom.document, opt_no_js, opt_no_images, opt_user_agent);
|
||||||
|
let mut buf: Vec<u8> = Vec::new();
|
||||||
|
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||||
|
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
||||||
|
attr.value.clear();
|
||||||
|
attr.value.push_slice(iframe_datauri.as_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
NodeMatch::Other => {}
|
NodeMatch::Other => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
30
src/http.rs
30
src/http.rs
|
@ -21,11 +21,11 @@ pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
|
||||||
let mut re = String::new();
|
let mut re = String::new();
|
||||||
if is_valid_url(from) {
|
if is_valid_url(from) {
|
||||||
// It's a remote resource (HTTP)
|
// It's a remote resource (HTTP)
|
||||||
if to.chars().nth(0) == Some('/') {
|
if to.starts_with('/') {
|
||||||
// (http://site.com/article/1, /...?)
|
// (http://site.com/article/1, /...?)
|
||||||
let from_url = Url::parse(from)?;
|
let from_url = Url::parse(from)?;
|
||||||
|
|
||||||
if to.chars().nth(1) == Some('/') {
|
if to.starts_with("//") {
|
||||||
// (http://site.com/article/1, //images/1.png)
|
// (http://site.com/article/1, //images/1.png)
|
||||||
re.push_str(from_url.scheme());
|
re.push_str(from_url.scheme());
|
||||||
re.push_str(":");
|
re.push_str(":");
|
||||||
|
@ -40,9 +40,8 @@ pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
|
||||||
} else {
|
} else {
|
||||||
// (http://site.com, css/main.css)
|
// (http://site.com, css/main.css)
|
||||||
// TODO improve to ensure no // or /// ever happen
|
// TODO improve to ensure no // or /// ever happen
|
||||||
re.push_str(from);
|
let base = Url::parse(from)?;
|
||||||
re.push_str("/");
|
re = base.join(to)?.to_string();
|
||||||
re.push_str(to);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// It's a local resource (fs)
|
// It's a local resource (fs)
|
||||||
|
@ -126,13 +125,19 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_resolve_url() -> Result<(), ParseError> {
|
fn test_resolve_url() -> Result<(), ParseError> {
|
||||||
let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?;
|
let resolved_url = resolve_url(
|
||||||
|
"https://www.kernel.org",
|
||||||
|
"../category/signatures.html",
|
||||||
|
)?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
resolved_url.as_str(),
|
resolved_url.as_str(),
|
||||||
"https://www.kernel.org/../category/signatures.html"
|
"https://www.kernel.org/category/signatures.html"
|
||||||
);
|
);
|
||||||
|
|
||||||
let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?;
|
let resolved_url = resolve_url(
|
||||||
|
"https://www.kernel.org",
|
||||||
|
"category/signatures.html",
|
||||||
|
)?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
resolved_url.as_str(),
|
resolved_url.as_str(),
|
||||||
"https://www.kernel.org/category/signatures.html"
|
"https://www.kernel.org/category/signatures.html"
|
||||||
|
@ -165,6 +170,15 @@ mod tests {
|
||||||
"https://www.kernel.org/theme/images/logos/tux.png"
|
"https://www.kernel.org/theme/images/logos/tux.png"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let resolved_url = resolve_url(
|
||||||
|
"https://www.w3schools.com/html/html_iframe.asp",
|
||||||
|
"default.asp",
|
||||||
|
)?;
|
||||||
|
assert_eq!(
|
||||||
|
resolved_url.as_str(),
|
||||||
|
"https://www.w3schools.com/html/default.asp"
|
||||||
|
);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,14 +28,14 @@ fn main() {
|
||||||
// Process the command
|
// Process the command
|
||||||
let arg_target = command.value_of("url").unwrap();
|
let arg_target = command.value_of("url").unwrap();
|
||||||
let opt_no_js = command.is_present("no-js");
|
let opt_no_js = command.is_present("no-js");
|
||||||
let opt_no_img = command.is_present("no-images");
|
let opt_no_images = command.is_present("no-images");
|
||||||
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
||||||
|
|
||||||
if is_valid_url(arg_target) {
|
if is_valid_url(arg_target) {
|
||||||
let data = retrieve_asset(&arg_target, false, "", opt_user_agent);
|
let data = retrieve_asset(&arg_target, false, "", opt_user_agent);
|
||||||
let dom = html_to_dom(&data.unwrap());
|
let dom = html_to_dom(&data.unwrap());
|
||||||
|
|
||||||
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img, opt_user_agent);
|
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_images, opt_user_agent);
|
||||||
|
|
||||||
print_dom(&dom.document);
|
print_dom(&dom.document);
|
||||||
println!(); // Ensure newline at end of output
|
println!(); // Ensure newline at end of output
|
||||||
|
|
Loading…
Reference in a new issue