Add support for iframes

This commit is contained in:
Vincent Flyson 2019-08-23 20:16:16 -04:00
parent 4775384ca1
commit 2be5b1c235
4 changed files with 64 additions and 28 deletions

View File

@ -1,6 +1,6 @@
[package] [package]
name = "monolith" name = "monolith"
version = "2.0.7" version = "2.0.8"
authors = ["Sunshine <sunshine@uberspace.net>"] authors = ["Sunshine <sunshine@uberspace.net>"]
description = "CLI tool to save webpages as a single HTML file" description = "CLI tool to save webpages as a single HTML file"

View File

@ -1,6 +1,7 @@
use http::{is_valid_url, resolve_url, retrieve_asset}; use http::{is_valid_url, resolve_url, retrieve_asset};
use std::default::Default; use std::default::Default;
use std::io; use std::io;
use utils::data_to_dataurl;
use html5ever::parse_document; use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::rcdom::{Handle, NodeData, RcDom};
@ -14,6 +15,7 @@ enum NodeMatch {
Anchor, Anchor,
Script, Script,
Form, Form,
IFrame,
Other, Other,
} }
@ -85,26 +87,26 @@ pub fn walk_and_embed_assets(
let attrs_mut = &mut attrs.borrow_mut(); let attrs_mut = &mut attrs.borrow_mut();
let mut found = NodeMatch::Other; let mut found = NodeMatch::Other;
if &name.local == "link" { match name.local.as_ref() {
for attr in attrs_mut.iter_mut() { "link" => {
if &attr.name.local == "rel" { for attr in attrs_mut.iter_mut() {
if is_icon(&attr.value.to_string()) { if &attr.name.local == "rel" {
found = NodeMatch::Icon; if is_icon(&attr.value.to_string()) {
break; found = NodeMatch::Icon;
} else if attr.value.to_string() == "stylesheet" { break;
found = NodeMatch::StyleSheet; } else if attr.value.to_string() == "stylesheet" {
break; found = NodeMatch::StyleSheet;
break;
}
} }
} }
} }
} else if &name.local == "img" { "img" => { found = NodeMatch::Image; }
found = NodeMatch::Image; "a" => { found = NodeMatch::Anchor; }
} else if &name.local == "a" { "script" => { found = NodeMatch::Script; }
found = NodeMatch::Anchor; "form" => { found = NodeMatch::Form; }
} else if &name.local == "script" { "iframe" => { found = NodeMatch::IFrame; }
found = NodeMatch::Script; _ => {}
} else if &name.local == "form" {
found = NodeMatch::Form;
} }
match found { match found {
@ -211,6 +213,26 @@ pub fn walk_and_embed_assets(
} }
} }
} }
NodeMatch::IFrame => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let src_full_url = resolve_url(&url, &attr.value.to_string()).unwrap();
let iframe_data = retrieve_asset(
&src_full_url,
false,
"text/html",
opt_user_agent,
);
let dom = html_to_dom(&iframe_data.unwrap());
walk_and_embed_assets(&src_full_url, &dom.document, opt_no_js, opt_no_images, opt_user_agent);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
let iframe_datauri = data_to_dataurl("text/html", &buf);
attr.value.clear();
attr.value.push_slice(iframe_datauri.as_str());
}
}
}
NodeMatch::Other => {} NodeMatch::Other => {}
} }

View File

@ -21,11 +21,11 @@ pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
let mut re = String::new(); let mut re = String::new();
if is_valid_url(from) { if is_valid_url(from) {
// It's a remote resource (HTTP) // It's a remote resource (HTTP)
if to.chars().nth(0) == Some('/') { if to.starts_with('/') {
// (http://site.com/article/1, /...?) // (http://site.com/article/1, /...?)
let from_url = Url::parse(from)?; let from_url = Url::parse(from)?;
if to.chars().nth(1) == Some('/') { if to.starts_with("//") {
// (http://site.com/article/1, //images/1.png) // (http://site.com/article/1, //images/1.png)
re.push_str(from_url.scheme()); re.push_str(from_url.scheme());
re.push_str(":"); re.push_str(":");
@ -40,9 +40,8 @@ pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
} else { } else {
// (http://site.com, css/main.css) // (http://site.com, css/main.css)
// TODO improve to ensure no // or /// ever happen // TODO improve to ensure no // or /// ever happen
re.push_str(from); let base = Url::parse(from)?;
re.push_str("/"); re = base.join(to)?.to_string();
re.push_str(to);
} }
} else { } else {
// It's a local resource (fs) // It's a local resource (fs)
@ -126,13 +125,19 @@ mod tests {
#[test] #[test]
fn test_resolve_url() -> Result<(), ParseError> { fn test_resolve_url() -> Result<(), ParseError> {
let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?; let resolved_url = resolve_url(
"https://www.kernel.org",
"../category/signatures.html",
)?;
assert_eq!( assert_eq!(
resolved_url.as_str(), resolved_url.as_str(),
"https://www.kernel.org/../category/signatures.html" "https://www.kernel.org/category/signatures.html"
); );
let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?; let resolved_url = resolve_url(
"https://www.kernel.org",
"category/signatures.html",
)?;
assert_eq!( assert_eq!(
resolved_url.as_str(), resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html" "https://www.kernel.org/category/signatures.html"
@ -165,6 +170,15 @@ mod tests {
"https://www.kernel.org/theme/images/logos/tux.png" "https://www.kernel.org/theme/images/logos/tux.png"
); );
let resolved_url = resolve_url(
"https://www.w3schools.com/html/html_iframe.asp",
"default.asp",
)?;
assert_eq!(
resolved_url.as_str(),
"https://www.w3schools.com/html/default.asp"
);
Ok(()) Ok(())
} }

View File

@ -28,14 +28,14 @@ fn main() {
// Process the command // Process the command
let arg_target = command.value_of("url").unwrap(); let arg_target = command.value_of("url").unwrap();
let opt_no_js = command.is_present("no-js"); let opt_no_js = command.is_present("no-js");
let opt_no_img = command.is_present("no-images"); let opt_no_images = command.is_present("no-images");
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT); let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
if is_valid_url(arg_target) { if is_valid_url(arg_target) {
let data = retrieve_asset(&arg_target, false, "", opt_user_agent); let data = retrieve_asset(&arg_target, false, "", opt_user_agent);
let dom = html_to_dom(&data.unwrap()); let dom = html_to_dom(&data.unwrap());
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img, opt_user_agent); walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_images, opt_user_agent);
print_dom(&dom.document); print_dom(&dom.document);
println!(); // Ensure newline at end of output println!(); // Ensure newline at end of output