From 13429e32d3e011ea1b7220c7e52c9a6ace5e3a4a Mon Sep 17 00:00:00 2001 From: Vincent Flyson Date: Fri, 23 Aug 2019 16:00:05 -0400 Subject: [PATCH 1/6] Allow HTTP redirects and preserve email links --- Cargo.toml | 2 +- src/html.rs | 4 ++-- src/http.rs | 3 ++- src/main.rs | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ca1b409..c0ae12f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "monolith" -version = "2.0.5" +version = "2.0.6" authors = ["Sunshine "] description = "CLI tool to save webpages as a single HTML file" diff --git a/src/html.rs b/src/html.rs index 63c1d7d..0da1aaf 100644 --- a/src/html.rs +++ b/src/html.rs @@ -146,8 +146,8 @@ pub fn walk_and_embed_assets( NodeMatch::Anchor => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { - // Do not touch hrefs which begin with a hash sign - if attr.value.to_string().chars().nth(0) == Some('#') { + // Don't touch email links or hrefs which begin with a hash sign + if attr.value.starts_with('#') || attr.value.starts_with("mailto:") { continue; } diff --git a/src/http.rs b/src/http.rs index 19e064d..b18da0c 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,6 +1,6 @@ use regex::Regex; use reqwest::header::{CONTENT_TYPE, USER_AGENT}; -use reqwest::Client; +use reqwest::{Client, RedirectPolicy}; use std::time::Duration; use url::{ParseError, Url}; use utils::data_to_dataurl; @@ -75,6 +75,7 @@ pub fn retrieve_asset( Ok(url.to_string()) } else { let client = Client::builder() + .redirect(RedirectPolicy::limited(3)) .timeout(Duration::from_secs(10)) .build() .unwrap(); diff --git a/src/main.rs b/src/main.rs index cfb4872..7988f2f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,7 +22,7 @@ fn main() { ) .args_from_usage("-j, --no-js 'Excludes JavaScript'") .args_from_usage("-i, --no-images 'Removes images'") - .args_from_usage("-u, --user-agent= 'Custom User-Agent string'") + .args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'") .get_matches(); // Process the command From c0fffbb21235f11b1947e11e66cb69ca6c3c5421 Mon Sep 17 00:00:00 2001 From: Vincent Flyson Date: Fri, 23 Aug 2019 18:48:08 -0400 Subject: [PATCH 2/6] Get rid of mime-sniffer dependency --- Cargo.toml | 5 ++-- src/utils.rs | 64 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c0ae12f..11b9328 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "monolith" -version = "2.0.6" +version = "2.0.7" authors = ["Sunshine "] description = "CLI tool to save webpages as a single HTML file" @@ -9,8 +9,7 @@ base64 = "0.10.1" clap = "2.33.0" html5ever = "0.24.0" indicatif = "0.11.0" -mime-sniffer = "0.1.2" +lazy_static = "1.3.0" regex = "1.2.1" reqwest = "0.9.20" url = "2.1.0" -lazy_static = "1.3.0" diff --git a/src/utils.rs b/src/utils.rs index f459a02..2f91337 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,8 +1,31 @@ extern crate base64; -extern crate mime_sniffer; use self::base64::encode; -use self::mime_sniffer::MimeTypeSniffer; + +static MAGIC: [[&[u8]; 2]; 19] = [ + // Image + [b"GIF87a", b"image/gif"], + [b"GIF89a", b"image/gif"], + [b"\xFF\xD8\xFF", b"image/jpeg"], + [b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"], + [b" String { let mimetype = if mime == "" { @@ -14,7 +37,16 @@ pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String { } fn detect_mimetype(data: &[u8]) -> String { - data.sniff_mime_type().unwrap_or("").to_string() + let mut re = String::new(); + + for item in MAGIC.iter() { + if data.starts_with(item[0]) { + re = String::from_utf8(item[1].to_vec()).unwrap(); + break; + } + } + + re } #[cfg(test)] @@ -31,4 +63,30 @@ mod tests { "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" ); } + + #[test] + fn test_detect_mimetype() { + // Image + assert_eq!(detect_mimetype(b"GIF87a"), "image/gif"); + assert_eq!(detect_mimetype(b"GIF89a"), "image/gif"); + assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg"); + assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png"); + assert_eq!(detect_mimetype(b" Date: Fri, 23 Aug 2019 20:16:16 -0400 Subject: [PATCH 3/6] Add support for iframes --- Cargo.toml | 2 +- src/html.rs | 56 +++++++++++++++++++++++++++++++++++++---------------- src/http.rs | 30 ++++++++++++++++++++-------- src/main.rs | 4 ++-- 4 files changed, 64 insertions(+), 28 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 11b9328..8335adf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "monolith" -version = "2.0.7" +version = "2.0.8" authors = ["Sunshine "] description = "CLI tool to save webpages as a single HTML file" diff --git a/src/html.rs b/src/html.rs index 0da1aaf..100cddd 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,6 +1,7 @@ use http::{is_valid_url, resolve_url, retrieve_asset}; use std::default::Default; use std::io; +use utils::data_to_dataurl; use html5ever::parse_document; use html5ever::rcdom::{Handle, NodeData, RcDom}; @@ -14,6 +15,7 @@ enum NodeMatch { Anchor, Script, Form, + IFrame, Other, } @@ -85,26 +87,26 @@ pub fn walk_and_embed_assets( let attrs_mut = &mut attrs.borrow_mut(); let mut found = NodeMatch::Other; - if &name.local == "link" { - for attr in attrs_mut.iter_mut() { - if &attr.name.local == "rel" { - if is_icon(&attr.value.to_string()) { - found = NodeMatch::Icon; - break; - } else if attr.value.to_string() == "stylesheet" { - found = NodeMatch::StyleSheet; - break; + match name.local.as_ref() { + "link" => { + for attr in attrs_mut.iter_mut() { + if &attr.name.local == "rel" { + if is_icon(&attr.value.to_string()) { + found = NodeMatch::Icon; + break; + } else if attr.value.to_string() == "stylesheet" { + found = NodeMatch::StyleSheet; + break; + } } } } - } else if &name.local == "img" { - found = NodeMatch::Image; - } else if &name.local == "a" { - found = NodeMatch::Anchor; - } else if &name.local == "script" { - found = NodeMatch::Script; - } else if &name.local == "form" { - found = NodeMatch::Form; + "img" => { found = NodeMatch::Image; } + "a" => { found = NodeMatch::Anchor; } + "script" => { found = NodeMatch::Script; } + "form" => { found = NodeMatch::Form; } + "iframe" => { found = NodeMatch::IFrame; } + _ => {} } match found { @@ -211,6 +213,26 @@ pub fn walk_and_embed_assets( } } } + NodeMatch::IFrame => { + for attr in attrs_mut.iter_mut() { + if &attr.name.local == "src" { + let src_full_url = resolve_url(&url, &attr.value.to_string()).unwrap(); + let iframe_data = retrieve_asset( + &src_full_url, + false, + "text/html", + opt_user_agent, + ); + let dom = html_to_dom(&iframe_data.unwrap()); + walk_and_embed_assets(&src_full_url, &dom.document, opt_no_js, opt_no_images, opt_user_agent); + let mut buf: Vec = Vec::new(); + serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + let iframe_datauri = data_to_dataurl("text/html", &buf); + attr.value.clear(); + attr.value.push_slice(iframe_datauri.as_str()); + } + } + } NodeMatch::Other => {} } diff --git a/src/http.rs b/src/http.rs index b18da0c..1765f14 100644 --- a/src/http.rs +++ b/src/http.rs @@ -21,11 +21,11 @@ pub fn resolve_url(from: &str, to: &str) -> Result { let mut re = String::new(); if is_valid_url(from) { // It's a remote resource (HTTP) - if to.chars().nth(0) == Some('/') { + if to.starts_with('/') { // (http://site.com/article/1, /...?) let from_url = Url::parse(from)?; - if to.chars().nth(1) == Some('/') { + if to.starts_with("//") { // (http://site.com/article/1, //images/1.png) re.push_str(from_url.scheme()); re.push_str(":"); @@ -40,9 +40,8 @@ pub fn resolve_url(from: &str, to: &str) -> Result { } else { // (http://site.com, css/main.css) // TODO improve to ensure no // or /// ever happen - re.push_str(from); - re.push_str("/"); - re.push_str(to); + let base = Url::parse(from)?; + re = base.join(to)?.to_string(); } } else { // It's a local resource (fs) @@ -126,13 +125,19 @@ mod tests { #[test] fn test_resolve_url() -> Result<(), ParseError> { - let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?; + let resolved_url = resolve_url( + "https://www.kernel.org", + "../category/signatures.html", + )?; assert_eq!( resolved_url.as_str(), - "https://www.kernel.org/../category/signatures.html" + "https://www.kernel.org/category/signatures.html" ); - let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?; + let resolved_url = resolve_url( + "https://www.kernel.org", + "category/signatures.html", + )?; assert_eq!( resolved_url.as_str(), "https://www.kernel.org/category/signatures.html" @@ -165,6 +170,15 @@ mod tests { "https://www.kernel.org/theme/images/logos/tux.png" ); + let resolved_url = resolve_url( + "https://www.w3schools.com/html/html_iframe.asp", + "default.asp", + )?; + assert_eq!( + resolved_url.as_str(), + "https://www.w3schools.com/html/default.asp" + ); + Ok(()) } diff --git a/src/main.rs b/src/main.rs index 7988f2f..3e45f97 100644 --- a/src/main.rs +++ b/src/main.rs @@ -28,14 +28,14 @@ fn main() { // Process the command let arg_target = command.value_of("url").unwrap(); let opt_no_js = command.is_present("no-js"); - let opt_no_img = command.is_present("no-images"); + let opt_no_images = command.is_present("no-images"); let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT); if is_valid_url(arg_target) { let data = retrieve_asset(&arg_target, false, "", opt_user_agent); let dom = html_to_dom(&data.unwrap()); - walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img, opt_user_agent); + walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_images, opt_user_agent); print_dom(&dom.document); println!(); // Ensure newline at end of output From 9138485c6100f2bdb5c6665b7127493483e1fca2 Mon Sep 17 00:00:00 2001 From: Vincent Flyson Date: Fri, 23 Aug 2019 20:19:49 -0400 Subject: [PATCH 4/6] Update Cargo.toml --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8335adf..14a9cb3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "monolith" version = "2.0.8" authors = ["Sunshine "] -description = "CLI tool to save webpages as a single HTML file" +description = "CLI tool for saving web pages as a single HTML file" [dependencies] base64 = "0.10.1" From 27f1411decdbadb8a483905b64839f9d153e1e49 Mon Sep 17 00:00:00 2001 From: Alan Pope Date: Fri, 23 Aug 2019 17:41:55 -0700 Subject: [PATCH 5/6] Add snapcraft yaml Adds support for building snap of monolith --- snap/snapcraft.yaml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 snap/snapcraft.yaml diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml new file mode 100644 index 0000000..8daf922 --- /dev/null +++ b/snap/snapcraft.yaml @@ -0,0 +1,34 @@ +name: monolith +base: core18 +version: git +summary: Monolith - Save HTML pages with ease +description: | + A data hoarder's dream come true: bundle any web page into a single + HTML file. You can finally replace that gazillion of open tabs with + a gazillion of .html files stored somewhere on your precious little + drive. + Unlike conventional "Save page as…", monolith not only saves the + target document, it embeds CSS, image, and JavaScript assets all + at once, producing a single HTML5 document that is a joy to store + and share. + If compared to saving websites with wget -mpk, monolith embeds + all assets as data URLs and therefore displays the saved page + exactly the same, being completely separated from the Internet. + +confinement: strict + +parts: + monolith: + plugin: rust + source: . + build-packages: + - libssl-dev + - pkg-config + +apps: + monolith: + command: monolith + plugs: + - home + - network + - removable-media From c34d77d5d84dc190e0673779a7cbe3b261c422d4 Mon Sep 17 00:00:00 2001 From: Vincent Flyson Date: Fri, 23 Aug 2019 23:06:06 -0400 Subject: [PATCH 6/6] Revamp resolve_url() and improve code format --- Cargo.toml | 2 +- src/html.rs | 16 +++++++++---- src/http.rs | 68 +++++++++++++++++------------------------------------ src/main.rs | 15 ++++++++---- 4 files changed, 46 insertions(+), 55 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 14a9cb3..4694a41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "monolith" -version = "2.0.8" +version = "2.0.9" authors = ["Sunshine "] description = "CLI tool for saving web pages as a single HTML file" diff --git a/src/html.rs b/src/html.rs index 100cddd..22052e3 100644 --- a/src/html.rs +++ b/src/html.rs @@ -19,7 +19,8 @@ enum NodeMatch { Other, } -const PNG_PIXEL: &str = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="; +const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\ +iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="; const JS_DOM_EVENT_ATTRS: [&str; 21] = [ // Input @@ -76,7 +77,8 @@ pub fn walk_and_embed_assets( NodeData::Comment { .. } => { // Note: in case of opt_no_js being set to true, there's no need to worry about // getting rid of comments that may contain scripts, e.g.