Merged Y2Z/master with Alch-Emi/lazyload

This commit is contained in:
Emi Simpson 2019-12-24 10:07:56 -05:00
commit dab4ae6965
No known key found for this signature in database
GPG key ID: 68FAB2E2E6DFC98B
12 changed files with 295 additions and 114 deletions

View file

@ -16,6 +16,11 @@ before_script:
- rustup component add rustfmt
script:
- cargo build --locked --verbose
- cargo test --locked --verbose
- cargo fmt --all -- --check
- cargo build --all --locked --verbose
- cargo test --all --locked --verbose
- cargo fmt --all -- --check
jobs:
allow_failures:
- rust: nightly
fast_finish: true

2
Cargo.lock generated
View file

@ -625,7 +625,7 @@ dependencies = [
[[package]]
name = "monolith"
version = "2.0.23"
version = "2.1.0"
dependencies = [
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",

View file

@ -1,10 +1,11 @@
[package]
name = "monolith"
version = "2.0.23"
version = "2.1.0"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
]
description = "CLI tool for saving web pages as a single HTML file"

View file

@ -18,17 +18,20 @@ Unlike the conventional "Save page as", `monolith` not only saves the target doc
Compared to saving websites with `wget -mpk`, this tool embeds all assets as data URLs and therefore lets browsers render the saved page exactly the way it was on the Internet, even when no network connection is available.
<!-- `This program works both on remote and local targets. -->
## Installation
### Installation
### From source
$ git clone https://github.com/Y2Z/monolith.git
$ cd monolith
$ cargo install --path .
### Usage
### On macOS (via Homebrew)
$ brew install monolith
## Usage
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
### Options
## Options
- `-c`: Ignore styles
- `-f`: Exclude iframes
- `-i`: Remove images
@ -38,11 +41,11 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
- `-s`: Silent mode
- `-u`: Specify custom User-Agent
### Related Projects
## Related projects
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
### License
## License
The Unlicense
<!-- Microtext -->

View file

@ -7,13 +7,10 @@ use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset;
use js::attr_is_event_handler;
use reqwest::Client;
use std::collections::HashMap;
use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
lazy_static! {
static ref EMPTY_STRING: String = String::new();
}
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
const ICON_VALUES: [&str; 5] = [
"icon",
@ -35,7 +32,7 @@ pub fn get_parent_node(node: &Handle) -> Handle {
pub fn get_node_name(node: &Handle) -> String {
match &node.data {
NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
_ => EMPTY_STRING.clone(),
_ => str!(),
}
}
@ -45,14 +42,13 @@ pub fn is_icon(attr_value: &str) -> bool {
pub fn walk_and_embed_assets(
cache: &mut HashMap<String, String>,
client: &Client,
url: &str,
node: &Handle,
opt_no_css: bool,
opt_no_js: bool,
opt_no_images: bool,
opt_user_agent: &str,
opt_silent: bool,
opt_insecure: bool,
opt_no_frames: bool,
) {
match node.data {
@ -61,14 +57,13 @@ pub fn walk_and_embed_assets(
for child in node.children.borrow().iter() {
walk_and_embed_assets(
cache,
client,
&url,
child,
opt_no_css,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
opt_no_frames,
);
}
@ -104,17 +99,16 @@ pub fn walk_and_embed_assets(
} else {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or(str!());
let (favicon_dataurl, _) = retrieve_asset(
cache,
client,
&href_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(favicon_dataurl.as_str());
}
@ -128,19 +122,37 @@ pub fn walk_and_embed_assets(
} else {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let (css_dataurl, _) = retrieve_asset(
.unwrap_or(str!());
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
true,
false,
"text/css",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_silent,
),
// If a network error occurred, warn
Err(e) => {
eprintln!("Warning: {}", e,);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(css_dataurl.as_str());
attr.value.push_slice(&replacement_text);
}
}
}
@ -148,8 +160,7 @@ pub fn walk_and_embed_assets(
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
}
@ -184,16 +195,15 @@ pub fn walk_and_embed_assets(
.map(|attr| &attr.value)
.filter(|src| !src.is_empty()) // Ignore empty srcs
.next()
.and_then(|src| resolve_url(&url, src).ok()) //Make absolute
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
.and_then(|abs_src| // Download and convert to dataurl
retrieve_asset(
cache,
client,
&abs_src,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
).ok())
{
// Add the new dataurl src attribute
@ -220,17 +230,16 @@ pub fn walk_and_embed_assets(
} else {
let srcset_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
.unwrap_or(str!());
let (source_dataurl, _) = retrieve_asset(
cache,
client,
&srcset_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(source_dataurl.as_str());
}
@ -246,8 +255,8 @@ pub fn walk_and_embed_assets(
continue;
}
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
@ -266,18 +275,16 @@ pub fn walk_and_embed_assets(
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let src_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
let (js_dataurl, _) = retrieve_asset(
cache,
client,
&src_full_url,
true,
"application/javascript",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(js_dataurl.as_str());
}
@ -288,6 +295,23 @@ pub fn walk_and_embed_assets(
if opt_no_css {
// Empty inner content of STYLE tags
node.children.borrow_mut().clear();
} else {
for node in node.children.borrow_mut().iter_mut() {
if let NodeData::Text { ref contents } = node.data {
let mut tendril = contents.borrow_mut();
let replacement = resolve_css_imports(
cache,
client,
tendril.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
tendril.clear();
tendril.push_slice(&replacement);
}
}
}
}
"form" => {
@ -296,8 +320,7 @@ pub fn walk_and_embed_assets(
// Modify action to be a full URL
if !is_valid_url(&attr.value) {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
@ -316,33 +339,31 @@ pub fn walk_and_embed_assets(
let iframe_src: String = attr.value.to_string();
// Ignore iframes with empty source (they cause infinite loops)
if iframe_src == EMPTY_STRING.clone() {
if iframe_src == str!() {
continue;
}
let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone());
resolve_url(&url, &iframe_src).unwrap_or(str!());
let (iframe_data, iframe_final_url) = retrieve_asset(
cache,
client,
&src_full_url,
false,
"text/html",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or((EMPTY_STRING.clone(), src_full_url));
.unwrap_or((str!(), src_full_url));
let dom = html_to_dom(&iframe_data);
walk_and_embed_assets(
cache,
client,
&iframe_final_url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new();
@ -359,25 +380,24 @@ pub fn walk_and_embed_assets(
let video_poster = attr.value.to_string();
// Skip posters with empty source
if video_poster == EMPTY_STRING.clone() {
if video_poster == str!() {
continue;
}
if opt_no_images {
attr.value.clear();
} else {
let poster_full_url: String = resolve_url(&url, &video_poster)
.unwrap_or(EMPTY_STRING.clone());
let poster_full_url: String =
resolve_url(&url, &video_poster).unwrap_or(str!());
let (poster_dataurl, _) = retrieve_asset(
cache,
client,
&poster_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or((poster_full_url, EMPTY_STRING.clone()));
.unwrap_or((poster_full_url, str!()));
attr.value.clear();
attr.value.push_slice(poster_dataurl.as_str());
}
@ -387,6 +407,7 @@ pub fn walk_and_embed_assets(
_ => {}
}
// Process style attributes
if opt_no_css {
// Get rid of style attributes
let mut style_attr_indexes = Vec::new();
@ -399,6 +420,24 @@ pub fn walk_and_embed_assets(
for attr_index in style_attr_indexes {
attrs_mut.remove(attr_index);
}
} else {
// Otherwise, parse any links found in the attributes
for attribute in attrs_mut
.iter_mut()
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
{
let replacement = resolve_css_imports(
cache,
client,
attribute.value.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
attribute.value.clear();
attribute.value.push_slice(&replacement);
}
}
if opt_no_js {
@ -419,14 +458,13 @@ pub fn walk_and_embed_assets(
for child in node.children.borrow().iter() {
walk_and_embed_assets(
cache,
client,
&url,
child,
opt_no_css,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
opt_no_frames,
);
}
@ -481,7 +519,7 @@ pub fn stringify_document(
let doc = dom.get_document();
let html = get_child_node_by_name(&doc, "html");
let head = get_child_node_by_name(&html, "head");
let mut content_attr = EMPTY_STRING.clone();
let mut content_attr = str!();
if opt_isolate {
content_attr += " default-src 'unsafe-inline' data:;";
}

View file

@ -1,17 +1,15 @@
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
use reqwest::header::CONTENT_TYPE;
use reqwest::Client;
use std::collections::HashMap;
use std::time::Duration;
use utils::{data_to_dataurl, is_data_url};
pub fn retrieve_asset(
cache: &mut HashMap<String, String>,
client: &Client,
url: &str,
as_dataurl: bool,
mime: &str,
opt_user_agent: &str,
opt_silent: bool,
opt_insecure: bool,
) -> Result<(String, String), reqwest::Error> {
if is_data_url(&url).unwrap() {
Ok((url.to_string(), url.to_string()))
@ -19,23 +17,19 @@ pub fn retrieve_asset(
if cache.contains_key(&url.to_string()) {
// url is in cache
if !opt_silent {
eprintln!("[ {} ] (from cache)", &url);
eprintln!("{} (from cache)", &url);
}
let data = cache.get(&url.to_string()).unwrap();
Ok((data.to_string(), url.to_string()))
} else {
// url not in cache, we request it
let client = Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(opt_insecure)
.build()?;
let mut response = client.get(url).header(USER_AGENT, opt_user_agent).send()?;
let mut response = client.get(url).send()?;
if !opt_silent {
if url == response.url().as_str() {
eprintln!("[ {} ]", &url);
eprintln!("{}", &url);
} else {
eprintln!("[ {} -> {} ]", &url, &response.url().as_str());
eprintln!("{} -> {}", &url, &response.url().as_str());
}
}

View file

@ -1,10 +1,13 @@
extern crate html5ever;
#[macro_use]
extern crate lazy_static;
extern crate html5ever;
extern crate regex;
extern crate reqwest;
extern crate url;
#[macro_use]
mod macros;
pub mod html;
pub mod http;
pub mod js;

9
src/macros.rs Normal file
View file

@ -0,0 +1,9 @@
/// Shorthand for producing an owned `String`.
///
/// - `str!()` expands to a new, empty `String`.
/// - `str!(value)` expands to `ToString::to_string(&value)`, so it accepts
///   any expression whose type implements `ToString`.
///
/// The expansion uses absolute `::std` paths on purpose: `macro_rules!` does
/// not resolve item names at the definition site, so a bare `String` or
/// `ToString` would be looked up wherever the macro is invoked and would
/// break if the caller shadows either name.
#[macro_export]
macro_rules! str {
    () => {
        ::std::string::String::new()
    };
    ($val: expr) => {
        ::std::string::ToString::to_string(&$val)
    };
}

View file

@ -1,6 +1,7 @@
#[macro_use]
extern crate clap;
extern crate monolith;
extern crate reqwest;
mod args;
@ -8,34 +9,50 @@ use args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
use std::time::Duration;
fn main() {
let app_args = AppArgs::get();
let cache = &mut HashMap::new();
if is_valid_url(app_args.url_target.as_str()) {
// Initialize client
let mut header_map = HeaderMap::new();
match HeaderValue::from_str(&app_args.user_agent) {
Ok(header) => header_map.insert(USER_AGENT, header),
Err(err) => {
eprintln!("Invalid user agent! {}", err);
return;
}
};
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
let (data, final_url) = retrieve_asset(
cache,
&client,
app_args.url_target.as_str(),
false,
"",
app_args.user_agent.as_str(),
app_args.silent,
app_args.insecure,
)
.unwrap();
let dom = html_to_dom(&data);
walk_and_embed_assets(
cache,
&client,
&final_url,
&dom.document,
app_args.no_css,
app_args.no_js,
app_args.no_images,
app_args.user_agent.as_str(),
app_args.silent,
app_args.insecure,
app_args.no_frames,
);

View file

@ -70,18 +70,18 @@ fn test_walk_and_embed_assets() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets(
cache,
&client,
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
@ -106,18 +106,18 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets(
cache,
&client,
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
@ -144,18 +144,17 @@ fn test_walk_and_embed_assets_no_css() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets(
cache,
&client,
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
@ -189,18 +188,18 @@ fn test_walk_and_embed_assets_no_images() {
let opt_no_js: bool = false;
let opt_no_images: bool = true;
let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets(
cache,
&client,
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
@ -236,18 +235,17 @@ fn test_walk_and_embed_assets_no_frames() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets(
cache,
&client,
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);
@ -275,18 +273,18 @@ fn test_walk_and_embed_assets_no_js() {
let opt_no_js: bool = true;
let opt_no_images: bool = false;
let opt_silent = true;
let opt_insecure = false;
let client = reqwest::Client::new();
walk_and_embed_assets(
cache,
&client,
&url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
"",
opt_silent,
opt_insecure,
opt_no_frames,
);

View file

@ -3,26 +3,18 @@ use std::collections::HashMap;
#[test]
fn test_retrieve_asset() {
let cache = &mut HashMap::new();
let (data, final_url) = retrieve_asset(
cache,
"data:text/html;base64,...",
true,
"",
"",
true,
false,
)
.unwrap();
let client = reqwest::Client::new();
let (data, final_url) =
retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap();
assert_eq!(&data, "data:text/html;base64,...");
assert_eq!(&final_url, "data:text/html;base64,...");
let (data, final_url) = retrieve_asset(
cache,
&client,
"data:text/html;base64,...",
true,
"image/png",
"",
true,
false,
)
.unwrap();

View file

@ -1,12 +1,47 @@
extern crate base64;
use self::base64::encode;
use http::retrieve_asset;
use regex::Regex;
use reqwest::Client;
use std::collections::HashMap;
use url::{ParseError, Url};
/// This monster of a regex is used to match any kind of URL found in CSS.
///
/// There are roughly three different categories that a found URL could fit
/// into:
/// - Font [found after a `src:` property in an `@font-face` rule]
/// - Stylesheet [denoted by an `@import` before the url]
/// - Image [covers all other uses of the `url()` function]
///
/// This regex aims to extract the following information:
/// - What type of URL it is (font/image/css)
/// - Where the part that needs to be replaced is (incl. any wrapping quotes)
/// - What the URL is (excl. any wrapping quotes)
///
/// Essentially, the regex can be broken down into two parts:
///
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
/// This matches the precursor to a font or CSS URL, and fills in a match under
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
/// Determining whether or not it's an image can be done by the negation of both
/// of these. Either zero or one of these can match. Note that the precursor
/// must be followed by at least one whitespace character (`\s+`) to be
/// recognised; otherwise only the `url(...)` part matches and the URL is
/// classified as an image.
///
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
/// This matches the actual URL part of the url(), and must always match. It also
/// sets `<to_repl>` and `<url>` which correspond to everything within
/// `url(...)` and a usable URL, respectively.
///
/// Note, however, that this does not perform any validation of the found URL.
/// Malformed CSS could lead to an invalid URL being present. It is therefore
/// recommended that the URL gets manually validated.
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
// Regexes shared by this module; `lazy_static!` compiles each one on first
// use instead of on every call.
lazy_static! {
// Matches input that begins with one or more lowercase letters or digits
// followed by a colon, i.e. something shaped like a URL scheme prefix.
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
// Matches input that begins with `http://` or `https://`.
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
// Finds `url(...)` references in CSS; see `CSS_URL_REGEX_STR` for the
// capture groups it exposes.
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
}
const MAGIC: [[&[u8]; 2]; 19] = [
@ -75,3 +110,89 @@ pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<Strin
};
Ok(result)
}
/// Rewrites every `url(...)` reference found in `css_string` so that the
/// resulting CSS no longer depends on relative paths.
///
/// Each URL matched by `REGEX_CSS_URL` is first made absolute against `href`
/// and then handled according to its kind:
///
/// - `@import` stylesheets are downloaded, processed recursively by this same
///   function, and embedded as a `text/css` data URL;
/// - fonts are always downloaded and embedded as a data URL;
/// - images are embedded as a data URL unless `opt_no_images` is set, in which
///   case only the absolute URL is written back.
///
/// Whatever replaces the original reference is always wrapped in double
/// quotes. If a URL cannot be resolved against `href`, the reference is left
/// exactly as it was. If downloading fails, a warning is printed to stderr
/// (regardless of `opt_silent`) and the absolute URL is used instead.
///
/// # Arguments
///
/// * `cache` - asset cache handed through to `retrieve_asset`
/// * `client` - HTTP client handed through to `retrieve_asset`
/// * `css_string` - the CSS text to process
/// * `as_dataurl` - when `true`, the processed CSS is returned encoded as a
///   `text/css` data URL; otherwise it is returned as plain CSS text
/// * `href` - base URL that relative references are resolved against
/// * `opt_no_images` - do not download images, only absolutize their URLs
/// * `opt_silent` - handed through to `retrieve_asset`
///
/// NOTE(review): nothing visible here guards against stylesheets that
/// `@import` each other in a cycle; confirm whether callers can hit that.
pub fn resolve_css_imports(
    cache: &mut HashMap<String, String>,
    client: &Client,
    css_string: &str,
    as_dataurl: bool,
    href: &str,
    opt_no_images: bool,
    opt_silent: bool,
) -> String {
    // The output is assembled by appending the untouched text between matches
    // followed by each replacement. Patching a copy in place would require
    // tracking `new_len - old_len` as an offset, which underflows `usize` as
    // soon as the replacements are shorter than the text they replace.
    let mut resolved_css = String::with_capacity(css_string.len());
    let mut copied_until = 0;

    for link in REGEX_CSS_URL.captures_iter(css_string) {
        // Both groups are mandatory parts of the pattern, so they are present
        // on every match; the fallback only avoids a panic path.
        let (target_link, dest) = match (link.name("url"), link.name("to_repl")) {
            (Some(url), Some(to_repl)) => (url.as_str(), to_repl),
            _ => continue,
        };

        // Determine the type of link
        let is_stylesheet = link.name("stylesheet").is_some();
        let is_font = link.name("font").is_some();
        let is_image = !is_stylesheet && !is_font;

        // Generate absolute URL for content
        let embedded_url = match resolve_url(href, target_link) {
            Ok(url) => url,
            // Malformed URL: leave this reference untouched. Its text is
            // copied verbatim by the next append (or by the final tail copy).
            Err(_) => continue,
        };

        // Download the asset. If it's more CSS, resolve that too
        let content = if is_stylesheet {
            // The link is an @import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                false,      // Formatting as data URL will be done later
                "text/css", // Expect CSS
                opt_silent,
            )
            .map(|(content, _)| {
                resolve_css_imports(
                    cache,
                    client,
                    &content,
                    true, // Finally, convert to a dataurl
                    &embedded_url,
                    opt_no_images,
                    opt_silent,
                )
            })
        } else if (is_image && !opt_no_images) || is_font {
            // The link is some other, non-@import link
            retrieve_asset(
                cache,
                client,
                &embedded_url,
                true, // Format as data URL
                "",   // Unknown MIME type
                opt_silent,
            )
            .map(|(data, _)| data)
        } else {
            // If it's a datatype that has been opt_no'd out of, replace with
            // absolute URL
            Ok(embedded_url.clone())
        }
        .unwrap_or_else(|e| {
            eprintln!("Warning: {}", e);
            // If failed to resolve, replace with absolute URL
            embedded_url
        });

        // Matches arrive in order and never overlap, so everything between the
        // previous replacement and this one is copied through unchanged.
        resolved_css.push_str(&css_string[copied_until..dest.start()]);
        resolved_css.push('"');
        resolved_css.push_str(&content);
        resolved_css.push('"');
        copied_until = dest.end();
    }

    // Copy whatever follows the last replaced reference
    resolved_css.push_str(&css_string[copied_until..]);

    if as_dataurl {
        data_to_dataurl("text/css", resolved_css.as_bytes())
    } else {
        resolved_css
    }
}