Merge branch 'master' into author-robatipoor

This commit is contained in:
Vincent Flyson 2019-08-23 23:24:07 -04:00 committed by GitHub
commit d8d6437a15
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 205 additions and 86 deletions

View File

@ -1,19 +1,18 @@
[package]
name = "monolith"
version = "2.0.5"
version = "2.0.9"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
]
description = "CLI tool to save webpages as a single HTML file"
description = "CLI tool for saving web pages as a single HTML file"
[dependencies]
base64 = "0.10.1"
clap = "2.33.0"
html5ever = "0.24.0"
indicatif = "0.11.0"
mime-sniffer = "0.1.2"
lazy_static = "1.3.0"
regex = "1.2.1"
reqwest = "0.9.20"
url = "2.1.0"
lazy_static = "1.3.0"

34
snap/snapcraft.yaml Normal file
View File

@ -0,0 +1,34 @@
name: monolith
base: core18
version: git
summary: Monolith - Save HTML pages with ease
description: |
A data hoarder's dream come true: bundle any web page into a single
HTML file. You can finally replace that gazillion of open tabs with
a gazillion of .html files stored somewhere on your precious little
drive.
Unlike conventional "Save page as…", monolith not only saves the
target document, it embeds CSS, image, and JavaScript assets all
at once, producing a single HTML5 document that is a joy to store
and share.
If compared to saving websites with wget -mpk, monolith embeds
all assets as data URLs and therefore displays the saved page
exactly the same, being completely separated from the Internet.
confinement: strict
parts:
monolith:
plugin: rust
source: .
build-packages:
- libssl-dev
- pkg-config
apps:
monolith:
command: monolith
plugs:
- home
- network
- removable-media

View File

@ -1,6 +1,7 @@
use http::{is_valid_url, resolve_url, retrieve_asset};
use std::default::Default;
use std::io;
use utils::data_to_dataurl;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
@ -14,10 +15,12 @@ enum NodeMatch {
Anchor,
Script,
Form,
IFrame,
Other,
}
const PNG_PIXEL: &str = "";
/// Data URL holding a base64-encoded transparent PNG pixel; it is written into
/// an image's `src` attribute in place of the real picture when images are
/// excluded from the output.
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
// Input
@ -74,7 +77,8 @@ pub fn walk_and_embed_assets(
NodeData::Comment { .. } => {
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and gets ignored by browsers other than IE [5, 9]
// since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9]
}
NodeData::Element {
@ -85,26 +89,26 @@ pub fn walk_and_embed_assets(
let attrs_mut = &mut attrs.borrow_mut();
let mut found = NodeMatch::Other;
if &name.local == "link" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) {
found = NodeMatch::Icon;
break;
} else if attr.value.to_string() == "stylesheet" {
found = NodeMatch::StyleSheet;
break;
match name.local.as_ref() {
"link" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) {
found = NodeMatch::Icon;
break;
} else if attr.value.to_string() == "stylesheet" {
found = NodeMatch::StyleSheet;
break;
}
}
}
}
} else if &name.local == "img" {
found = NodeMatch::Image;
} else if &name.local == "a" {
found = NodeMatch::Anchor;
} else if &name.local == "script" {
found = NodeMatch::Script;
} else if &name.local == "form" {
found = NodeMatch::Form;
"img" => { found = NodeMatch::Image; }
"a" => { found = NodeMatch::Anchor; }
"script" => { found = NodeMatch::Script; }
"form" => { found = NodeMatch::Form; }
"iframe" => { found = NodeMatch::IFrame; }
_ => {}
}
match found {
@ -128,7 +132,7 @@ pub fn walk_and_embed_assets(
if &attr.name.local == "src" {
if opt_no_images {
attr.value.clear();
attr.value.push_slice(PNG_PIXEL);
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let src_full_url = resolve_url(&url, &attr.value.to_string());
let img_datauri = retrieve_asset(
@ -146,8 +150,8 @@ pub fn walk_and_embed_assets(
NodeMatch::Anchor => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
// Do not touch hrefs which begin with a hash sign
if attr.value.to_string().chars().nth(0) == Some('#') {
// Don't touch email links or hrefs which begin with a hash sign
if attr.value.starts_with('#') || attr.value.starts_with("mailto:") {
continue;
}
@ -211,6 +215,32 @@ pub fn walk_and_embed_assets(
}
}
}
NodeMatch::IFrame => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let src_full_url = resolve_url(&url, &attr.value.to_string()).unwrap();
let iframe_data = retrieve_asset(
&src_full_url,
false,
"text/html",
opt_user_agent,
);
let dom = html_to_dom(&iframe_data.unwrap());
walk_and_embed_assets(
&src_full_url,
&dom.document,
opt_no_js,
opt_no_images,
opt_user_agent,
);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
let iframe_datauri = data_to_dataurl("text/html", &buf);
attr.value.clear();
attr.value.push_slice(iframe_datauri.as_str());
}
}
}
NodeMatch::Other => {}
}

View File

@ -1,6 +1,6 @@
use regex::Regex;
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
use reqwest::Client;
use reqwest::{Client, RedirectPolicy};
use std::time::Duration;
use url::{ParseError, Url};
use utils::data_to_dataurl;
@ -9,6 +9,13 @@ lazy_static! {
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
}
/// Reports whether `url` parses as a URL whose scheme is `data`.
///
/// # Errors
///
/// When `Url::parse` rejects `url`, the parser's error is returned rendered
/// as a `String`.
pub fn is_data_url(url: &str) -> Result<bool, String> {
    Url::parse(url)
        .map(|parsed| parsed.scheme() == "data")
        .map_err(|failure| failure.to_string())
}
/// Returns `true` when `path` begins with `http://` or `https://`, as decided
/// by the `REGEX_URL` pattern (`^https?://`); nothing past the scheme prefix
/// is inspected.
pub fn is_valid_url(path: &str) -> bool {
REGEX_URL.is_match(path)
}
@ -18,63 +25,23 @@ pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
// (anything, http://site.com/css/main.css)
to.to_string()
} else {
let mut re = String::new();
if is_valid_url(from) {
// It's a remote resource (HTTP)
if to.chars().nth(0) == Some('/') {
// (http://site.com/article/1, /...?)
let from_url = Url::parse(from)?;
if to.chars().nth(1) == Some('/') {
// (http://site.com/article/1, //images/1.png)
re.push_str(from_url.scheme());
re.push_str(":");
re.push_str(to);
} else {
// (http://site.com/article/1, /css/main.css)
re.push_str(from_url.scheme());
re.push_str("://");
re.push_str(from_url.host_str().unwrap());
re.push_str(to);
}
} else {
// (http://site.com, css/main.css)
// TODO improve to ensure no // or /// ever happen
re.push_str(from);
re.push_str("/");
re.push_str(to);
}
} else {
// It's a local resource (fs)
// TODO improve to ensure no // or /// ever happen
// TODO for fs use basepath instead of $from
re.push_str(from);
re.push_str("/");
re.push_str(to);
}
re
Url::parse(from)?.join(to)?.to_string()
};
Ok(result)
}
/// Tells whether `url` uses the `data` scheme.
///
/// # Errors
///
/// If `url` cannot be parsed by `Url::parse`, the parse error is returned as
/// its display string.
pub fn url_is_data(url: &str) -> Result<bool, String> {
    let parsed = Url::parse(url).map_err(|failure| failure.to_string())?;
    Ok(parsed.scheme() == "data")
}
pub fn retrieve_asset(
url: &str,
as_dataurl: bool,
as_mime: &str,
opt_user_agent: &str,
) -> Result<String, reqwest::Error> {
if url_is_data(&url).unwrap() {
if is_data_url(&url).unwrap() {
Ok(url.to_string())
} else {
let client = Client::builder()
.redirect(RedirectPolicy::limited(3))
.timeout(Duration::from_secs(10))
.build()
.unwrap();
@ -125,13 +92,19 @@ mod tests {
#[test]
fn test_resolve_url() -> Result<(), ParseError> {
let resolved_url = resolve_url("https://www.kernel.org", "../category/signatures.html")?;
let resolved_url = resolve_url(
"https://www.kernel.org",
"../category/signatures.html",
)?;
assert_eq!(
resolved_url.as_str(),
"https://www.kernel.org/../category/signatures.html"
"https://www.kernel.org/category/signatures.html"
);
let resolved_url = resolve_url("https://www.kernel.org", "category/signatures.html")?;
let resolved_url = resolve_url(
"https://www.kernel.org",
"category/signatures.html",
)?;
assert_eq!(
resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html"
@ -155,6 +128,15 @@ mod tests {
"https://www.kernel.org/theme/images/logos/tux.png"
);
let resolved_url = resolve_url(
"https://www.kernel.org",
"//another-host.org/theme/images/logos/tux.png",
)?;
assert_eq!(
resolved_url.as_str(),
"https://another-host.org/theme/images/logos/tux.png"
);
let resolved_url = resolve_url(
"https://www.kernel.org/category/signatures.html",
"/theme/images/logos/tux.png",
@ -164,16 +146,25 @@ mod tests {
"https://www.kernel.org/theme/images/logos/tux.png"
);
let resolved_url = resolve_url(
"https://www.w3schools.com/html/html_iframe.asp",
"default.asp",
)?;
assert_eq!(
resolved_url.as_str(),
"https://www.w3schools.com/html/default.asp"
);
Ok(())
}
#[test]
fn test_url_is_data() {
fn test_is_data_url() {
assert!(
url_is_data("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
.unwrap_or(false)
);
assert!(!url_is_data("https://kernel.org").unwrap_or(false));
assert!(!url_is_data("//kernel.org").unwrap_or(false));
assert!(!is_data_url("https://kernel.org").unwrap_or(false));
assert!(!is_data_url("//kernel.org").unwrap_or(false));
}
}

View File

@ -6,7 +6,8 @@ use clap::{App, Arg};
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
use monolith::http::{is_valid_url, retrieve_asset};
static DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
static DEFAULT_USER_AGENT: &str =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
fn main() {
let command = App::new("monolith")
@ -22,20 +23,26 @@ fn main() {
)
.args_from_usage("-j, --no-js 'Excludes JavaScript'")
.args_from_usage("-i, --no-images 'Removes images'")
.args_from_usage("-u, --user-agent=<Iceweasel> 'Custom User-Agent string'")
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
.get_matches();
// Process the command
let arg_target = command.value_of("url").unwrap();
let opt_no_js = command.is_present("no-js");
let opt_no_img = command.is_present("no-images");
let opt_no_images = command.is_present("no-images");
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
if is_valid_url(arg_target) {
let data = retrieve_asset(&arg_target, false, "", opt_user_agent);
let dom = html_to_dom(&data.unwrap());
let data = retrieve_asset(&arg_target, false, "", opt_user_agent).unwrap();
let dom = html_to_dom(&data);
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img, opt_user_agent);
walk_and_embed_assets(
&arg_target,
&dom.document,
opt_no_js,
opt_no_images,
opt_user_agent,
);
print_dom(&dom.document);
println!(); // Ensure newline at end of output

View File

@ -1,8 +1,31 @@
extern crate base64;
extern crate mime_sniffer;
use self::base64::encode;
use self::mime_sniffer::MimeTypeSniffer;
/// File-signature table used for MIME type detection: each entry pairs the
/// leading bytes of a payload (index 0) with the MIME type reported for it
/// (index 1). Entries are tried in order and the first match wins.
// NOTE(review): the `....` runs in the RIFF/ftyp/moov signatures stand where
// real files carry variable bytes (e.g. a chunk size) — confirm the matcher
// treats them as placeholders rather than literal dots.
// NOTE(review): `\xFF\x0E` / `\xFF\x0F` are presumably meant to be MPEG audio
// frame-sync bytes — verify the values.
static MAGIC: [[&[u8]; 2]; 19] = [
// Image
[b"GIF87a", b"image/gif"],
[b"GIF89a", b"image/gif"],
[b"\xFF\xD8\xFF", b"image/jpeg"],
[b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"],
[b"<?xml ", b"image/svg+xml"],
[b"<svg ", b"image/svg+xml"],
[b"RIFF....WEBPVP8 ", b"image/webp"],
[b"\x00\x00\x01\x00", b"image/x-icon"],
// Audio
[b"ID3", b"audio/mpeg"],
[b"\xFF\x0E", b"audio/mpeg"],
[b"\xFF\x0F", b"audio/mpeg"],
[b"OggS", b"audio/ogg"],
[b"RIFF....WAVEfmt ", b"audio/wav"],
[b"fLaC", b"audio/x-flac"],
// Video
[b"RIFF....AVI LIST", b"video/avi"],
[b"....ftyp", b"video/mp4"],
[b"\x00\x00\x01\x0B", b"video/mpeg"],
[b"....moov", b"video/quicktime"],
[b"\x1A\x45\xDF\xA3", b"video/webm"],
];
pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
let mimetype = if mime == "" {
@ -14,7 +37,16 @@ pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
}
fn detect_mimetype(data: &[u8]) -> String {
data.sniff_mime_type().unwrap_or("").to_string()
let mut re = String::new();
for item in MAGIC.iter() {
if data.starts_with(item[0]) {
re = String::from_utf8(item[1].to_vec()).unwrap();
break;
}
}
re
}
#[cfg(test)]
@ -31,4 +63,30 @@ mod tests {
"data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
);
}
/// Checks that every signature known to `detect_mimetype` maps to its
/// expected MIME type when the input consists of exactly the signature bytes.
#[test]
fn test_detect_mimetype() {
    let cases: [(&[u8], &str); 19] = [
        // Image
        (b"GIF87a", "image/gif"),
        (b"GIF89a", "image/gif"),
        (b"\xFF\xD8\xFF", "image/jpeg"),
        (b"\x89PNG\x0D\x0A\x1A\x0A", "image/png"),
        (b"<?xml ", "image/svg+xml"),
        (b"<svg ", "image/svg+xml"),
        (b"RIFF....WEBPVP8 ", "image/webp"),
        (b"\x00\x00\x01\x00", "image/x-icon"),
        // Audio
        (b"ID3", "audio/mpeg"),
        (b"\xFF\x0E", "audio/mpeg"),
        (b"\xFF\x0F", "audio/mpeg"),
        (b"OggS", "audio/ogg"),
        (b"RIFF....WAVEfmt ", "audio/wav"),
        (b"fLaC", "audio/x-flac"),
        // Video
        (b"RIFF....AVI LIST", "video/avi"),
        (b"....ftyp", "video/mp4"),
        (b"\x00\x00\x01\x0B", "video/mpeg"),
        (b"....moov", "video/quicktime"),
        (b"\x1A\x45\xDF\xA3", "video/webm"),
    ];

    for &(signature, expected) in cases.iter() {
        assert_eq!(detect_mimetype(signature), expected);
    }
}
}