Merge pull request #22 from Y2Z/user-agent
Add -u flag for custom User-Agent
This commit is contained in:
commit
75969c9943
5 changed files with 54 additions and 19 deletions
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "monolith"
|
name = "monolith"
|
||||||
version = "2.0.4"
|
version = "2.0.5"
|
||||||
authors = ["Sunshine <sunshine@uberspace.net>"]
|
authors = ["Sunshine <sunshine@uberspace.net>"]
|
||||||
description = "CLI tool to save webpages as a single HTML file"
|
description = "CLI tool to save webpages as a single HTML file"
|
||||||
|
|
||||||
|
@ -14,4 +14,3 @@ regex = "1.2.1"
|
||||||
reqwest = "0.9.20"
|
reqwest = "0.9.20"
|
||||||
url = "2.1.0"
|
url = "2.1.0"
|
||||||
lazy_static = "1.3.0"
|
lazy_static = "1.3.0"
|
||||||
|
|
||||||
|
|
|
@ -17,14 +17,11 @@ If compared to saving websites with `wget -mpk`, `monolith` embeds all assets as
|
||||||
|
|
||||||
### Usage
|
### Usage
|
||||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
|
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
|
||||||
<!-- or -->
|
|
||||||
|
|
||||||
<!-- cat local.html | monolith - > local.html -->
|
|
||||||
|
|
||||||
### Options
|
### Options
|
||||||
- `-i`: Remove images
|
- `-i`: Remove images
|
||||||
- `-j`: Exclude JavaScript
|
- `-j`: Exclude JavaScript
|
||||||
<!-- - `-a`: Don't make anchors link to remote documents -->
|
- `-u`: Specify custom User-Agent
|
||||||
|
|
||||||
### License
|
### License
|
||||||
The Unlicense
|
The Unlicense
|
||||||
|
|
48
src/html.rs
48
src/html.rs
|
@ -47,13 +47,23 @@ const JS_DOM_EVENT_ATTRS: [&str; 21] = [
|
||||||
"onresize",
|
"onresize",
|
||||||
];
|
];
|
||||||
|
|
||||||
#[allow(clippy::cognitive_complexity)]
|
pub fn walk_and_embed_assets(
|
||||||
pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_images: bool) {
|
url: &str,
|
||||||
|
node: &Handle,
|
||||||
|
opt_no_js: bool,
|
||||||
|
opt_no_images: bool,
|
||||||
|
opt_user_agent: &str,
|
||||||
|
) {
|
||||||
match node.data {
|
match node.data {
|
||||||
NodeData::Document => {
|
NodeData::Document => {
|
||||||
// Dig deeper
|
// Dig deeper
|
||||||
for child in node.children.borrow().iter() {
|
for child in node.children.borrow().iter() {
|
||||||
walk_and_embed_assets(&url, child, opt_no_js, opt_no_images);
|
walk_and_embed_assets(
|
||||||
|
&url, child,
|
||||||
|
opt_no_js,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,7 +112,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "href" {
|
if &attr.name.local == "href" {
|
||||||
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
||||||
let favicon_datauri = retrieve_asset(&href_full_url.unwrap(), true, "");
|
let favicon_datauri = retrieve_asset(
|
||||||
|
&href_full_url.unwrap(),
|
||||||
|
true,
|
||||||
|
"",
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(favicon_datauri.unwrap().as_str());
|
attr.value.push_slice(favicon_datauri.unwrap().as_str());
|
||||||
}
|
}
|
||||||
|
@ -116,7 +131,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
|
||||||
attr.value.push_slice(PNG_PIXEL);
|
attr.value.push_slice(PNG_PIXEL);
|
||||||
} else {
|
} else {
|
||||||
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
let src_full_url = resolve_url(&url, &attr.value.to_string());
|
||||||
let img_datauri = retrieve_asset(&src_full_url.unwrap(), true, "");
|
let img_datauri = retrieve_asset(
|
||||||
|
&src_full_url.unwrap(),
|
||||||
|
true,
|
||||||
|
"",
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(img_datauri.unwrap().as_str());
|
attr.value.push_slice(img_datauri.unwrap().as_str());
|
||||||
}
|
}
|
||||||
|
@ -141,8 +161,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
|
||||||
for attr in attrs_mut.iter_mut() {
|
for attr in attrs_mut.iter_mut() {
|
||||||
if &attr.name.local == "href" {
|
if &attr.name.local == "href" {
|
||||||
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
let href_full_url = resolve_url(&url, &attr.value.to_string());
|
||||||
let css_datauri =
|
let css_datauri = retrieve_asset(
|
||||||
retrieve_asset(&href_full_url.unwrap(), true, "text/css");
|
&href_full_url.unwrap(),
|
||||||
|
true,
|
||||||
|
"text/css",
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(css_datauri.unwrap().as_str());
|
attr.value.push_slice(css_datauri.unwrap().as_str());
|
||||||
}
|
}
|
||||||
|
@ -165,6 +189,7 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
|
||||||
&src_full_url.unwrap(),
|
&src_full_url.unwrap(),
|
||||||
true,
|
true,
|
||||||
"application/javascript",
|
"application/javascript",
|
||||||
|
opt_user_agent,
|
||||||
);
|
);
|
||||||
attr.value.clear();
|
attr.value.clear();
|
||||||
attr.value.push_slice(js_datauri.unwrap().as_str());
|
attr.value.push_slice(js_datauri.unwrap().as_str());
|
||||||
|
@ -200,7 +225,13 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
|
||||||
|
|
||||||
// Dig deeper
|
// Dig deeper
|
||||||
for child in node.children.borrow().iter() {
|
for child in node.children.borrow().iter() {
|
||||||
walk_and_embed_assets(&url, child, opt_no_js, opt_no_images);
|
walk_and_embed_assets(
|
||||||
|
&url,
|
||||||
|
child,
|
||||||
|
opt_no_js,
|
||||||
|
opt_no_images,
|
||||||
|
opt_user_agent,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -216,7 +247,6 @@ pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn print_dom(handle: &Handle) {
|
pub fn print_dom(handle: &Handle) {
|
||||||
// TODO: append <meta http-equiv="Access-Control-Allow-Origin" content="'self'"/> to the <head> if opt_isolate
|
|
||||||
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
|
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use reqwest::header::CONTENT_TYPE;
|
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use url::{ParseError, Url};
|
use url::{ParseError, Url};
|
||||||
|
@ -69,6 +69,7 @@ pub fn retrieve_asset(
|
||||||
url: &str,
|
url: &str,
|
||||||
as_dataurl: bool,
|
as_dataurl: bool,
|
||||||
as_mime: &str,
|
as_mime: &str,
|
||||||
|
opt_user_agent: &str,
|
||||||
) -> Result<String, reqwest::Error> {
|
) -> Result<String, reqwest::Error> {
|
||||||
if url_is_data(&url).unwrap() {
|
if url_is_data(&url).unwrap() {
|
||||||
Ok(url.to_string())
|
Ok(url.to_string())
|
||||||
|
@ -77,7 +78,11 @@ pub fn retrieve_asset(
|
||||||
.timeout(Duration::from_secs(10))
|
.timeout(Duration::from_secs(10))
|
||||||
.build()
|
.build()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let mut response = client.get(url).send().unwrap();
|
let mut response = client
|
||||||
|
.get(url)
|
||||||
|
.header(USER_AGENT, opt_user_agent)
|
||||||
|
.send()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
if as_dataurl {
|
if as_dataurl {
|
||||||
// Convert response into a byte array
|
// Convert response into a byte array
|
||||||
|
|
|
@ -6,6 +6,8 @@ use clap::{App, Arg};
|
||||||
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
|
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
|
||||||
use monolith::http::{is_valid_url, retrieve_asset};
|
use monolith::http::{is_valid_url, retrieve_asset};
|
||||||
|
|
||||||
|
static DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let command = App::new("monolith")
|
let command = App::new("monolith")
|
||||||
.version(crate_version!())
|
.version(crate_version!())
|
||||||
|
@ -20,18 +22,20 @@ fn main() {
|
||||||
)
|
)
|
||||||
.args_from_usage("-j, --no-js 'Excludes JavaScript'")
|
.args_from_usage("-j, --no-js 'Excludes JavaScript'")
|
||||||
.args_from_usage("-i, --no-images 'Removes images'")
|
.args_from_usage("-i, --no-images 'Removes images'")
|
||||||
|
.args_from_usage("-u, --user-agent=<Iceweasel> 'Custom User-Agent string'")
|
||||||
.get_matches();
|
.get_matches();
|
||||||
|
|
||||||
// Process the command
|
// Process the command
|
||||||
let arg_target = command.value_of("url").unwrap();
|
let arg_target = command.value_of("url").unwrap();
|
||||||
let opt_no_js = command.is_present("no-js");
|
let opt_no_js = command.is_present("no-js");
|
||||||
let opt_no_img = command.is_present("no-images");
|
let opt_no_img = command.is_present("no-images");
|
||||||
|
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
||||||
|
|
||||||
if is_valid_url(arg_target) {
|
if is_valid_url(arg_target) {
|
||||||
let data = retrieve_asset(&arg_target, false, "");
|
let data = retrieve_asset(&arg_target, false, "", opt_user_agent);
|
||||||
let dom = html_to_dom(&data.unwrap());
|
let dom = html_to_dom(&data.unwrap());
|
||||||
|
|
||||||
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img);
|
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img, opt_user_agent);
|
||||||
|
|
||||||
print_dom(&dom.document);
|
print_dom(&dom.document);
|
||||||
println!(); // Ensure newline at end of output
|
println!(); // Ensure newline at end of output
|
||||||
|
|
Loading…
Reference in a new issue