diff --git a/README.md b/README.md index 671fa22..64d9bba 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,33 @@ # monolith -Save HTML pages with ease +A data hoarder's dream come true: +bundle any web page into a stand-alone HTML file. + +Unlike conventional "Save page as …", `monolith` saves the target +document **and** embeds JavaScript, CSS and image assets **all at once**, +resulting in a single HTML5 document that is easy to store and share. + +Works both on remote and local targets. + +If compared to saving websites with `wget -mpk http://news.ycombinator.com`, +`monolith` embeds all assets as data-URIs and therefore would display the page +exactly the same at any time, not depending on the Internet connection. + +However, keep in mind that `monolith` is not aware of your browser's session. + +### Installation + $ sudo npm install -g git@github.com:Y2Z/monolith.git + +### Usage + $ monolith <local path>/index.html > mysite.html +or + $ monolith https://github.com > github.html + + + +### Options + - `-u`: output the result document as one big data-URI + - `-q`: don't be verbose + + +### License +GPLv3 diff --git a/bin/index.js b/bin/index.js new file mode 100755 index 0000000..46e3ad4 --- /dev/null +++ b/bin/index.js @@ -0,0 +1,40 @@ +#!/usr/bin/env node + +var compactor = require('../compactor.js'); +var options = require('../options.js'); + +function printUsage () { // Print a short usage example to stdout + console.log("\nUsage: \n monolith https://github.com\n") +} + +if (process.argv.length > 2) { + var target = null + + for (var i = 2, ilen = process.argv.length; i < ilen; i++) { // argv[0] and argv[1] are the interpreter and this script + var argument = process.argv[i] + + if (argument === '--data-uri' || argument === '-u') { + options.outputFinalResultAsBase64 = true + } else if (argument === '--quiet' || argument === '-q') { + options.suppressVerboseOutput = true + } else { + if (!target) { + target = argument + } else { + // Can't have more than one target + target = null + break + } + } + } + + if (target) { + compactor(target, function(error, 
result) { + if (error) { console.error(error); process.exitCode = 1 } else { console.log(result) } // failures go to stderr with a non-zero exit status + }) + } else { + printUsage() + } +} else { + printUsage() +} diff --git a/compactor.js b/compactor.js new file mode 100755 index 0000000..e57379e --- /dev/null +++ b/compactor.js @@ -0,0 +1,49 @@ +#!/usr/bin/env node + +'use strict' + +var path = require('path') +var jsdom = require('jsdom') + +var options = require('./options.js'); +var functions = require('./functions.js') +var absoluteURLPath = functions.absoluteURLPath, +isURL = functions.isURL, +base64 = functions.base64, +resolve = functions.resolve, +retrieveFile = functions.retrieveFile + +var modules = [ + // 1. CSS + require('./modules/css.js').parser, + // 2. JS + require('./modules/js.js').parser, + // 3. images + require('./modules/img.js').parser, + // 4. favicon + require('./modules/favicon.js').parser, + // 5. anchors + require('./modules/anchors.js').parser, +] + +function monolith (targetDocumentPath, callback) { // callback(error, result) receives the bundled document + // Determine the absolute initial document path + var absBasePath = isURL(targetDocumentPath) + ? absoluteURLPath(targetDocumentPath) + : path.dirname(path.resolve(targetDocumentPath)) + absBasePath += '/' // Append trailing slash + + // Retrieve the root document to use as a base (a local target is made absolute first, otherwise a relative path such as dir/index.html would be resolved against its own directory a second time) + var rootFileContent = retrieveFile(absBasePath, isURL(targetDocumentPath) ? targetDocumentPath : path.resolve(targetDocumentPath)) + + // Convert the target document into a DOM tree + jsdom.env(rootFileContent, [], function (err, window) { + if (err) return callback(err) // parsing failed, there is no window to work on + for (var i = 0, ilen = modules.length; i < ilen; i++) modules[i](window, absBasePath) + + var result = jsdom.serializeDocument(window.document) // the whole document including doctype and the html element itself + callback(null, options.outputFinalResultAsBase64 ? 
'data:text/html;charset=utf-8;base64,' + base64(result) : result) // the -u / --data-uri flag promises a data-URI, so prefix the base64 payload accordingly + }) +} + +module.exports = monolith diff --git a/functions.js b/functions.js new file mode 100644 index 0000000..9141870 --- /dev/null +++ b/functions.js @@ -0,0 +1,67 @@ +var fs = require('fs') +var path = require('path') +var url = require('url') +var request = require('sync-request') + +var options = require('./options.js'); + +var cache = {} + +// Note: http://site.com/image/icons/home.png -> http://site.com/image/icons +function absoluteURLPath (aURL) { + // Resolving '.' against aURL yields its directory (file name, query and hash dropped) + // with a trailing slash; that slash is stripped to match the note above + return url.resolve(aURL, '.').replace(/\/$/, '') +} + +var reURL = /^https?:\/\//i // TODO file:/// + +function isURL (aPath) { return reURL.test(aPath) } +function base64 (aInput) { return Buffer.from(aInput).toString('base64') } // Buffer.from replaces the deprecated new Buffer() + +function resolve (aFrom, aTo) { + if (isURL(aFrom)) { + var URL = url.parse(aFrom) + + if (aTo[0] === '/') { // (http://site.com/article/1, /css/main.css) + if (aTo[1] === '/') { // (http://site.com/article/1, //images/1.png) + return URL.protocol + aTo + } else { + return url.resolve(URL.protocol + '//' + URL.host, aTo) + } + } else if (isURL(aTo)) { // (http://site.com, http://site.com/css/main.css) + return aTo + } else { // (http://site.com, css/main.css) + return url.resolve(aFrom, aTo) + } + } else { + return isURL(aTo) ? aTo : path.resolve(aFrom, aTo) // (/home/me/site, http://site.com/css/main.css) stays remote + } +} + +function retrieveFile (aAbsBasePath, aFilePath, aBinary) { + var fullFilePath = resolve(aAbsBasePath, aFilePath) + var format = aBinary ? 
'base64' : 'utf8' + var cacheKey = fullFilePath + '@' + format + + if (isURL(fullFilePath)) { + if (cacheKey in cache) { + return cache[cacheKey] + } else { + // Announce the download before the synchronous request starts + if (!options.suppressVerboseOutput) + console.warn('Retrieving file', fullFilePath, '...') + try { + return cache[cacheKey] = request('GET', fullFilePath).getBody(format) + } catch (httpError) { + // Best effort: say so (unless quiet) and embed an empty asset instead of aborting + if (!options.suppressVerboseOutput) console.warn('Failed to retrieve', fullFilePath) + return '' + } + } + } else { + return fs.readFileSync(fullFilePath, format) + } +} + +module.exports = { absoluteURLPath, isURL, base64, resolve, retrieveFile } diff --git a/modules/anchors.js b/modules/anchors.js new file mode 100644 index 0000000..7c4278c --- /dev/null +++ b/modules/anchors.js @@ -0,0 +1,21 @@ +'use strict'; + +var functions = require('../functions.js') + +module.exports = { + + parser: function (window, absBasePath) { + var anchors = window.document.getElementsByTagName('a') + + for (var i = 0, ilen = anchors.length; i < ilen; i++) { + var anchor = anchors[i] + var href = (anchor.getAttribute('href') || '').trim() + + // Empty and in-page (#fragment) links stay as they are so they keep working inside the saved file + if (href && href[0] !== '#') { + anchor.setAttribute('href', functions.resolve(absBasePath, href)) + } + } + } + +} diff --git a/modules/css.js b/modules/css.js new file mode 100644 index 0000000..04c2175 --- /dev/null +++ b/modules/css.js @@ -0,0 +1,20 @@ +'use strict'; + +var functions = require('../functions.js') +var retrieveFile = functions.retrieveFile + +module.exports = { + + parser: function (window, absBasePath) { + var links = window.document.head.getElementsByTagName('link') + + for (var i = 0, ilen = links.length; i < ilen; i++) { + if (links[i].getAttribute('rel') === 'stylesheet' && links[i].getAttribute('href')) { // a stylesheet link without href has nothing to embed + var data = retrieveFile(absBasePath, links[i].getAttribute('href').trim(), true) + + links[i].setAttribute('href', "data:text/css;base64," + data) + } + } + } + +} diff --git a/modules/favicon.js b/modules/favicon.js new file mode 100644 index 0000000..f377994 --- /dev/null +++ b/modules/favicon.js @@ -0,0 +1,24 @@ +'use strict' + +var 
functions = require('../functions.js') +var retrieveFile = functions.retrieveFile +var mime = require('./img.js').mime + +//var reIcon = /^([a-z]+\s)?icon(\s[a-z]+)?$/i +var reIcon = /icon/i + +module.exports = { + + parser: function (window, absBasePath) { + var links = window.document.head.getElementsByTagName('link') + + for (var i = 0, ilen = links.length; i < ilen; i++) { + if (reIcon.test(links[i].getAttribute('rel')) && links[i].getAttribute('href')) { // an icon link without href has nothing to embed + var data = retrieveFile(absBasePath, links[i].getAttribute('href').trim(), true) + + links[i].setAttribute('href', "data:" + mime(data) + ";base64," + data) // the ';' separates the media type from the base64 marker + } + } + } + +} diff --git a/modules/img.js b/modules/img.js new file mode 100644 index 0000000..ec6bea9 --- /dev/null +++ b/modules/img.js @@ -0,0 +1,57 @@ +'use strict'; + +var functions = require('../functions.js') +var retrieveFile = functions.retrieveFile + +function mime (data) { + var mime = 'image/jpeg' + + if (~data.indexOf('iVBORw0K')) + mime = 'image/png' + else if (~data.indexOf('R0lGODlh')) + mime = 'image/gif' + else if (~data.indexOf(', + var imgs = window.document.getElementsByTagName('img') + + for (var i = 0, ilen = imgs.length; i < ilen; i++) { + var img = imgs[i] + + if (img.getAttribute('src')) { + var data = retrieveFile(absBasePath, img.getAttribute('src').trim(), true) + + img.setAttribute('src', "data:" + mime(data) + ";base64," + data) + } + } + + // + var pictures = window.document.getElementsByTagName('picture') + + for (var i = 0, ilen = pictures.length; i < ilen; i++) { + var picture = pictures[i] + var sources = picture.getElementsByTagName('source') + + for (var s = 0, slen = sources.length; s < slen; s++) { + var source = sources[s] + + if (source.getAttribute('srcset')) { + var data = retrieveFile(absBasePath, source.getAttribute('srcset').trim(), true) + var type = source.getAttribute('type') + + source.setAttribute('srcset', "data:" + (type || mime(data)) + ";base64," + data) + } + } + } + } + +} diff --git a/modules/js.js b/modules/js.js new file 
mode 100644 index 0000000..7a4c63f --- /dev/null +++ b/modules/js.js @@ -0,0 +1,27 @@ +'use strict'; + +var functions = require('../functions.js') +var retrieveFile = functions.retrieveFile + +var dataURI = true // true: embed each external script as a base64 data-URI in its src attribute; false: inline the script source as the element's content instead + +module.exports = { + + parser: function (window, absBasePath) { // Embeds every script element that carries a src attribute; absBasePath is the base the src is resolved against + var scripts = window.document.getElementsByTagName('script') + + for (var i = 0, ilen = scripts.length; i < ilen; i++) { + if (scripts[i].getAttribute('src')) { // scripts without src are left untouched + var data = retrieveFile(absBasePath, scripts[i].getAttribute('src').trim(), dataURI) // third argument selects base64 (true) or utf8 (false) content + + if (dataURI) { + scripts[i].setAttribute('src', "data:text/javascript;base64," + data) + } else { + scripts[i].removeAttribute('src') + scripts[i].innerHTML = data + } + } + } + } + +} diff --git a/options.js b/options.js new file mode 100644 index 0000000..a10c9da --- /dev/null +++ b/options.js @@ -0,0 +1,6 @@ +var options = { + outputFinalResultAsBase64: false, // switched on by the -u / --data-uri command-line flag + suppressVerboseOutput: false // switched on by the -q / --quiet command-line flag +} + +module.exports = options diff --git a/package.json b/package.json new file mode 100644 index 0000000..572e2f6 --- /dev/null +++ b/package.json @@ -0,0 +1,24 @@ +{ + "name": "monolith", + "version": "0.4.5", + "description": "Save HTML pages with ease", + "main": "index.js", + "dependencies": { + "sync-request": "^3.0.1", + "jsdom": "^9.9.1" + }, + "bin": { + "monolith": "bin/index.js" + }, + "scripts": { + "test": "bin/index.js https://github.com > github.html" + }, + "keywords": [ + "html5", + "monolith", + "one-for-all", + "all-for-one" + ], + "author": "Y2Z", + "license": "GPL-3.0" +}