diff --git a/.eslintignore b/.eslintignore index 36c33a59..1e9dfc58 100644 --- a/.eslintignore +++ b/.eslintignore @@ -1,3 +1,3 @@ src/core/lib/** !src/core/lib/Magic.js -src/core/config/MetaConfig.js \ No newline at end of file +src/core/config/MetaConfig.js diff --git a/package-lock.json b/package-lock.json index 4bcf071c..a008ca2b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1404,6 +1404,14 @@ "supports-color": "2.0.0" } }, + "chi-squared": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/chi-squared/-/chi-squared-1.1.0.tgz", + "integrity": "sha1-iShlz/qOCnIPkhv8nGNcGawqNG0=", + "requires": { + "gamma": "1.0.0" + } + }, "chokidar": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz", @@ -4255,6 +4263,11 @@ "integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=", "dev": true }, + "gamma": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/gamma/-/gamma-1.0.0.tgz", + "integrity": "sha1-mDwck5/iPZMnAVhXEeHZpDDLdMs=" + }, "get-caller-file": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz", diff --git a/package.json b/package.json index 46450b06..b71ff92d 100644 --- a/package.json +++ b/package.json @@ -72,6 +72,7 @@ "bootstrap": "^3.3.7", "bootstrap-colorpicker": "^2.5.2", "bootstrap-switch": "^3.3.4", + "chi-squared": "^1.1.0", "crypto-api": "^0.7.5", "crypto-js": "^3.1.9-1", "diff": "^3.4.0", diff --git a/src/core/FlowControl.js b/src/core/FlowControl.js index b9eff7f0..059cdebc 100755 --- a/src/core/FlowControl.js +++ b/src/core/FlowControl.js @@ -278,8 +278,7 @@ const FlowControl = { Recipe (click to load) Data snippet - Most likely language\n(lower scores are better) - File type + Properties `; options.forEach(option => { @@ -290,20 +289,25 @@ const FlowControl = { .concat(currentRecipeConfig.slice(state.progress + 1)), recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig)); - const language = option.languageScores[0]; - let fileType = "Unknown"; + const bestLanguage = option.languageScores[0]; + let language = "Unknown", + fileType = "Unknown"; + + if (bestLanguage.probability > 0.00005) { + language = Magic.codeToLanguage(bestLanguage.lang) + " " + + (bestLanguage.probability * 100).toFixed(2) + "%"; + } if (option.fileType) { - fileType = `Extension: ${option.fileType.ext}\nMime type: ${option.fileType.mime}`; - if (option.fileType.desc) - fileType += `\nDescription: ${option.fileType.desc}`; + fileType = `${option.fileType.mime} (${option.fileType.ext})`; } output += ` ${Utils.generatePrettyRecipe(option.recipe, true)} ${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))} - ${Magic.codeToLanguage(language.lang)}\nScore: ${language.chiSqr.toFixed()} - ${fileType} + Language: ${language} +File type: ${fileType} +Valid UTF8: ${option.isUTF8} `; }); diff --git a/src/core/lib/Magic.js b/src/core/lib/Magic.js index 2e29cd0b..b93ad2ec 100644 --- a/src/core/lib/Magic.js +++ b/src/core/lib/Magic.js @@ -3,6 +3,7 @@ import Utils from "../Utils.js"; import Recipe from "../Recipe.js"; import Dish from "../Dish.js"; import FileType from "../operations/FileType.js"; +import chiSquared from "chi-squared"; /** @@ -19,11 +20,12 @@ class Magic { * Magic constructor. * * @param {ArrayBuffer} buf + * @param {Object[]} [opPatterns] */ - constructor(buf) { + constructor(buf, opPatterns) { this.inputBuffer = new Uint8Array(buf); this.inputStr = Utils.arrayBufferToStr(buf); - this.opPatterns = Magic._generateOpPatterns(); + this.opPatterns = opPatterns || Magic._generateOpPatterns(); } /** @@ -58,15 +60,17 @@ class Magic { let chiSqrs = []; for (let lang in LANG_FREQS) { + let [score, prob] = Magic._chiSqr(inputFreq, LANG_FREQS[lang]); chiSqrs.push({ lang: lang, - chiSqr: Magic._chiSqr(inputFreq, LANG_FREQS[lang]) + score: score, + probability: prob }); } // Sort results so that the most likely match is at the top chiSqrs.sort((a, b) => { - return a.chiSqr - b.chiSqr; + return a.score - b.score; }); return chiSqrs; @@ -84,6 +88,81 @@ class Magic { return FileType.magicType(this.inputBuffer); } + /** + * Detects whether the input buffer is valid UTF8. + * + * @returns {boolean} + */ + isUTF8() { + const bytes = new Uint8Array(this.inputBuffer); + let i = 0; + while (i < bytes.length) { + if (( // ASCII + bytes[i] === 0x09 || + bytes[i] === 0x0A || + bytes[i] === 0x0D || + (0x20 <= bytes[i] && bytes[i] <= 0x7E) + )) { + i += 1; + continue; + } + + if (( // non-overlong 2-byte + (0xC2 <= bytes[i] && bytes[i] <= 0xDF) && + (0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF) + )) { + i += 2; + continue; + } + + if (( // excluding overlongs + bytes[i] === 0xE0 && + (0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) + ) || + ( // straight 3-byte + ((0xE1 <= bytes[i] && bytes[i] <= 0xEC) || + bytes[i] === 0xEE || + bytes[i] === 0xEF) && + (0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) && + (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF) + ) || + ( // excluding surrogates + bytes[i] === 0xED && + (0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) && + (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF) + )) { + i += 3; + continue; + } + + if (( // planes 1-3 + bytes[i] === 0xF0 && + (0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && + (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) + ) || + ( // planes 4-15 + (0xF1 <= bytes[i] && bytes[i] <= 0xF3) && + (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && + (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) + ) || + ( // plane 16 + bytes[i] === 0xF4 && + (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) && + (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) && + (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF) + )) { + i += 4; + continue; + } + + return false; + } + + return true; + } /** * Speculatively executes matching operations, recording metadata of each result. @@ -103,6 +182,7 @@ class Magic { data: this.inputStr.slice(0, 100), languageScores: this.detectLanguage(), fileType: this.detectFileType(), + isUTF8: this.isUTF8() }); // Find any operations that can be run on this data @@ -122,7 +202,7 @@ class Magic { const recipe = new Recipe([opConfig]); await recipe.execute(dish, 0); - const magic = new Magic(dish.get(Dish.ARRAY_BUFFER)), + const magic = new Magic(dish.get(Dish.ARRAY_BUFFER), this.opPatterns), speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]); results = results.concat(speculativeResults); @@ -131,13 +211,17 @@ class Magic { // Return a sorted list of possible recipes along with their properties return results.sort((a, b) => { // Each option is sorted based on its most likely language (lower is better) - let aScore = a.languageScores[0].chiSqr, - bScore = b.languageScores[0].chiSqr; + let aScore = a.languageScores[0].score, + bScore = b.languageScores[0].score; // If a recipe results in a file being detected, it receives a relatively good score if (a.fileType) aScore = 500; if (b.fileType) bScore = 500; + // If the result is valid UTF8, its score gets boosted (lower being better) + if (a.isUTF8) aScore -= 100; + if (b.isUTF8) bScore -= 100; + return aScore - bScore; }); } @@ -194,19 +278,24 @@ class Magic { * https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test * * @private - * @param {number[]} observed - * @param {number[]} expected - * @returns {number} + * @param {number[]} observed + * @param {number[]} expected + * @param {number} ddof - Delta degrees of freedom + * @returns {number[]} - The score and the probability */ - static _chiSqr(observed, expected) { + static _chiSqr(observed, expected, ddof=0) { let tmp, - res = 0; + score = 0; for (let i = 0; i < observed.length; i++) { tmp = observed[i] - expected[i]; - res += tmp * tmp / expected[i]; + score += tmp * tmp / expected[i]; } - return res; + + return [ + score, + 1 - chiSquared.cdf(score, observed.length - 1 - ddof) + ]; } /**