Magic operation now detects UTF8 and gives a probability score for each language

This commit is contained in:
n1474335 2018-02-10 15:10:53 +00:00
parent 865ee6a720
commit 6624f25a64
5 changed files with 131 additions and 24 deletions

View File

@ -1,3 +1,3 @@
src/core/lib/** src/core/lib/**
!src/core/lib/Magic.js !src/core/lib/Magic.js
src/core/config/MetaConfig.js src/core/config/MetaConfig.js

13
package-lock.json generated
View File

@ -1404,6 +1404,14 @@
"supports-color": "2.0.0" "supports-color": "2.0.0"
} }
}, },
"chi-squared": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/chi-squared/-/chi-squared-1.1.0.tgz",
"integrity": "sha1-iShlz/qOCnIPkhv8nGNcGawqNG0=",
"requires": {
"gamma": "1.0.0"
}
},
"chokidar": { "chokidar": {
"version": "1.7.0", "version": "1.7.0",
"resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz", "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz",
@ -4255,6 +4263,11 @@
"integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=", "integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=",
"dev": true "dev": true
}, },
"gamma": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/gamma/-/gamma-1.0.0.tgz",
"integrity": "sha1-mDwck5/iPZMnAVhXEeHZpDDLdMs="
},
"get-caller-file": { "get-caller-file": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz", "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz",

View File

@ -72,6 +72,7 @@
"bootstrap": "^3.3.7", "bootstrap": "^3.3.7",
"bootstrap-colorpicker": "^2.5.2", "bootstrap-colorpicker": "^2.5.2",
"bootstrap-switch": "^3.3.4", "bootstrap-switch": "^3.3.4",
"chi-squared": "^1.1.0",
"crypto-api": "^0.7.5", "crypto-api": "^0.7.5",
"crypto-js": "^3.1.9-1", "crypto-js": "^3.1.9-1",
"diff": "^3.4.0", "diff": "^3.4.0",

View File

@ -278,8 +278,7 @@ const FlowControl = {
<tr> <tr>
<th>Recipe (click to load)</th> <th>Recipe (click to load)</th>
<th>Data snippet</th> <th>Data snippet</th>
<th>Most likely language\n(lower scores are better)</th> <th>Properties</th>
<th>File type</th>
</tr>`; </tr>`;
options.forEach(option => { options.forEach(option => {
@ -290,20 +289,25 @@ const FlowControl = {
.concat(currentRecipeConfig.slice(state.progress + 1)), .concat(currentRecipeConfig.slice(state.progress + 1)),
recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig)); recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig));
const language = option.languageScores[0]; const bestLanguage = option.languageScores[0];
let fileType = "Unknown"; let language = "Unknown",
fileType = "Unknown";
if (bestLanguage.probability > 0.00005) {
language = Magic.codeToLanguage(bestLanguage.lang) + " " +
(bestLanguage.probability * 100).toFixed(2) + "%";
}
if (option.fileType) { if (option.fileType) {
fileType = `Extension: ${option.fileType.ext}\nMime type: ${option.fileType.mime}`; fileType = `${option.fileType.mime} (${option.fileType.ext})`;
if (option.fileType.desc)
fileType += `\nDescription: ${option.fileType.desc}`;
} }
output += `<tr> output += `<tr>
<td><a href="#${recipeURL}">${Utils.generatePrettyRecipe(option.recipe, true)}</a></td> <td><a href="#${recipeURL}">${Utils.generatePrettyRecipe(option.recipe, true)}</a></td>
<td>${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))}</td> <td>${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))}</td>
<td>${Magic.codeToLanguage(language.lang)}\nScore: ${language.chiSqr.toFixed()}</td> <td>Language: ${language}
<td>${fileType}</td> File type: ${fileType}
Valid UTF8: ${option.isUTF8}</td>
</tr>`; </tr>`;
}); });

View File

@ -3,6 +3,7 @@ import Utils from "../Utils.js";
import Recipe from "../Recipe.js"; import Recipe from "../Recipe.js";
import Dish from "../Dish.js"; import Dish from "../Dish.js";
import FileType from "../operations/FileType.js"; import FileType from "../operations/FileType.js";
import chiSquared from "chi-squared";
/** /**
@ -19,11 +20,12 @@ class Magic {
* Magic constructor. * Magic constructor.
* *
* @param {ArrayBuffer} buf * @param {ArrayBuffer} buf
* @param {Object[]} [opPatterns]
*/ */
constructor(buf) { constructor(buf, opPatterns) {
this.inputBuffer = new Uint8Array(buf); this.inputBuffer = new Uint8Array(buf);
this.inputStr = Utils.arrayBufferToStr(buf); this.inputStr = Utils.arrayBufferToStr(buf);
this.opPatterns = Magic._generateOpPatterns(); this.opPatterns = opPatterns || Magic._generateOpPatterns();
} }
/** /**
@ -58,15 +60,17 @@ class Magic {
let chiSqrs = []; let chiSqrs = [];
for (let lang in LANG_FREQS) { for (let lang in LANG_FREQS) {
let [score, prob] = Magic._chiSqr(inputFreq, LANG_FREQS[lang]);
chiSqrs.push({ chiSqrs.push({
lang: lang, lang: lang,
chiSqr: Magic._chiSqr(inputFreq, LANG_FREQS[lang]) score: score,
probability: prob
}); });
} }
// Sort results so that the most likely match is at the top // Sort results so that the most likely match is at the top
chiSqrs.sort((a, b) => { chiSqrs.sort((a, b) => {
return a.chiSqr - b.chiSqr; return a.score - b.score;
}); });
return chiSqrs; return chiSqrs;
@ -84,6 +88,81 @@ class Magic {
return FileType.magicType(this.inputBuffer); return FileType.magicType(this.inputBuffer);
} }
/**
* Detects whether the input buffer is valid UTF8.
*
* @returns {boolean}
*/
isUTF8() {
const bytes = new Uint8Array(this.inputBuffer);
let i = 0;
while (i < bytes.length) {
if (( // ASCII
bytes[i] === 0x09 ||
bytes[i] === 0x0A ||
bytes[i] === 0x0D ||
(0x20 <= bytes[i] && bytes[i] <= 0x7E)
)) {
i += 1;
continue;
}
if (( // non-overlong 2-byte
(0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
(0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF)
)) {
i += 2;
continue;
}
if (( // excluding overlongs
bytes[i] === 0xE0 &&
(0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
) ||
( // straight 3-byte
((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
bytes[i] === 0xEE ||
bytes[i] === 0xEF) &&
(0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
) ||
( // excluding surrogates
bytes[i] === 0xED &&
(0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
)) {
i += 3;
continue;
}
if (( // planes 1-3
bytes[i] === 0xF0 &&
(0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
) ||
( // planes 4-15
(0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
) ||
( // plane 16
bytes[i] === 0xF4 &&
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
)) {
i += 4;
continue;
}
return false;
}
return true;
}
/** /**
* Speculatively executes matching operations, recording metadata of each result. * Speculatively executes matching operations, recording metadata of each result.
@ -103,6 +182,7 @@ class Magic {
data: this.inputStr.slice(0, 100), data: this.inputStr.slice(0, 100),
languageScores: this.detectLanguage(), languageScores: this.detectLanguage(),
fileType: this.detectFileType(), fileType: this.detectFileType(),
isUTF8: this.isUTF8()
}); });
// Find any operations that can be run on this data // Find any operations that can be run on this data
@ -122,7 +202,7 @@ class Magic {
const recipe = new Recipe([opConfig]); const recipe = new Recipe([opConfig]);
await recipe.execute(dish, 0); await recipe.execute(dish, 0);
const magic = new Magic(dish.get(Dish.ARRAY_BUFFER)), const magic = new Magic(dish.get(Dish.ARRAY_BUFFER), this.opPatterns),
speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]); speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]);
results = results.concat(speculativeResults); results = results.concat(speculativeResults);
@ -131,13 +211,17 @@ class Magic {
// Return a sorted list of possible recipes along with their properties // Return a sorted list of possible recipes along with their properties
return results.sort((a, b) => { return results.sort((a, b) => {
// Each option is sorted based on its most likely language (lower is better) // Each option is sorted based on its most likely language (lower is better)
let aScore = a.languageScores[0].chiSqr, let aScore = a.languageScores[0].score,
bScore = b.languageScores[0].chiSqr; bScore = b.languageScores[0].score;
// If a recipe results in a file being detected, it receives a relatively good score // If a recipe results in a file being detected, it receives a relatively good score
if (a.fileType) aScore = 500; if (a.fileType) aScore = 500;
if (b.fileType) bScore = 500; if (b.fileType) bScore = 500;
// If the result is valid UTF8, its score gets boosted (lower being better)
if (a.isUTF8) aScore -= 100;
if (b.isUTF8) bScore -= 100;
return aScore - bScore; return aScore - bScore;
}); });
} }
@ -194,19 +278,24 @@ class Magic {
* https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test * https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
* *
* @private * @private
* @param {number[]} observed * @param {number[]} observed
* @param {number[]} expected * @param {number[]} expected
* @returns {number} * @param {number} ddof - Delta degrees of freedom
* @returns {number[]} - The score and the probability
*/ */
static _chiSqr(observed, expected) { static _chiSqr(observed, expected, ddof=0) {
let tmp, let tmp,
res = 0; score = 0;
for (let i = 0; i < observed.length; i++) { for (let i = 0; i < observed.length; i++) {
tmp = observed[i] - expected[i]; tmp = observed[i] - expected[i];
res += tmp * tmp / expected[i]; score += tmp * tmp / expected[i];
} }
return res;
return [
score,
1 - chiSquared.cdf(score, observed.length - 1 - ddof)
];
} }
/** /**