Magic operation now detects UTF8 and gives a probability score for each language

This commit is contained in:
n1474335 2018-02-10 15:10:53 +00:00
parent 865ee6a720
commit 6624f25a64
5 changed files with 131 additions and 24 deletions

View File

@ -1,3 +1,3 @@
src/core/lib/**
!src/core/lib/Magic.js
src/core/config/MetaConfig.js
src/core/config/MetaConfig.js

13
package-lock.json generated
View File

@ -1404,6 +1404,14 @@
"supports-color": "2.0.0"
}
},
"chi-squared": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/chi-squared/-/chi-squared-1.1.0.tgz",
"integrity": "sha1-iShlz/qOCnIPkhv8nGNcGawqNG0=",
"requires": {
"gamma": "1.0.0"
}
},
"chokidar": {
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz",
@ -4255,6 +4263,11 @@
"integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=",
"dev": true
},
"gamma": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/gamma/-/gamma-1.0.0.tgz",
"integrity": "sha1-mDwck5/iPZMnAVhXEeHZpDDLdMs="
},
"get-caller-file": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz",

View File

@ -72,6 +72,7 @@
"bootstrap": "^3.3.7",
"bootstrap-colorpicker": "^2.5.2",
"bootstrap-switch": "^3.3.4",
"chi-squared": "^1.1.0",
"crypto-api": "^0.7.5",
"crypto-js": "^3.1.9-1",
"diff": "^3.4.0",

View File

@ -278,8 +278,7 @@ const FlowControl = {
<tr>
<th>Recipe (click to load)</th>
<th>Data snippet</th>
<th>Most likely language\n(lower scores are better)</th>
<th>File type</th>
<th>Properties</th>
</tr>`;
options.forEach(option => {
@ -290,20 +289,25 @@ const FlowControl = {
.concat(currentRecipeConfig.slice(state.progress + 1)),
recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig));
const language = option.languageScores[0];
let fileType = "Unknown";
const bestLanguage = option.languageScores[0];
let language = "Unknown",
fileType = "Unknown";
if (bestLanguage.probability > 0.00005) {
language = Magic.codeToLanguage(bestLanguage.lang) + " " +
(bestLanguage.probability * 100).toFixed(2) + "%";
}
if (option.fileType) {
fileType = `Extension: ${option.fileType.ext}\nMime type: ${option.fileType.mime}`;
if (option.fileType.desc)
fileType += `\nDescription: ${option.fileType.desc}`;
fileType = `${option.fileType.mime} (${option.fileType.ext})`;
}
output += `<tr>
<td><a href="#${recipeURL}">${Utils.generatePrettyRecipe(option.recipe, true)}</a></td>
<td>${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))}</td>
<td>${Magic.codeToLanguage(language.lang)}\nScore: ${language.chiSqr.toFixed()}</td>
<td>${fileType}</td>
<td>Language: ${language}
File type: ${fileType}
Valid UTF8: ${option.isUTF8}</td>
</tr>`;
});

View File

@ -3,6 +3,7 @@ import Utils from "../Utils.js";
import Recipe from "../Recipe.js";
import Dish from "../Dish.js";
import FileType from "../operations/FileType.js";
import chiSquared from "chi-squared";
/**
@ -19,11 +20,12 @@ class Magic {
* Magic constructor.
*
* @param {ArrayBuffer} buf
* @param {Object[]} [opPatterns]
*/
constructor(buf) {
constructor(buf, opPatterns) {
this.inputBuffer = new Uint8Array(buf);
this.inputStr = Utils.arrayBufferToStr(buf);
this.opPatterns = Magic._generateOpPatterns();
this.opPatterns = opPatterns || Magic._generateOpPatterns();
}
/**
@ -58,15 +60,17 @@ class Magic {
let chiSqrs = [];
for (let lang in LANG_FREQS) {
let [score, prob] = Magic._chiSqr(inputFreq, LANG_FREQS[lang]);
chiSqrs.push({
lang: lang,
chiSqr: Magic._chiSqr(inputFreq, LANG_FREQS[lang])
score: score,
probability: prob
});
}
// Sort results so that the most likely match is at the top
chiSqrs.sort((a, b) => {
return a.chiSqr - b.chiSqr;
return a.score - b.score;
});
return chiSqrs;
@ -84,6 +88,81 @@ class Magic {
return FileType.magicType(this.inputBuffer);
}
/**
* Detects whether the input buffer is valid UTF8.
*
* @returns {boolean}
*/
isUTF8() {
const bytes = new Uint8Array(this.inputBuffer);
let i = 0;
while (i < bytes.length) {
if (( // ASCII
bytes[i] === 0x09 ||
bytes[i] === 0x0A ||
bytes[i] === 0x0D ||
(0x20 <= bytes[i] && bytes[i] <= 0x7E)
)) {
i += 1;
continue;
}
if (( // non-overlong 2-byte
(0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
(0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF)
)) {
i += 2;
continue;
}
if (( // excluding overlongs
bytes[i] === 0xE0 &&
(0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
) ||
( // straight 3-byte
((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
bytes[i] === 0xEE ||
bytes[i] === 0xEF) &&
(0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
) ||
( // excluding surrogates
bytes[i] === 0xED &&
(0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
)) {
i += 3;
continue;
}
if (( // planes 1-3
bytes[i] === 0xF0 &&
(0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
) ||
( // planes 4-15
(0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
) ||
( // plane 16
bytes[i] === 0xF4 &&
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
)) {
i += 4;
continue;
}
return false;
}
return true;
}
/**
* Speculatively executes matching operations, recording metadata of each result.
@ -103,6 +182,7 @@ class Magic {
data: this.inputStr.slice(0, 100),
languageScores: this.detectLanguage(),
fileType: this.detectFileType(),
isUTF8: this.isUTF8()
});
// Find any operations that can be run on this data
@ -122,7 +202,7 @@ class Magic {
const recipe = new Recipe([opConfig]);
await recipe.execute(dish, 0);
const magic = new Magic(dish.get(Dish.ARRAY_BUFFER)),
const magic = new Magic(dish.get(Dish.ARRAY_BUFFER), this.opPatterns),
speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]);
results = results.concat(speculativeResults);
@ -131,13 +211,17 @@ class Magic {
// Return a sorted list of possible recipes along with their properties
return results.sort((a, b) => {
// Each option is sorted based on its most likely language (lower is better)
let aScore = a.languageScores[0].chiSqr,
bScore = b.languageScores[0].chiSqr;
let aScore = a.languageScores[0].score,
bScore = b.languageScores[0].score;
// If a recipe results in a file being detected, it receives a relatively good score
if (a.fileType) aScore = 500;
if (b.fileType) bScore = 500;
// If the result is valid UTF8, its score gets boosted (lower being better)
if (a.isUTF8) aScore -= 100;
if (b.isUTF8) bScore -= 100;
return aScore - bScore;
});
}
@ -194,19 +278,24 @@ class Magic {
* https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
*
* @private
* @param {number[]} observed
* @param {number[]} expected
* @returns {number}
* @param {number[]} observed
* @param {number[]} expected
* @param {number} ddof - Delta degrees of freedom
* @returns {number[]} - The score and the probability
*/
static _chiSqr(observed, expected) {
static _chiSqr(observed, expected, ddof=0) {
let tmp,
res = 0;
score = 0;
for (let i = 0; i < observed.length; i++) {
tmp = observed[i] - expected[i];
res += tmp * tmp / expected[i];
score += tmp * tmp / expected[i];
}
return res;
return [
score,
1 - chiSquared.cdf(score, observed.length - 1 - ddof)
];
}
/**