Add operation to normalise unicode

This commit is contained in:
Matthieu 2019-11-25 22:53:31 +01:00
parent 610d46a1a4
commit a6fa0628f2
7 changed files with 134 additions and 0 deletions

5
package-lock.json generated
View File

@ -14403,6 +14403,11 @@
"normalize-path": "^2.1.1" "normalize-path": "^2.1.1"
} }
}, },
"unorm": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/unorm/-/unorm-1.6.0.tgz",
"integrity": "sha512-b2/KCUlYZUeA7JFUuRJZPUtr4gZvBh7tavtv4fvk4+KV9pfGiR6CQAQAWl49ZpR3ts2dk4FYkP7EIgDJoiOLDA=="
},
"unpipe": { "unpipe": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",

View File

@ -147,6 +147,7 @@
"ssdeep.js": "0.0.2", "ssdeep.js": "0.0.2",
"tesseract.js": "^2.0.0-alpha.15", "tesseract.js": "^2.0.0-alpha.15",
"ua-parser-js": "^0.7.20", "ua-parser-js": "^0.7.20",
"unorm": "^1.6.0",
"utf8": "^3.0.0", "utf8": "^3.0.0",
"vkbeautify": "^0.99.3", "vkbeautify": "^0.99.3",
"xmldom": "^0.1.27", "xmldom": "^0.1.27",

View File

@ -39,6 +39,7 @@
"URL Decode", "URL Decode",
"Escape Unicode Characters", "Escape Unicode Characters",
"Unescape Unicode Characters", "Unescape Unicode Characters",
"Normalise Unicode",
"To Quoted Printable", "To Quoted Printable",
"From Quoted Printable", "From Quoted Printable",
"To Punycode", "To Punycode",

View File

@ -164,3 +164,15 @@ export const IO_FORMAT = {
"Simplified Chinese GB18030 (54936)": 54936, "Simplified Chinese GB18030 (54936)": 54936,
}; };
/**
* Unicode Normalisation Forms
*
* @author Matthieu [m@tthieu.xyz]
* @copyright Crown Copyright 2016
* @license Apache-2.0
*/
/**
* Identifiers of the Unicode Normalisation Forms: canonical decomposition (NFD),
* canonical composition (NFC), compatibility decomposition (NFKD) and
* compatibility composition (NFKC).
*/
export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];

View File

@ -0,0 +1,60 @@
/**
* @author Matthieu [m@tthieu.xyz]
* @copyright Crown Copyright 2019
* @license Apache-2.0
*/
import Operation from "../Operation.mjs";
import OperationError from "../errors/OperationError.mjs";
import unorm from "unorm";
import {UNICODE_NORMALISATION_FORMS} from "../lib/ChrEnc";
/**
 * Normalise Unicode operation
 *
 * Converts the input string to the Unicode Normalisation Form selected by
 * the user (NFD, NFC, NFKD or NFKC) using the unorm library.
 */
class NormaliseUnicode extends Operation {

    /**
     * NormaliseUnicode constructor
     */
    constructor() {
        super();

        this.name = "Normalise Unicode";
        this.module = "UnicodeNormalisation";
        this.description = "Transform Unicode to one of the Normalisation Form";
        this.infoURL = "http://www.unicode.org/reports/tr15/";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                name: "Normal Form",
                type: "option",
                value: UNICODE_NORMALISATION_FORMS
            }
        ];
    }

    /**
     * Normalises the input to the requested Unicode Normalisation Form.
     *
     * @param {string} input
     * @param {Object[]} args - args[0] is the normal form name
     * @returns {string}
     * @throws {OperationError} if the requested normal form is not one of
     *     NFD, NFC, NFKD or NFKC
     */
    run(input, args) {
        const [normalForm] = args;

        switch (normalForm) {
            case "NFD":
                return unorm.nfd(input);
            case "NFC":
                return unorm.nfc(input);
            case "NFKD":
                return unorm.nfkd(input);
            case "NFKC":
                // Must be the compatibility composition, not nfc(): NFKC also
                // folds compatibility characters (e.g. U+2160 -> "I").
                return unorm.nfkc(input);
            default:
                throw new OperationError("Unknown Normalisation Form");
        }
    }

}
export default NormaliseUnicode;

View File

@ -57,6 +57,7 @@ import "./tests/MS.mjs";
import "./tests/Magic.mjs"; import "./tests/Magic.mjs";
import "./tests/MorseCode.mjs"; import "./tests/MorseCode.mjs";
import "./tests/NetBIOS.mjs"; import "./tests/NetBIOS.mjs";
import "./tests/NormaliseUnicode.mjs";
import "./tests/OTP.mjs"; import "./tests/OTP.mjs";
import "./tests/PGP.mjs"; import "./tests/PGP.mjs";
import "./tests/PHP.mjs"; import "./tests/PHP.mjs";

View File

@ -0,0 +1,54 @@
/**
* Normalise Unicode tests.
*
* @author Matthieu [m@tthieu.xyz]
*
* @copyright Crown Copyright 2018
* @license Apache-2.0
*/
import TestRegister from "../../lib/TestRegister.mjs";
// Every test feeds the same three-part input:
//   \u00c7        - precomposed "C with cedilla"
//   \u0043\u0327  - "C" followed by a combining cedilla
//   \u2160        - ROMAN NUMERAL ONE, which has a compatibility mapping to "I"
// Canonical forms (NFD/NFC) keep \u2160 as-is; compatibility forms
// (NFKD/NFKC) replace it with "I".
TestRegister.addTests([
    {
        name: "Normalise Unicode - NFD",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /C\u0327C\u0327\u2160/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFD"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFC",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /\u00C7\u00C7\u2160/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFC"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFKD",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /C\u0327C\u0327I/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFKD"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFKC",
        input: "\u00c7\u0043\u0327\u2160",
        // NFKC composes the cedillas AND maps \u2160 to "I"; expecting \u2160
        // here would only pass if the operation wrongly performed NFC.
        expectedMatch: /\u00C7\u00C7I/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFKC"],
            },
        ],
    },
]);