Add operation to normalise unicode

This commit is contained in:
Matthieu 2019-11-25 22:53:31 +01:00
parent 610d46a1a4
commit a6fa0628f2
7 changed files with 134 additions and 0 deletions

5
package-lock.json generated
View File

@ -14403,6 +14403,11 @@
"normalize-path": "^2.1.1"
}
},
"unorm": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/unorm/-/unorm-1.6.0.tgz",
"integrity": "sha512-b2/KCUlYZUeA7JFUuRJZPUtr4gZvBh7tavtv4fvk4+KV9pfGiR6CQAQAWl49ZpR3ts2dk4FYkP7EIgDJoiOLDA=="
},
"unpipe": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",

View File

@ -147,6 +147,7 @@
"ssdeep.js": "0.0.2",
"tesseract.js": "^2.0.0-alpha.15",
"ua-parser-js": "^0.7.20",
"unorm": "^1.6.0",
"utf8": "^3.0.0",
"vkbeautify": "^0.99.3",
"xmldom": "^0.1.27",

View File

@ -39,6 +39,7 @@
"URL Decode",
"Escape Unicode Characters",
"Unescape Unicode Characters",
"Normalise Unicode",
"To Quoted Printable",
"From Quoted Printable",
"To Punycode",

View File

@ -164,3 +164,15 @@ export const IO_FORMAT = {
"Simplified Chinese GB18030 (54936)": 54936,
};
/**
* Unicode Normalisation Forms
*
* @author Matthieu [m@tthieu.xyz]
* @copyright Crown Copyright 2016
* @license Apache-2.0
*/
/**
 * Names of the supported Unicode normalisation forms.
 */
export const UNICODE_NORMALISATION_FORMS = [
    "NFD",
    "NFC",
    "NFKD",
    "NFKC",
];

View File

@ -0,0 +1,60 @@
/**
* @author Matthieu [m@tthieu.xyz]
* @copyright Crown Copyright 2019
* @license Apache-2.0
*/
import Operation from "../Operation.mjs";
import OperationError from "../errors/OperationError.mjs";
import unorm from "unorm";
// Relative ES-module specifiers need the explicit file extension, as on the lines above.
import {UNICODE_NORMALISATION_FORMS} from "../lib/ChrEnc.mjs";
/**
 * Normalise Unicode operation
 *
 * Converts the input string to one of the four Unicode normalisation forms
 * (NFD, NFC, NFKD, NFKC) using the `unorm` library.
 */
class NormaliseUnicode extends Operation {

    /**
     * NormaliseUnicode constructor
     */
    constructor() {
        super();

        this.name = "Normalise Unicode";
        this.module = "UnicodeNormalisation";
        this.description = "Transform Unicode to one of the Normalisation Form";
        this.infoURL = "http://www.unicode.org/reports/tr15/";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                name: "Normal Form",
                type: "option",
                value: UNICODE_NORMALISATION_FORMS
            }
        ];
    }

    /**
     * Normalises the input to the requested normalisation form.
     *
     * @param {string} input - text to normalise
     * @param {Object[]} args - first element is the normalisation form name
     * @returns {string} the normalised text
     * @throws {OperationError} if the requested form is not one of NFD, NFC, NFKD or NFKC
     */
    run(input, args) {
        const [normalForm] = args;

        switch (normalForm) {
            case "NFD":
                return unorm.nfd(input);
            case "NFC":
                return unorm.nfc(input);
            case "NFKD":
                return unorm.nfkd(input);
            case "NFKC":
                // Compatibility decomposition followed by canonical composition;
                // this must not fall back to plain NFC.
                return unorm.nfkc(input);
            default:
                throw new OperationError("Unknown Normalisation Form");
        }
    }

}
export default NormaliseUnicode;

View File

@ -57,6 +57,7 @@ import "./tests/MS.mjs";
import "./tests/Magic.mjs";
import "./tests/MorseCode.mjs";
import "./tests/NetBIOS.mjs";
import "./tests/NormaliseUnicode.mjs";
import "./tests/OTP.mjs";
import "./tests/PGP.mjs";
import "./tests/PHP.mjs";

View File

@ -0,0 +1,54 @@
/**
* Normalise Unicode tests.
*
* @author Matthieu [m@tthieux.xyz]
*
* @copyright Crown Copyright 2018
* @license Apache-2.0
*/
import TestRegister from "../../lib/TestRegister.mjs";
// Every case uses the same input:
//   U+00C7         precomposed "C with cedilla"
//   U+0043 U+0327  "C" followed by a combining cedilla
//   U+2160         Roman numeral one (compatibility equivalent of "I")
TestRegister.addTests([
    {
        name: "Normalise Unicode - NFD",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /C\u0327C\u0327\u2160/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFD"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFC",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /\u00C7\u00C7\u2160/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFC"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFKD",
        input: "\u00c7\u0043\u0327\u2160",
        expectedMatch: /C\u0327C\u0327I/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFKD"],
            },
        ],
    }, {
        name: "Normalise Unicode - NFKC",
        input: "\u00c7\u0043\u0327\u2160",
        // NFKC applies the compatibility mapping, so U+2160 becomes "I";
        // expecting U+2160 here would only check NFC behaviour.
        expectedMatch: /\u00C7\u00C7I/,
        recipeConfig: [
            {
                op: "Normalise Unicode",
                args: ["NFKC"],
            },
        ],
    },
]);