diff --git a/package-lock.json b/package-lock.json
index 6c765b4e..15aa5451 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -14403,6 +14403,11 @@
         "normalize-path": "^2.1.1"
       }
     },
+    "unorm": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/unorm/-/unorm-1.6.0.tgz",
+      "integrity": "sha512-b2/KCUlYZUeA7JFUuRJZPUtr4gZvBh7tavtv4fvk4+KV9pfGiR6CQAQAWl49ZpR3ts2dk4FYkP7EIgDJoiOLDA=="
+    },
     "unpipe": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
diff --git a/package.json b/package.json
index 7bdea20b..61136169 100644
--- a/package.json
+++ b/package.json
@@ -147,6 +147,7 @@
     "ssdeep.js": "0.0.2",
     "tesseract.js": "^2.0.0-alpha.15",
     "ua-parser-js": "^0.7.20",
+    "unorm": "^1.6.0",
     "utf8": "^3.0.0",
     "vkbeautify": "^0.99.3",
     "xmldom": "^0.1.27",
diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index f663e16d..53ca796d 100755
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -39,6 +39,7 @@
             "URL Decode",
             "Escape Unicode Characters",
             "Unescape Unicode Characters",
+            "Normalise Unicode",
             "To Quoted Printable",
             "From Quoted Printable",
             "To Punycode",
diff --git a/src/core/lib/ChrEnc.mjs b/src/core/lib/ChrEnc.mjs
index a472706b..d2d0625f 100644
--- a/src/core/lib/ChrEnc.mjs
+++ b/src/core/lib/ChrEnc.mjs
@@ -164,3 +164,15 @@ export const IO_FORMAT = {
     "Simplified Chinese GB18030 (54936)": 54936,
 };
 
+/**
+ * Unicode Normalisation Forms
+ *
+ * @author Matthieu [m@tthieu.xyz]
+ * @copyright Crown Copyright 2016
+ * @license Apache-2.0
+ */
+
+/**
+ * Unicode Normalisation Forms offered as options by the Normalise Unicode operation.
+ */
+export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];
diff --git a/src/core/operations/NormaliseUnicode.mjs b/src/core/operations/NormaliseUnicode.mjs
new file mode 100644
index 00000000..731a493f
--- /dev/null
+++ b/src/core/operations/NormaliseUnicode.mjs
@@ -0,0 +1,65 @@
+/**
+ * @author Matthieu [m@tthieu.xyz]
+ * @copyright Crown Copyright 2019
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import OperationError from "../errors/OperationError.mjs";
+import unorm from "unorm";
+import {UNICODE_NORMALISATION_FORMS} from "../lib/ChrEnc.mjs";
+
+/**
+ * Normalise Unicode operation
+ */
+class NormaliseUnicode extends Operation {
+
+    /**
+     * NormaliseUnicode constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "Normalise Unicode";
+        this.module = "UnicodeNormalisation";
+        this.description = "Transform Unicode to one of the Normalisation Form";
+        this.infoURL = "http://www.unicode.org/reports/tr15/";
+        this.inputType = "string";
+        this.outputType = "string";
+        this.args = [
+            {
+                name: "Normal Form",
+                type: "option",
+                value: UNICODE_NORMALISATION_FORMS
+            }
+        ];
+    }
+
+    /**
+     * Normalises the input to the Unicode Normalisation Form selected in args.
+     *
+     * @param {string} input
+     * @param {Object[]} args - args[0] is one of "NFD", "NFC", "NFKD" or "NFKC"
+     * @returns {string}
+     * @throws {OperationError} if the requested form is not one of the four above
+     */
+    run(input, args) {
+        const [normalForm] = args;
+
+        switch (normalForm) {
+            case "NFD":
+                return unorm.nfd(input);
+            case "NFC":
+                return unorm.nfc(input);
+            case "NFKD":
+                return unorm.nfkd(input);
+            case "NFKC":
+                return unorm.nfkc(input);
+            default:
+                throw new OperationError("Unknown Normalisation Form");
+        }
+    }
+
+}
+
+export default NormaliseUnicode;
diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs
index c54fa7ef..97745878 100644
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@@ -57,6 +57,7 @@ import "./tests/MS.mjs";
 import "./tests/Magic.mjs";
 import "./tests/MorseCode.mjs";
 import "./tests/NetBIOS.mjs";
+import "./tests/NormaliseUnicode.mjs";
 import "./tests/OTP.mjs";
 import "./tests/PGP.mjs";
 import "./tests/PHP.mjs";
diff --git a/tests/operations/tests/NormaliseUnicode.mjs b/tests/operations/tests/NormaliseUnicode.mjs
new file mode 100644
index 00000000..d903e865
--- /dev/null
+++ b/tests/operations/tests/NormaliseUnicode.mjs
@@ -0,0 +1,54 @@
+/**
+ * Normalise Unicode tests.
+ *
+ * @author Matthieu [m@tthieux.xyz]
+ *
+ * @copyright Crown Copyright 2018
+ * @license Apache-2.0
+ */
+import TestRegister from "../../lib/TestRegister.mjs";
+
+TestRegister.addTests([
+    {
+        name: "Normalise Unicode - NFD",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /C\u0327C\u0327\u2160/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFD"],
+            },
+        ],
+    }, {
+        name: "Normalise Unicode - NFC",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /\u00C7\u00C7\u2160/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFC"],
+            },
+        ],
+    }, {
+        name: "Normalise Unicode - NFKD",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /C\u0327C\u0327I/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFKD"],
+            },
+        ],
+    }, {
+        name: "Normalise Unicode - NFKC",
+        input: "\u00c7\u0043\u0327\u2160",
+        expectedMatch: /\u00C7\u00C7I/,
+        recipeConfig: [
+            {
+                op: "Normalise Unicode",
+                args: ["NFKC"],
+            },
+        ],
+    },
+]);
+