From 06f95edd2e5299ac3d028475157e9e77421499db Mon Sep 17 00:00:00 2001
From: Emil Henry
Date: Sun, 29 Mar 2020 23:15:03 +0200
Subject: [PATCH] Add 'Cut' operation

---
 src/core/config/Categories.json |   1 +
 src/core/operations/Cut.mjs     | 238 ++++++++++++++++++++++++++++++++
 tests/operations/index.mjs      |   1 +
 tests/operations/tests/Cut.mjs  | 123 +++++++++++++++++
 4 files changed, 363 insertions(+)
 create mode 100644 src/core/operations/Cut.mjs
 create mode 100644 tests/operations/tests/Cut.mjs

diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
index 77e3d319..191cda24 100755
--- a/src/core/config/Categories.json
+++ b/src/core/config/Categories.json
@@ -208,6 +208,7 @@
     {
         "name": "Utils",
         "ops": [
+            "Cut",
             "Diff",
             "Remove whitespace",
             "Remove null bytes",
diff --git a/src/core/operations/Cut.mjs b/src/core/operations/Cut.mjs
new file mode 100644
index 00000000..d824e695
--- /dev/null
+++ b/src/core/operations/Cut.mjs
@@ -0,0 +1,238 @@
+/**
+ * @author emilhf [emil@cyberops.no]
+ * @copyright Crown Copyright 2020
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import OperationError from "../errors/OperationError.mjs";
+import {SPLIT_DELIM_OPTIONS, JOIN_DELIM_OPTIONS} from "../lib/Delim.mjs";
+import XRegExp from "xregexp";
+
+/**
+ * Cut operation
+ */
+class Cut extends Operation {
+
+    /**
+     * Cut constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "Cut";
+        this.module = "Utils";
+        this.description = "Extract fields from records similarly to awk and cut. The expression 1, 3-4 will extract the 2nd, 4th and 5th fields. 3, 1 \"T\" 2 will extract the 4th field, then combine the 2nd and 3rd field into a new field (with the letter 'T' separating the original values).<br><br>If no input field delimiter is set, fixed width mode is enabled: Fields become the indices of the payload, and ranges will be appended to the current output field instead of creating new fields. This aids in carving e.g. CSVs from fixed width data.";
+        this.infoURL = "https://en.wikipedia.org/wiki/Cut_(Unix)";
+        this.inputType = "string";
+        this.outputType = "string";
+        this.args = [
+            {
+                "name": "Common input type",
+                "type": "populateOption",
+                "value": [
+                    {
+                        name: "User defined",
+                        value: ""
+                    },
+                    {
+                        name: "CSV",
+                        value: ","
+                    },
+                    {
+                        name: "TSV",
+                        value: "\\t"
+                    },
+                    {
+                        name: "PSV",
+                        value: "\\|"
+                    },
+                    {
+                        name: "Space aligned",
+                        value: "\\s+"
+                    }
+                ],
+                "target": 4
+            },
+            {
+                "name": "Expression",
+                "type": "text",
+                "value": "0-"
+            },
+            {
+                "name": "Input record delimiter",
+                "type": "editableOptionShort",
+                "value": SPLIT_DELIM_OPTIONS,
+                "defaultIndex": 2
+            },
+            {
+                "name": "Output record delimiter",
+                "type": "editableOptionShort",
+                "value": SPLIT_DELIM_OPTIONS,
+                "defaultIndex": 2
+            },
+            {
+                "name": "Input field delimiter",
+                "type": "shortString",
+                "value": ""
+            },
+            {
+                "name": "Output field delimiter",
+                "type": "editableOptionShort",
+                "value": JOIN_DELIM_OPTIONS,
+                "defaultIndex": 3
+            }
+        ];
+    }
+
+    /**
+     * Splits the input into records, cuts the requested fields out of each
+     * record and joins the results back together.
+     *
+     * @param {string} input
+     * @param {Object[]} args
+     * @returns {string}
+     * @throws {OperationError} if the expression joins a range with other terms outside fixed width mode
+     */
+    run(input, args) {
+        // args[0] ("Common input type") only pre-fills the input field delimiter (target 4), so it is skipped.
+        const [, expr, inRecordDelim, outRecordDelim, inFieldDelim, outFieldDelim] = args;
+        // An empty input field delimiter selects fixed width mode.
+        const fixedWidth = inFieldDelim === "";
+        const fieldSplitter = new XRegExp(inFieldDelim);
+
+        /**
+         * Cuts a single record.
+         *
+         * @param {string} record
+         * @returns {string}
+         */
+        const cutRecord = (record) => {
+            // In fixed width mode the record string itself is indexed, character by character.
+            const data = fixedWidth ? record : record.split(fieldSplitter);
+            return this.extract(data, expr, fixedWidth).join(outFieldDelim);
+        };
+
+        return input.split(inRecordDelim).map(cutRecord).join(outRecordDelim);
+    }
+
+    /**
+     * Extracts fields as specified by the extraction expression. If fixedWidth
+     * is true, ranges do not introduce new fields, but rather append to the
+     * current field being dealt with.
+     *
+     * The extract expression is a lightweight DSL similar to the fields flag
+     * (-f) of cut in UNIX, and also incorporates elements of the awk print
+     * statement. It departs from cut in a few noteworthy ways:
+     *
+     * - Reverse ranges are supported, e.g. 4-1. Like forward ranges, they are
+     *   clamped to the last available field.
+     *
+     * - Negative field values, e.g. -1, are offsets from the end of the data.
+     *   Note that negative ranges are not supported.
+     *
+     * - Fields are numbered from 0 instead of 1.
+     *
+     * - New fields can be constructed by combining existing fields. This
+     *   operation also supports appending strings: '1 "@" 2' will join field 1
+     *   and 2 with "@" in between them.
+     *
+     * An expression without any recognisable term yields no fields.
+     *
+     * @param {string[]|string} data - fields of a record, or the raw record in fixed width mode
+     * @param {string} expr
+     * @param {boolean} fixedWidth
+     * @returns {string[]}
+     * @throws {OperationError} if a range is joined with other terms outside fixed width mode
+     */
+    extract(data, expr, fixedWidth) {
+        const maxOffset = data.length - 1;
+
+        /**
+         * Looks up a single field; negative offsets count from the end.
+         *
+         * @param {number} n
+         * @returns {string|undefined}
+         */
+        const pick = (n) => n < 0 ? data[maxOffset + n + 1] : data[n];
+
+        const fields = [];
+        let currentField = [];
+        let previousToken = null;
+        // match() returns null when nothing matches, e.g. for an empty expression.
+        const tokens = expr.trim().match(/((".*?")|(\d+-\d*)|(-?\d+)|(,))/g) || [];
+        tokens.forEach(token => {
+            // Field separator
+            if (token === ",") {
+                previousToken = "delimiter";
+                if (currentField.length) {
+                    fields.push(currentField.join(""));
+                    currentField = [];
+                }
+                return;
+            }
+
+            if (!fixedWidth && previousToken === "range") {
+                throw new OperationError(
+                    `Cannot join '${token}', as previous term was a range. Requires fixed width mode.`
+                );
+            }
+
+            // Single field, possibly counted from the end
+            if (/^-?[0-9]+$/.test(token)) {
+                previousToken = "extraction";
+                currentField.push(pick(Number(token)));
+                return;
+            }
+
+            // Range such as 1-3, 3-1 or the open-ended 2-
+            const range = token.match(/^([0-9]+)-([0-9]*)$/);
+            if (range) {
+                previousToken = "range";
+                if (!fixedWidth && currentField.length) {
+                    throw new OperationError(
+                        `Cannot join range '${token}' with rest of field: ${currentField.join("")}. Requires fixed width mode.`
+                    );
+                }
+                const start = Number(range[1]);
+                const openEnded = range[2] === "";
+                const end = openEnded ? maxOffset : Number(range[2]);
+
+                const vals = [];
+                if (openEnded || start <= end) {
+                    for (let i = start; i <= end && i <= maxOffset; i++) {
+                        vals.push(pick(i));
+                    }
+                } else {
+                    // Clamp the start so a reverse range that begins past the last
+                    // field still yields the fields that do exist.
+                    for (let i = Math.min(start, maxOffset); i >= end; i--) {
+                        vals.push(pick(i));
+                    }
+                }
+
+                if (fixedWidth) {
+                    currentField.push(...vals);
+                } else {
+                    fields.push(...vals);
+                }
+                return;
+            }
+
+            // Quoted string literal
+            const literal = token.match(/^"(.*)"$/);
+            if (literal) {
+                previousToken = "string";
+                currentField.push(literal[1]);
+            }
+        });
+        // Terminal condition
+        if (currentField.length) {
+            fields.push(currentField.join(""));
+        }
+        return fields;
+    }
+
+}
+
+export default Cut;
diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs
index 8d3cd623..1f41cfef 100644
--- a/tests/operations/index.mjs
+++ b/tests/operations/index.mjs
@@ -37,6 +37,7 @@ import "./tests/Compress.mjs";
 import "./tests/ConditionalJump.mjs";
 import "./tests/Crypt.mjs";
 import "./tests/CSV.mjs";
+import "./tests/Cut.mjs";
 import "./tests/DateTime.mjs";
 import "./tests/ExtractEmailAddresses.mjs";
 import "./tests/Fork.mjs";
diff --git a/tests/operations/tests/Cut.mjs b/tests/operations/tests/Cut.mjs
new file mode 100644
index 00000000..d0d18986
--- /dev/null
+++ b/tests/operations/tests/Cut.mjs
@@ -0,0 +1,123 @@
+/**
+ * Cut operation tests
+ *
+ * @author emilhf [emil@cyberops.no]
+ *
+ * @copyright Crown Copyright 2020
+ * @license Apache-2.0
+ */
+
+import TestRegister from "../../lib/TestRegister.mjs";
+
+TestRegister.addTests([
+    {
+        name: "Extract single field",
+        input: "test1,test2,test3",
+        expectedOutput: "test2",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "1", "\\n", "\\n", ",", ","],
+            },
+        ],
+    },
+    {
+        name: "Extract range",
+        input: "test1,test2,test3",
+        expectedOutput: "test2,test3",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "1-2", "\\n", "\\n", ",", ","],
+            },
+        ],
+    },
+    {
+        name: "Extract reverse range",
+        input: "test1,test2,test3",
+        expectedOutput: "test3,test2",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "2-1", "\\n", "\\n", ",", ","],
+            },
+        ],
+    },
+    {
+        name: "Extract reverse range starting past the last field",
+        input: "test1,test2,test3",
+        expectedOutput: "test3,test2",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "5-1", "\\n", "\\n", ",", ","],
+            },
+        ],
+    },
+    {
+        name: "Extract field counted from the end",
+        input: "test1,test2,test3",
+        expectedOutput: "test3",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "-1", "\\n", "\\n", ",", ","],
+            },
+        ],
+    },
+    {
+        name: "Extract multiple ranges",
+        input: "test1,test2,test3",
+        expectedOutput: "test2,test3,test1",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "1-2,0", "\\n", "\\n", ",", ","],
+            },
+        ],
+    },
+    {
+        name: "Combine two existing fields",
+        input: "john.doe,CONTOSO\nadams,CONTOSO",
+        expectedOutput: "john.doe@CONTOSO\nadams@CONTOSO",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "0 \"@\" 1", "\\n", "\\n", ",", ","],
+            },
+        ],
+    },
+    {
+        name: "Fixed width to CSV",
+        input: "abcdefghijklmnopqrstuvxyz",
+        expectedOutput: "abc,xyz",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "0-2, 22-24", "\\n", "\\n", "", ","],
+            },
+        ],
+    },
+    {
+        name: "Extract and convert CSV to TSV",
+        input: "ITEM,VALUE\nflamingo,439\nvodka,14",
+        expectedOutput: "ITEM\tVALUE\nflamingo\t439\nvodka\t14",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "0-", "\\n", "\\n", ",", "\\t"],
+            }
+        ],
+    },
+    {
+        name: "Extract with wrong delimiter",
+        input: "test1,test2",
+        expectedOutput: "test1,test2",
+        recipeConfig: [
+            {
+                op: "Cut",
+                args: ["User defined", "0-", "\\n", "\\n", "\\t", ";"],
+            },
+        ],
+    },
+]);