Add 'Cut' operation

This commit is contained in:
Emil Henry 2020-03-29 23:15:03 +02:00
parent 29255d2338
commit 06f95edd2e
4 changed files with 320 additions and 0 deletions

View File

@ -208,6 +208,7 @@
{
"name": "Utils",
"ops": [
"Cut",
"Diff",
"Remove whitespace",
"Remove null bytes",

217
src/core/operations/Cut.mjs Normal file
View File

@ -0,0 +1,217 @@
/**
* @author emilhf [emil@cyberops.no]
* @copyright Crown Copyright 2020
* @license Apache-2.0
*/
import Operation from "../Operation.mjs";
import OperationError from "../errors/OperationError.mjs";
import {SPLIT_DELIM_OPTIONS, JOIN_DELIM_OPTIONS} from "../lib/Delim.mjs";
import XRegExp from "xregexp";
/**
* Cut operation
*/
class Cut extends Operation {

    /**
     * Cut constructor
     */
    constructor() {
        super();

        this.name = "Cut";
        this.module = "Utils";
        this.description = "Extract fields from records similarly to <code>awk</code> and <code>cut</code>. The expression <code>1, 3-4</code> will extract the 2nd, 4th and 5th fields. <code>3, 1 &quot;T&quot; 2</code> will extract the 4th field, then combine the 2nd and 3rd field into a new field (with the letter 'T' separating the original values).<br><br>If no input field delimiter is set, <strong>fixed width mode</strong> is enabled: Fields become the indices of the payload, and ranges will be appended to the current output field instead of creating new fields. This aids in carving e.g. CSVs from fixed width data.";
        this.infoURL = "https://en.wikipedia.org/wiki/Cut_(Unix)";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                "name": "Common input type",
                "type": "populateOption",
                "value": [
                    {
                        name: "User defined",
                        value: ""
                    },
                    {
                        name: "CSV",
                        value: ","
                    },
                    {
                        name: "TSV",
                        value: "\\t"
                    },
                    {
                        name: "PSV",
                        value: "\\|"
                    },
                    {
                        name: "Space aligned",
                        value: "\\s+"
                    }
                ],
                // Index of the "Input field delimiter" argument below.
                "target": 4
            },
            {
                "name": "Expression",
                "type": "text",
                "value": "0-"
            },
            {
                "name": "Input record delimiter",
                "type": "editableOptionShort",
                "value": SPLIT_DELIM_OPTIONS,
                "defaultIndex": 2
            },
            {
                "name": "Output record delimiter",
                "type": "editableOptionShort",
                "value": SPLIT_DELIM_OPTIONS,
                "defaultIndex": 2
            },
            {
                "name": "Input field delimiter",
                "type": "shortString",
                "value": ""
            },
            {
                "name": "Output field delimiter",
                "type": "editableOptionShort",
                "value": JOIN_DELIM_OPTIONS,
                "defaultIndex": 3
            }
        ];
    }

    /**
     * Splits the input into records, extracts the requested fields from every
     * record and joins the results back together.
     *
     * An empty input field delimiter switches to fixed width mode, in which
     * each record is indexed character by character instead of being split.
     *
     * @param {string} input
     * @param {Object[]} args
     * @returns {string}
     * @throws {OperationError} if the input field delimiter is not a valid
     *     regular expression, or if the expression cannot be evaluated.
     */
    run(input, args) {
        // The first argument (the preset selector) is not read by this method.
        const [, expr, inRecordDelim, outRecordDelim, inFieldDelim, outFieldDelim] = args;
        const fixedWidth = inFieldDelim === "";

        // The field delimiter is a user-supplied regular expression; report a
        // malformed pattern as an operation error rather than a raw SyntaxError.
        let fieldSplitter = null;
        if (!fixedWidth) {
            try {
                fieldSplitter = new XRegExp(inFieldDelim);
            } catch (err) {
                throw new OperationError(`Invalid input field delimiter: ${err.message}`);
            }
        }

        /**
         * @param {string} record
         * @returns {string}
         */
        const processRecord = (record) => {
            const data = fixedWidth ? record : record.split(fieldSplitter);
            return this.extract(data, expr, fixedWidth).join(outFieldDelim);
        };

        return input.split(inRecordDelim).map(processRecord).join(outRecordDelim);
    }

    /**
     * Extracts fields as specified by the extraction expression. If fixedWidth
     * is true, ranges do not introduce new fields, but rather append to the
     * current field being dealt with.
     *
     * The extract expression is a lightweight DSL similar to the fields flag
     * (-f) of cut in UNIX, and also incorporates elements of the awk print
     * statement. It departs from cut in a few noteworthy ways:
     *
     * - Reverse ranges are supported, e.g. 4-1. Like forward ranges, they are
     *   clamped to the fields that actually exist.
     *
     * - Negative field values, e.g. -1, are offsets from the end of the data.
     *   Note that negative ranges are not supported.
     *
     * - Fields are numbered from 0 instead of 1.
     *
     * - New fields can be constructed by combining existing fields. This
     *   operation also supports appending strings: '1 "@" 2' will join field 1
     *   and 2 with "@" in between them.
     *
     * Characters in the expression that do not form a token (quoted string,
     * range, integer or comma) are ignored.
     *
     * @param {string|string[]} data - the split fields of a record, or the raw
     *     record string in fixed width mode
     * @param {string} expr
     * @param {boolean} fixedWidth
     * @returns {string[]} the extracted output fields
     * @throws {OperationError} if the expression contains no tokens, or if a
     *     range is combined with other terms outside fixed width mode.
     */
    extract(data, expr, fixedWidth) {
        const maxOffset = data.length - 1;

        /**
         * @param {number} n - index; negative values count from the end
         * @returns {string|undefined}
         */
        const pick = (n) => n < 0 ? data[maxOffset + n + 1] : data[n];

        const tokens = expr.trim().match(/((".*?")|(\d+-\d*)|(-?\d+)|(,))/g);
        if (tokens === null) {
            // match() returns null (not an empty array) when nothing matches.
            throw new OperationError(
                `Invalid expression '${expr}'. Expected field numbers, ranges, quoted strings or commas.`
            );
        }

        const fields = [];
        let currentField = [];
        let previousToken = null;

        for (const token of tokens) {
            // Field separator: flush whatever has been collected so far.
            if (token === ",") {
                previousToken = "delimiter";
                if (currentField.length) {
                    fields.push(currentField.join(""));
                    currentField = [];
                }
                continue;
            }

            if (!fixedWidth && previousToken === "range") {
                throw new OperationError(
                    `Cannot join '${token}', as previous term was a range. Requires fixed width mode.`
                );
            }

            // Single field, possibly negative.
            if (/^-?[0-9]+$/.test(token)) {
                previousToken = "extraction";
                currentField.push(pick(Number(token)));
                continue;
            }

            // Range: "a-b" or the open-ended "a-".
            const range = token.match(/^([0-9]+)-([0-9]*)$/);
            if (range !== null) {
                previousToken = "range";
                if (!fixedWidth && currentField.length) {
                    throw new OperationError(
                        `Cannot join range '${token}' with rest of field: ${currentField.join("")}. Requires fixed width mode.`
                    );
                }

                const start = Number(range[1]);
                const openEnded = range[2] === "";
                const end = openEnded ? maxOffset : Number(range[2]);
                const vals = [];

                if (openEnded || start <= end) {
                    // Open-ended ranges always run forward, so a start beyond
                    // the last field yields nothing.
                    const stop = Math.min(end, maxOffset);
                    for (let i = start; i <= stop; i++) {
                        vals.push(pick(i));
                    }
                } else {
                    // Reverse range: begin at the last available field when the
                    // requested start lies beyond the data.
                    for (let i = Math.min(start, maxOffset); i >= end; i--) {
                        vals.push(pick(i));
                    }
                }

                if (fixedWidth) {
                    currentField.push(...vals);
                } else {
                    fields.push(...vals);
                }
                continue;
            }

            // Quoted string literal: append its contents without the quotes.
            if (/^".*"$/.test(token)) {
                previousToken = "string";
                currentField.push(token.slice(1, -1));
            }
        }

        // Terminal condition: flush the field still being built.
        if (currentField.length) {
            fields.push(currentField.join(""));
        }

        return fields;
    }

}
export default Cut;

View File

@ -37,6 +37,7 @@ import "./tests/Compress.mjs";
import "./tests/ConditionalJump.mjs";
import "./tests/Crypt.mjs";
import "./tests/CSV.mjs";
import "./tests/Cut.mjs";
import "./tests/DateTime.mjs";
import "./tests/ExtractEmailAddresses.mjs";
import "./tests/Fork.mjs";

View File

@ -0,0 +1,101 @@
/**
* Cut operation tests
*
* @author emilhf [emil@cyberops.no]
*
* @copyright Crown Copyright 2020
* @license Apache-2.0
*/
import TestRegister from "../../lib/TestRegister.mjs";
/**
 * Builds a test case whose recipe consists of a single "Cut" operation.
 *
 * The args array is passed to the operation in this order: common input type,
 * expression, input record delimiter, output record delimiter, input field
 * delimiter, output field delimiter.
 *
 * @param {string} name
 * @param {string} input
 * @param {string} expectedOutput
 * @param {string[]} args
 * @returns {Object}
 */
const cutTest = (name, input, expectedOutput, args) => ({
    name,
    input,
    expectedOutput,
    recipeConfig: [
        {
            op: "Cut",
            args,
        },
    ],
});

TestRegister.addTests([
    cutTest(
        "Extract single field",
        "test1,test2,test3",
        "test2",
        ["User defined", "1", "\\n", "\\n", ",", ","]
    ),
    cutTest(
        "Extract range",
        "test1,test2,test3",
        "test2,test3",
        ["User defined", "1-2", "\\n", "\\n", ",", ","]
    ),
    cutTest(
        "Extract reverse range",
        "test1,test2,test3",
        "test2,test1",
        ["User defined", "2-1", "\\n", "\\n", ",", ","]
    ),
    cutTest(
        "Extract multiple ranges",
        "test1,test2,test3",
        "test2,test3,test1",
        ["User defined", "1-2,0", "\\n", "\\n", ",", ","]
    ),
    cutTest(
        "Combine two existing fields",
        "john.doe,CONTOSO\nadams,CONTOSO",
        "john.doe@CONTOSO\nadams@CONTOSO",
        ["User defined", "0 \"@\" 1", "\\n", "\\n", ",", ","]
    ),
    // An empty input field delimiter selects fixed width mode.
    cutTest(
        "Fixed width to CSV",
        "abcdefghijklmnopqrstuvxyz",
        "abc,xyz",
        ["User defined", "0-2, 22-24", "\\n", "\\n", "", ","]
    ),
    cutTest(
        "Extract and convert CSV to TSV",
        "ITEM,VALUE\nflamingo,439\nvodka,14",
        "ITEM\tVALUE\nflamingo\t439\nvodka\t14",
        ["User defined", "0-", "\\n", "\\n", ",", "\\t"]
    ),
    // The delimiter never matches, so the whole record is a single field.
    cutTest(
        "Extract with wrong delimiter",
        "test1,test2",
        "test1,test2",
        ["User defined", "0-", "\\n", "\\n", "\\t", ";"]
    ),
]);