From 8ad18bc7db6d9ff184ba3518686293a7685bf7b7 Mon Sep 17 00:00:00 2001 From: n1474335 Date: Fri, 12 Feb 2021 13:51:51 +0000 Subject: [PATCH] Added 'Fuzzy Match' operation --- .github/workflows/master.yml | 2 +- .github/workflows/pull_requests.yml | 2 +- .github/workflows/releases.yml | 2 +- package.json | 4 +- src/core/config/Categories.json | 1 + .../lib/{FuzzySearch.mjs => FuzzyMatch.mjs} | 98 +++++++++----- src/core/operations/FuzzyMatch.mjs | 120 ++++++++++++++++++ src/core/operations/RegularExpression.mjs | 4 +- src/web/waiters/OperationsWaiter.mjs | 2 +- 9 files changed, 196 insertions(+), 39 deletions(-) rename src/core/lib/{FuzzySearch.mjs => FuzzyMatch.mjs} (68%) create mode 100644 src/core/operations/FuzzyMatch.mjs diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 931715da..d6b6498d 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -19,7 +19,7 @@ jobs: - name: Install run: | npm install - export NODE_OPTIONS=--max_old_space_size=2048 + npm run setheapsize - name: Lint run: npx grunt lint diff --git a/.github/workflows/pull_requests.yml b/.github/workflows/pull_requests.yml index 9b21e385..379078b8 100644 --- a/.github/workflows/pull_requests.yml +++ b/.github/workflows/pull_requests.yml @@ -18,7 +18,7 @@ jobs: - name: Install run: | npm install - export NODE_OPTIONS=--max_old_space_size=2048 + npm run setheapsize - name: Lint run: npx grunt lint diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index dda8fbef..7985c99a 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -19,7 +19,7 @@ jobs: - name: Install run: | npm install - export NODE_OPTIONS=--max_old_space_size=2048 + npm run setheapsize - name: Lint run: npx grunt lint diff --git a/package.json b/package.json index 75ce4876..8b4018ff 100644 --- a/package.json +++ b/package.json @@ -173,6 +173,8 @@ "testuidev": "npx nightwatch --env=dev", "lint": "npx grunt lint", "postinstall": "npx grunt exec:fixCryptoApiImports", - "newop": "node --experimental-modules src/core/config/scripts/newOperation.mjs" + "newop": "node --experimental-modules src/core/config/scripts/newOperation.mjs", + "getheapsize": "node -e 'console.log(`node heap limit = ${require(\"v8\").getHeapStatistics().heap_size_limit / (1024 * 1024)} Mb`)'", + "setheapsize": "export NODE_OPTIONS=--max_old_space_size=2048" } } diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json index 257f4742..3a5eb0d5 100755 --- a/src/core/config/Categories.json +++ b/src/core/config/Categories.json @@ -238,6 +238,7 @@ "Pad lines", "Find / Replace", "Regular expression", + "Fuzzy Match", "Offset checker", "Hamming Distance", "Convert distance", diff --git a/src/core/lib/FuzzySearch.mjs b/src/core/lib/FuzzyMatch.mjs similarity index 68% rename from src/core/lib/FuzzySearch.mjs rename to src/core/lib/FuzzyMatch.mjs index d24f3fe6..693527a5 100644 --- a/src/core/lib/FuzzySearch.mjs +++ b/src/core/lib/FuzzyMatch.mjs @@ -16,40 +16,72 @@ * Anurag Awasthi - updated to 0.2.0 */ -const SEQUENTIAL_BONUS = 15; // bonus for adjacent matches -const SEPARATOR_BONUS = 30; // bonus if match occurs after a separator -const CAMEL_BONUS = 30; // bonus if match is uppercase and prev is lower -const FIRST_LETTER_BONUS = 15; // bonus if the first letter is matched +export const DEFAULT_WEIGHTS = { + sequentialBonus: 15, // bonus for adjacent matches + separatorBonus: 30, // bonus if match occurs after a separator + camelBonus: 30, // bonus if match is uppercase and prev is lower + firstLetterBonus: 15, // bonus if the first letter is matched -const LEADING_LETTER_PENALTY = -5; // penalty applied for every letter in str before the first match -const MAX_LEADING_LETTER_PENALTY = -15; // maximum penalty for leading letters -const UNMATCHED_LETTER_PENALTY = -1; + leadingLetterPenalty: -5, // penalty applied for every letter in str before the first match + maxLeadingLetterPenalty: -15, // maximum penalty for leading letters + unmatchedLetterPenalty: -1 +}; /** * Does a fuzzy search to find pattern inside a string. - * @param {*} pattern string pattern to search for - * @param {*} str string string which is being searched + * @param {string} pattern pattern to search for + * @param {string} str string which is being searched + * @param {boolean} global whether to search for all matches or just one * @returns [boolean, number] a boolean which tells if pattern was * found or not and a search score */ -export function fuzzyMatch(pattern, str) { +export function fuzzyMatch(pattern, str, global=false, weights=DEFAULT_WEIGHTS) { const recursionCount = 0; const recursionLimit = 10; const matches = []; const maxMatches = 256; - return fuzzyMatchRecursive( - pattern, - str, - 0 /* patternCurIndex */, - 0 /* strCurrIndex */, - null /* srcMatces */, - matches, - maxMatches, - 0 /* nextMatch */, - recursionCount, - recursionLimit - ); + if (!global) { + return fuzzyMatchRecursive( + pattern, + str, + 0 /* patternCurIndex */, + 0 /* strCurrIndex */, + null /* srcMatches */, + matches, + maxMatches, + 0 /* nextMatch */, + recursionCount, + recursionLimit, + weights + ); + } + + // Return all matches + let foundMatch = true, + score, + idxs, + strCurrIndex = 0; + const results = []; + + while (foundMatch) { + [foundMatch, score, idxs] = fuzzyMatchRecursive( + pattern, + str, + 0 /* patternCurIndex */, + strCurrIndex, + null /* srcMatches */, + matches, + maxMatches, + 0 /* nextMatch */, + recursionCount, + recursionLimit, + weights + ); + if (foundMatch) results.push([foundMatch, score, [...idxs]]); + strCurrIndex = idxs[idxs.length - 1] + 1; + } + return results; } /** @@ -65,7 +97,8 @@ function fuzzyMatchRecursive( maxMatches, nextMatch, recursionCount, - recursionLimit + recursionLimit, + weights ) { let outScore = 0; @@ -110,7 +143,8 @@ function fuzzyMatchRecursive( maxMatches, nextMatch, recursionCount, - recursionLimit + recursionLimit, + weights ); if (matched) { @@ -134,16 +168,16 @@ function fuzzyMatchRecursive( outScore = 100; // Apply leading letter penalty - let penalty = LEADING_LETTER_PENALTY * matches[0]; + let penalty = weights.leadingLetterPenalty * matches[0]; penalty = - penalty < MAX_LEADING_LETTER_PENALTY ? - MAX_LEADING_LETTER_PENALTY : + penalty < weights.maxLeadingLetterPenalty ? + weights.maxLeadingLetterPenalty : penalty; outScore += penalty; // Apply unmatched penalty const unmatched = str.length - nextMatch; - outScore += UNMATCHED_LETTER_PENALTY * unmatched; + outScore += weights.unmatchedLetterPenalty * unmatched; // Apply ordering bonuses for (let i = 0; i < nextMatch; i++) { @@ -152,7 +186,7 @@ function fuzzyMatchRecursive( if (i > 0) { const prevIdx = matches[i - 1]; if (currIdx === prevIdx + 1) { - outScore += SEQUENTIAL_BONUS; + outScore += weights.sequentialBonus; } } @@ -165,15 +199,15 @@ function fuzzyMatchRecursive( neighbor !== neighbor.toUpperCase() && curr !== curr.toLowerCase() ) { - outScore += CAMEL_BONUS; + outScore += weights.camelBonus; } const isNeighbourSeparator = neighbor === "_" || neighbor === " "; if (isNeighbourSeparator) { - outScore += SEPARATOR_BONUS; + outScore += weights.separatorBonus; } } else { // First letter - outScore += FIRST_LETTER_BONUS; + outScore += weights.firstLetterBonus; } } diff --git a/src/core/operations/FuzzyMatch.mjs b/src/core/operations/FuzzyMatch.mjs new file mode 100644 index 00000000..f7c9b358 --- /dev/null +++ b/src/core/operations/FuzzyMatch.mjs @@ -0,0 +1,120 @@ +/** + * @author n1474335 [n1474335@gmail.com] + * @copyright Crown Copyright 2021 + * @license Apache-2.0 + */ + +import Operation from "../Operation.mjs"; +import {fuzzyMatch, calcMatchRanges, DEFAULT_WEIGHTS} from "../lib/FuzzyMatch.mjs"; + +/** + * Fuzzy Match operation + */ +class FuzzyMatch extends Operation { + + /** + * FuzzyMatch constructor + */ + constructor() { + super(); + + this.name = "Fuzzy Match"; + this.module = "Default"; + this.description = "Conducts a fuzzy search to find a pattern within the input based on weighted criteria.

e.g. A search for dpan will match on Don't Panic"; + this.infoURL = "https://wikipedia.org/wiki/Fuzzy_matching_(computer-assisted_translation)"; + this.inputType = "string"; + this.outputType = "html"; + this.args = [ + { + name: "Search", + type: "binaryString", + value: "" + }, + { + name: "Sequential bonus", + type: "number", + value: DEFAULT_WEIGHTS.sequentialBonus, + hint: "Bonus for adjacent matches" + }, + { + name: "Separator bonus", + type: "number", + value: DEFAULT_WEIGHTS.separatorBonus, + hint: "Bonus if match occurs after a separator" + }, + { + name: "Camel bonus", + type: "number", + value: DEFAULT_WEIGHTS.camelBonus, + hint: "Bonus if match is uppercase and previous is lower" + }, + { + name: "First letter bonus", + type: "number", + value: DEFAULT_WEIGHTS.firstLetterBonus, + hint: "Bonus if the first letter is matched" + }, + { + name: "Leading letter penalty", + type: "number", + value: DEFAULT_WEIGHTS.leadingLetterPenalty, + hint: "Penalty applied for every letter in the input before the first match" + }, + { + name: "Max leading letter penalty", + type: "number", + value: DEFAULT_WEIGHTS.maxLeadingLetterPenalty, + hint: "Maxiumum penalty for leading letters" + }, + { + name: "Unmatched letter penalty", + type: "number", + value: DEFAULT_WEIGHTS.unmatchedLetterPenalty + }, + ]; + } + + /** + * @param {string} input + * @param {Object[]} args + * @returns {html} + */ + run(input, args) { + const searchStr = args[0]; + const weights = { + sequentialBonus: args[1], + separatorBonus: args[2], + camelBonus: args[3], + firstLetterBonus: args[4], + leadingLetterPenalty: args[5], + maxLeadingLetterPenalty: args[6], + unmatchedLetterPenalty: args[7] + }; + const matches = fuzzyMatch(searchStr, input, true, weights); + + if (!matches) { + return "No matches."; + } + + let result = "", pos = 0, hlClass = "hl1"; + matches.forEach(([matches, score, idxs]) => { + const matchRanges = calcMatchRanges(idxs); + + matchRanges.forEach(([start, length], i) => { + result += input.slice(pos, start); + if (i === 0) result += ``; + pos = start + length; + result += `${input.slice(start, pos)}`; + }); + result += ""; + hlClass = hlClass === "hl1" ? "hl2" : "hl1"; + }); + + result += input.slice(pos, input.length); + + return result; + } + +} + +export default FuzzyMatch; diff --git a/src/core/operations/RegularExpression.mjs b/src/core/operations/RegularExpression.mjs index 8771b55f..1d8de9c4 100644 --- a/src/core/operations/RegularExpression.mjs +++ b/src/core/operations/RegularExpression.mjs @@ -185,7 +185,7 @@ class RegularExpression extends Operation { * @param {boolean} captureGroups - Display each of the capture groups separately * @returns {string} */ -function regexList (input, regex, displayTotal, matches, captureGroups) { +function regexList(input, regex, displayTotal, matches, captureGroups) { let output = "", total = 0, match; @@ -225,7 +225,7 @@ function regexList (input, regex, displayTotal, matches, captureGroups) { * @param {boolean} displayTotal * @returns {string} */ -function regexHighlight (input, regex, displayTotal) { +function regexHighlight(input, regex, displayTotal) { let output = "", title = "", hl = 1, diff --git a/src/web/waiters/OperationsWaiter.mjs b/src/web/waiters/OperationsWaiter.mjs index 200ae5df..6efbab72 100755 --- a/src/web/waiters/OperationsWaiter.mjs +++ b/src/web/waiters/OperationsWaiter.mjs @@ -6,7 +6,7 @@ import HTMLOperation from "../HTMLOperation.mjs"; import Sortable from "sortablejs"; -import {fuzzyMatch, calcMatchRanges} from "../../core/lib/FuzzySearch.mjs"; +import {fuzzyMatch, calcMatchRanges} from "../../core/lib/FuzzyMatch.mjs"; /**