From 62b76777c065c78c4cb10737deb612e0b5d6babf Mon Sep 17 00:00:00 2001 From: Klaxon Date: Tue, 2 Oct 2018 13:40:47 +1000 Subject: [PATCH 1/5] update regex to match more email address variations --- src/core/operations/ExtractEmailAddresses.mjs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/operations/ExtractEmailAddresses.mjs b/src/core/operations/ExtractEmailAddresses.mjs index 6c2dc740..6b652798 100644 --- a/src/core/operations/ExtractEmailAddresses.mjs +++ b/src/core/operations/ExtractEmailAddresses.mjs @@ -39,8 +39,8 @@ class ExtractEmailAddresses extends Operation { */ run(input, args) { const displayTotal = args[0], - regex = /\b\w[-.\w]*@[-\w]+(?:\.[-\w]+)*\.[A-Z]{2,4}\b/ig; - + // email regex from: https://www.regular-expressions.info/email.html + regex = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/ig; return search(input, regex, null, displayTotal); } From a69063de9b86428c9e87412ad7d6fa1bc1d8734d Mon Sep 17 00:00:00 2001 From: Klaxon Date: Tue, 2 Oct 2018 13:51:55 +1000 Subject: [PATCH 2/5] add tests --- test/index.mjs | 1 + .../operations/ExtractEmailAddresses.mjs | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 test/tests/operations/ExtractEmailAddresses.mjs diff --git a/test/index.mjs b/test/index.mjs index 3f873201..c3e413fe 100644 --- a/test/index.mjs +++ b/test/index.mjs @@ -40,6 +40,7 @@ import "./tests/operations/Compress"; import "./tests/operations/ConditionalJump"; import "./tests/operations/Crypt"; import "./tests/operations/DateTime"; +import "./tests/operations/ExtractEmailAddresses"; import "./tests/operations/Fork"; import "./tests/operations/FromGeohash.mjs"; import "./tests/operations/Hash"; diff --git a/test/tests/operations/ExtractEmailAddresses.mjs b/test/tests/operations/ExtractEmailAddresses.mjs new file mode 100644 index 00000000..728d3c57 --- /dev/null +++ b/test/tests/operations/ExtractEmailAddresses.mjs @@ -0,0 +1,33 @@ +/** + * Parse IP Range tests. + * + * @author Klaxon [klaxon@veyr.com] + * @copyright Crown Copyright 2017 + * @license Apache-2.0 + */ +import TestRegister from "../../TestRegister"; + +TestRegister.addTests([ + { + name: "Extract email address", + input: "email@example.com\nfirstname.lastname@example.com\nemail@subdomain.example.com\nfirstname+lastname@example.com\n1234567890@example.com\nemail@example-one.com\n_______@example.com email@example.name\nemail@example.museum email@example.co.jp firstname-lastname@example.com", + expectedOutput: "email@example.com\nfirstname.lastname@example.com\nemail@subdomain.example.com\nfirstname+lastname@example.com\n1234567890@example.com\nemail@example-one.com\n_______@example.com\nemail@example.name\nemail@example.museum\nemail@example.co.jp\nfirstname-lastname@example.com\n", + recipeConfig: [ + { + "op": "Extract email addresses", + "args": [false] + }, + ], + }, + { + name: "Extract email address - Display total", + input: "email@example.com\nfirstname.lastname@example.com\nemail@subdomain.example.com\nfirstname+lastname@example.com\n1234567890@example.com\nemail@example-one.com\n_______@example.com email@example.name\nemail@example.museum email@example.co.jp firstname-lastname@example.com", + expectedOutput: "Total found: 11\n\nemail@example.com\nfirstname.lastname@example.com\nemail@subdomain.example.com\nfirstname+lastname@example.com\n1234567890@example.com\nemail@example-one.com\n_______@example.com\nemail@example.name\nemail@example.museum\nemail@example.co.jp\nfirstname-lastname@example.com\n", + recipeConfig: [ + { + "op": "Extract email addresses", + "args": [true] + }, + ], + }, +]); From ab4c9ef0d6e687166fe6b6a73fc9910eb87e802d Mon Sep 17 00:00:00 2001 From: Klaxon Date: Tue, 2 Oct 2018 15:12:51 +1000 Subject: [PATCH 3/5] fix comment --- test/tests/operations/ExtractEmailAddresses.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests/operations/ExtractEmailAddresses.mjs b/test/tests/operations/ExtractEmailAddresses.mjs index 728d3c57..d56a9faf 100644 --- a/test/tests/operations/ExtractEmailAddresses.mjs +++ b/test/tests/operations/ExtractEmailAddresses.mjs @@ -1,5 +1,5 @@ /** - * Parse IP Range tests. + * extract email address tests. * * @author Klaxon [klaxon@veyr.com] * @copyright Crown Copyright 2017 From 3079059ce340ebd5b67ceed50305dd59968c93e2 Mon Sep 17 00:00:00 2001 From: Klaxon Date: Thu, 11 Oct 2018 18:25:05 +1000 Subject: [PATCH 4/5] Update regex to support a wider variety of email addresses. --- src/core/operations/ExtractEmailAddresses.mjs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/operations/ExtractEmailAddresses.mjs b/src/core/operations/ExtractEmailAddresses.mjs index 6b652798..54ccf95b 100644 --- a/src/core/operations/ExtractEmailAddresses.mjs +++ b/src/core/operations/ExtractEmailAddresses.mjs @@ -39,8 +39,8 @@ class ExtractEmailAddresses extends Operation { */ run(input, args) { const displayTotal = args[0], - // email regex from: https://www.regular-expressions.info/email.html - regex = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/ig; + // email regex from: https://www.regextester.com/98066 + regex = /(?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9](?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9-]*[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9])?\.)+[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9](?:[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9-]*[\u00A0-\uD7FF\uE000-\uFFFF-a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])/ig; return search(input, regex, null, displayTotal); } From 718a94b5e09e4fa9910518f83fdd987c5e14015e Mon Sep 17 00:00:00 2001 From: Klaxon Date: Thu, 11 Oct 2018 20:42:16 +1000 Subject: [PATCH 5/5] add tests for internationalized email addresses --- .../operations/ExtractEmailAddresses.mjs | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/tests/operations/ExtractEmailAddresses.mjs b/test/tests/operations/ExtractEmailAddresses.mjs index 728d3c57..5963dc71 100644 --- a/test/tests/operations/ExtractEmailAddresses.mjs +++ b/test/tests/operations/ExtractEmailAddresses.mjs @@ -30,4 +30,26 @@ TestRegister.addTests([ }, ], }, + { + name: "Extract email address (Internationalized)", + input: "\u4f0a\u662d\u5091@\u90f5\u4ef6.\u5546\u52d9 \u093e\u092e@\u092e\u094b\u0939\u0928.\u0908\u0928\u094d\u092b\u094b\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c \u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc Jos\u1ec5Silv\u1ec5@googl\u1ec5.com\nJos\u1ec5Silv\u1ec5@google.com and Jos\u1ec5Silva@google.com\nFoO@BaR.CoM, john@192.168.10.100\ng\xf3mez@junk.br and Abc.123@example.com.\nuser+mailbox/department=shipping@example.com\n\u7528\u6237@\u4f8b\u5b50.\u5e7f\u544a\n\u0909\u092a\u092f\u094b\u0917\u0915\u0930\u094d\u0924\u093e@\u0909\u0926\u093e\u0939\u0930\u0923.\u0915\u0949\u092e\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c\n\u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc\nD\xf6rte@S\xf6rensen.example.com\n\u0430\u0434\u0436\u0430\u0439@\u044d\u043a\u0437\u0430\u043c\u043f\u043b.\u0440\u0443\u0441\ntest@xn--bcher-kva.com", + expectedOutput: "\u4f0a\u662d\u5091@\u90f5\u4ef6.\u5546\u52d9\n\u093e\u092e@\u092e\u094b\u0939\u0928.\u0908\u0928\u094d\u092b\u094b\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c\n\u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc\nJos\u1ec5Silv\u1ec5@googl\u1ec5.com\nJos\u1ec5Silv\u1ec5@google.com\nJos\u1ec5Silva@google.com\nFoO@BaR.CoM\njohn@192.168.10.100\ng\xf3mez@junk.br\nAbc.123@example.com\nuser+mailbox/department=shipping@example.com\n\u7528\u6237@\u4f8b\u5b50.\u5e7f\u544a\n\u0909\u092a\u092f\u094b\u0917\u0915\u0930\u094d\u0924\u093e@\u0909\u0926\u093e\u0939\u0930\u0923.\u0915\u0949\u092e\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c\n\u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc\nD\xf6rte@S\xf6rensen.example.com\n\u0430\u0434\u0436\u0430\u0439@\u044d\u043a\u0437\u0430\u043c\u043f\u043b.\u0440\u0443\u0441\ntest@xn--bcher-kva.com\n", + recipeConfig: [ + { + "op": "Extract email addresses", + "args": [false] + }, + ], + }, + { + name: "Extract email address - Display total (Internationalized)", + input: "\u4f0a\u662d\u5091@\u90f5\u4ef6.\u5546\u52d9 \u093e\u092e@\u092e\u094b\u0939\u0928.\u0908\u0928\u094d\u092b\u094b\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c \u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc Jos\u1ec5Silv\u1ec5@googl\u1ec5.com\nJos\u1ec5Silv\u1ec5@google.com and Jos\u1ec5Silva@google.com\nFoO@BaR.CoM, john@192.168.10.100\ng\xf3mez@junk.br and Abc.123@example.com.\nuser+mailbox/department=shipping@example.com\n\u7528\u6237@\u4f8b\u5b50.\u5e7f\u544a\n\u0909\u092a\u092f\u094b\u0917\u0915\u0930\u094d\u0924\u093e@\u0909\u0926\u093e\u0939\u0930\u0923.\u0915\u0949\u092e\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c\n\u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc\nD\xf6rte@S\xf6rensen.example.com\n\u0430\u0434\u0436\u0430\u0439@\u044d\u043a\u0437\u0430\u043c\u043f\u043b.\u0440\u0443\u0441\ntest@xn--bcher-kva.com", + expectedOutput: "Total found: 19\n\n\u4f0a\u662d\u5091@\u90f5\u4ef6.\u5546\u52d9\n\u093e\u092e@\u092e\u094b\u0939\u0928.\u0908\u0928\u094d\u092b\u094b\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c\n\u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc\nJos\u1ec5Silv\u1ec5@googl\u1ec5.com\nJos\u1ec5Silv\u1ec5@google.com\nJos\u1ec5Silva@google.com\nFoO@BaR.CoM\njohn@192.168.10.100\ng\xf3mez@junk.br\nAbc.123@example.com\nuser+mailbox/department=shipping@example.com\n\u7528\u6237@\u4f8b\u5b50.\u5e7f\u544a\n\u0909\u092a\u092f\u094b\u0917\u0915\u0930\u094d\u0924\u093e@\u0909\u0926\u093e\u0939\u0930\u0923.\u0915\u0949\u092e\n\u044e\u0437\u0435\u0440@\u0435\u043a\u0437\u0430\u043c\u043f\u043b.\u043a\u043e\u043c\n\u03b8\u03c3\u03b5\u03c1@\u03b5\u03c7\u03b1\u03bc\u03c0\u03bb\u03b5.\u03c8\u03bf\u03bc\nD\xf6rte@S\xf6rensen.example.com\n\u0430\u0434\u0436\u0430\u0439@\u044d\u043a\u0437\u0430\u043c\u043f\u043b.\u0440\u0443\u0441\ntest@xn--bcher-kva.com\n", + recipeConfig: [ + { + "op": "Extract email addresses", + "args": [true] + }, + ], + }, ]);