Merge pull request #20 from Mikescher/feature_xpath

Added operations 'XPath expression' and 'CSS selector'
2016-12-20 18:16:14 +00:00 · 2016-12-20 18:16:14 +00:00 · 650fd9a940
parent af4644c9eb 8db1b2fc79
commit 650fd9a940
5 changed files with 8623 additions and 3 deletions
--- a/Gruntfile.js
+++ b/Gruntfile.js
@ -130,7 +130,8 @@ module.exports = function(grunt) {
        "src/js/lib/vkbeautify.js",
        "src/js/lib/Sortable.js",
        "src/js/lib/bootstrap-colorpicker.js",
-
+        "src/js/lib/xpath.js",
+        
        // Custom libraries
        "src/js/lib/canvas_components.js",

--- a/src/js/config/Categories.js
+++ b/src/js/config/Categories.js
@ -189,6 +189,8 @@ const Categories = [
            "Extract file paths",
            "Extract dates",
            "Regular expression",
+            "XPath expression",
+            "CSS selector",
        ]
    },
    {
--- a/src/js/config/OperationConfig.js
+++ b/src/js/config/OperationConfig.js
@ -1938,6 +1938,42 @@ const OperationConfig = {
            },
        ]
    },
+    "XPath expression": { // operation entry whose handler is Extract.run_xpath
+        description: "Extract information from an xml document with an XPath query",
+        run: Extract.run_xpath,
+        input_type: "string",
+        output_type: "string",
+        args: [ // read positionally by the handler as args[0] and args[1]
+            { // args[0]: the XPath query evaluated against the parsed input
+                name: "XPath",
+                type: "string",
+                value: Extract.XPATH_INITIAL
+            },
+            { // args[1]: the string joined between the individual results
+                name: "Result delimiter",
+                type: "binary_short_string",
+                value: Extract.XPATH_DELIMITER
+            }
+        ]
+    },
+    "CSS selector": { // operation entry whose handler is Extract.run_css_query
+        description: "Extract information from an HTML document with a CSS selector",
+        run: Extract.run_css_query,
+        input_type: "string",
+        output_type: "string",
+        args: [ // read positionally by the handler as args[0] and args[1]
+            { // args[0]: the CSS selector matched against the parsed input
+                name: "CSS selector",
+                type: "string",
+                value: Extract.SELECTOR_INITIAL
+            },
+            { // args[1]: the string joined between the individual matches
+                name: "Delimiter",
+                type: "binary_short_string",
+                value: Extract.CSS_QUERY_DELIMITER
+            },
+        ]
+    },
    "From UNIX Timestamp": {
        description: "Converts a UNIX timestamp to a datetime string.<br><br>e.g. <code>978346800</code> becomes <code>Mon 1 January 2001 11:00:00 UTC</code>",
        run: DateTime.run_from_unix_timestamp,
--- a/src/js/lib/xpath.js
+++ b/src/js/lib/xpath.js
--- a/src/js/operations/Extract.js
+++ b/src/js/operations/Extract.js
@ -1,3 +1,5 @@
+/* globals xpath */
+
 /**
 * Identifier extraction operations.
 *
@ -10,7 +12,7 @@
 var Extract = {

    /**
-     * Runs search operations across the input data using refular expressions.
+     * Runs search operations across the input data using regular expressions.
     *
     * @private
     * @param {string} input
@ -293,5 +295,118 @@ var Extract = {
        output += Extract.run_dates(input, []);
        return output;
    },
-    
+
+    /** Initial value of the "XPath" argument of the "XPath expression" operation.
+     * @constant
+     * @default
+     */
+    XPATH_INITIAL: "",
+
+    /** Initial "Result delimiter" value: a backslash followed by "n" (presumably unescaped later - TODO confirm).
+     * @constant
+     * @default
+     */
+    XPATH_DELIMITER: "\\n",
+
+    /**
+     * Extract information (from an XML document) with an XPath query
+     *
+     * @author Mikescher (https://github.com/Mikescher | https://mikescher.com)
+     *
+     * @param {string} input
+     * @param {Object[]} args
+     * @returns {string}
+     */
+    run_xpath: function(input, args) {
+        const query = args[0];     // XPath expression to evaluate against the input
+        const delimiter = args[1]; // text inserted between the serialised results
+
+        var xml;
+        try {
+            xml = $.parseXML(input);
+        } catch (err) {
+            return "Invalid input XML.";
+        }
+
+        var result;
+        try {
+            result = xpath.evaluate(xml, query);
+        } catch (err) {
+            return "Invalid XPath. Details:\n" + err.message;
+        }
+
+        const serializer = new XMLSerializer();
+        // Renders one matched node as text; text and CDATA nodes (e.g. from //a/text()) yield their data.
+        const nodeToString = function(node) {
+            switch (node.nodeType) {
+                case Node.ELEMENT_NODE: case Node.DOCUMENT_NODE: return serializer.serializeToString(node);
+                case Node.ATTRIBUTE_NODE: return node.value;
+                case Node.TEXT_NODE: case Node.CDATA_SECTION_NODE: case Node.COMMENT_NODE: return node.data;
+                default: throw new Error("Unknown Node Type: " + node.nodeType);
+            }
+        };
+
+        return Object.values(result).slice(0, -1) // all values except last (length)
+            .map(nodeToString)
+            .join(delimiter);
+    },
+
+
+    /** Initial value of the "CSS selector" argument of the "CSS selector" operation.
+     * @constant
+     * @default
+     */
+    SELECTOR_INITIAL: "",
+    /** Initial "Delimiter" value: a backslash followed by "n" (presumably unescaped later - TODO confirm).
+     * @constant
+     * @default
+     */
+    CSS_QUERY_DELIMITER: "\\n",
+
+    /**
+     * Extract information (from an HTML document) with a CSS selector
+     *
+     * @author Mikescher (https://github.com/Mikescher | https://mikescher.com)
+     *
+     * @param {string} input
+     * @param {Object[]} args
+     * @returns {string}
+     */
+    run_css_query: function(input, args) {
+        const query = args[0];     // CSS selector to match against the input
+        const delimiter = args[1]; // text inserted between the serialised matches
+
+        var html;
+        try {
+            html = $.parseHTML(input);
+        } catch (err) {
+            return "Invalid input HTML.";
+        }
+
+        var result;
+        try {
+            // find() only searches descendants of the parsed top-level nodes, so addBack()
+            // re-adds those top-level elements that match the selector themselves.
+            result = $(html).find(query).addBack(query);
+        } catch (err) {
+            return "Invalid CSS Selector. Details:\n" + err.message;
+        }
+
+        const nodeToString = function(node) {
+            switch (node.nodeType) {
+                case Node.ELEMENT_NODE: return node.outerHTML;
+                case Node.ATTRIBUTE_NODE: return node.value;
+                case Node.COMMENT_NODE: return node.data;
+                case Node.TEXT_NODE: return node.wholeText;
+                case Node.DOCUMENT_NODE: return node.outerHTML;
+                default: throw new Error("Unknown Node Type: " + node.nodeType);
+            }
+        };
+
+        // toArray() exposes the matched DOM nodes of the jQuery set as a plain array.
+        return result.toArray()
+            .map(nodeToString)
+            .join(delimiter);
+    },
+
 };