From e6fb0be1d04bd8c96a827d244f20ea3f01a991aa Mon Sep 17 00:00:00 2001 From: n1474335 Date: Tue, 18 Dec 2018 17:44:42 +0000 Subject: [PATCH] Refactored file type detection engine --- src/core/lib/BCD.mjs | 0 src/core/lib/Base58.mjs | 0 src/core/lib/Base64.mjs | 0 src/core/lib/CanvasComponents.mjs | 0 src/core/lib/FileExtraction.mjs | 231 ------ src/core/lib/FileType.mjs | 808 +++++++++++++++++++ src/core/lib/Magic.mjs | 455 +---------- src/core/lib/Stream.mjs | 7 - src/core/operations/DetectFileType.mjs | 20 +- src/core/operations/ExtractFiles.mjs | 28 +- src/core/operations/RenderImage.mjs | 12 +- src/core/operations/ScanForEmbeddedFiles.mjs | 33 +- 12 files changed, 867 insertions(+), 727 deletions(-) mode change 100755 => 100644 src/core/lib/BCD.mjs mode change 100755 => 100644 src/core/lib/Base58.mjs mode change 100755 => 100644 src/core/lib/Base64.mjs mode change 100755 => 100644 src/core/lib/CanvasComponents.mjs delete mode 100644 src/core/lib/FileExtraction.mjs create mode 100644 src/core/lib/FileType.mjs diff --git a/src/core/lib/BCD.mjs b/src/core/lib/BCD.mjs old mode 100755 new mode 100644 diff --git a/src/core/lib/Base58.mjs b/src/core/lib/Base58.mjs old mode 100755 new mode 100644 diff --git a/src/core/lib/Base64.mjs b/src/core/lib/Base64.mjs old mode 100755 new mode 100644 diff --git a/src/core/lib/CanvasComponents.mjs b/src/core/lib/CanvasComponents.mjs old mode 100755 new mode 100644 diff --git a/src/core/lib/FileExtraction.mjs b/src/core/lib/FileExtraction.mjs deleted file mode 100644 index 927aca92..00000000 --- a/src/core/lib/FileExtraction.mjs +++ /dev/null @@ -1,231 +0,0 @@ -/** - * File extraction functions - * - * @author n1474335 [n1474335@gmail.com] - * @copyright Crown Copyright 2018 - * @license Apache-2.0 - * - */ -import Stream from "./Stream"; - -/** - * Attempts to extract a file from a data stream given its mime type and offset. - * - * @param {Uint8Array} bytes - * @param {Object} fileDetail - * @param {string} fileDetail.mime - * @param {string} fileDetail.ext - * @param {number} fileDetail.offset - * @returns {File} - */ -export function extractFile(bytes, fileDetail) { - let fileData; - switch (fileDetail.mime) { - case "image/jpeg": - fileData = extractJPEG(bytes, fileDetail.offset); - break; - case "application/x-msdownload": - fileData = extractMZPE(bytes, fileDetail.offset); - break; - case "application/pdf": - fileData = extractPDF(bytes, fileDetail.offset); - break; - case "application/zip": - fileData = extractZIP(bytes, fileDetail.offset); - break; - default: - throw new Error(`No extraction algorithm available for "${fileDetail.mime}" files`); - } - - return new File([fileData], `extracted_at_0x${fileDetail.offset.toString(16)}.${fileDetail.ext}`); -} - - -/** - * JPEG extractor. - * - * @param {Uint8Array} bytes - * @param {number} offset - * @returns {Uint8Array} - */ -export function extractJPEG(bytes, offset) { - const stream = new Stream(bytes.slice(offset)); - - while (stream.hasMore()) { - const marker = stream.getBytes(2); - if (marker[0] !== 0xff) throw new Error("Invalid JPEG marker: " + marker); - - let segmentSize = 0; - switch (marker[1]) { - // No length - case 0xd8: // Start of Image - case 0x01: // For temporary use in arithmetic coding - break; - case 0xd9: // End found - return stream.carve(); - - // Variable size segment - case 0xc0: // Start of frame (Baseline DCT) - case 0xc1: // Start of frame (Extended sequential DCT) - case 0xc2: // Start of frame (Progressive DCT) - case 0xc3: // Start of frame (Lossless sequential) - case 0xc4: // Define Huffman Table - case 0xc5: // Start of frame (Differential sequential DCT) - case 0xc6: // Start of frame (Differential progressive DCT) - case 0xc7: // Start of frame (Differential lossless) - case 0xc8: // Reserved for JPEG extensions - case 0xc9: // Start of frame (Extended sequential DCT) - case 0xca: // Start of frame (Progressive DCT) - case 0xcb: // Start of frame (Lossless sequential) - case 0xcc: // Define arithmetic conditioning table - case 0xcd: // Start of frame (Differential sequential DCT) - case 0xce: // Start of frame (Differential progressive DCT) - case 0xcf: // Start of frame (Differential lossless) - case 0xdb: // Define Quantization Table - case 0xde: // Define hierarchical progression - case 0xe0: // Application-specific - case 0xe1: // Application-specific - case 0xe2: // Application-specific - case 0xe3: // Application-specific - case 0xe4: // Application-specific - case 0xe5: // Application-specific - case 0xe6: // Application-specific - case 0xe7: // Application-specific - case 0xe8: // Application-specific - case 0xe9: // Application-specific - case 0xea: // Application-specific - case 0xeb: // Application-specific - case 0xec: // Application-specific - case 0xed: // Application-specific - case 0xee: // Application-specific - case 0xef: // Application-specific - case 0xfe: // Comment - segmentSize = stream.readInt(2, "be"); - stream.position += segmentSize - 2; - break; - - // 1 byte - case 0xdf: // Expand reference image - stream.position++; - break; - - // 2 bytes - case 0xdc: // Define number of lines - case 0xdd: // Define restart interval - stream.position += 2; - break; - - // Start scan - case 0xda: // Start of scan - segmentSize = stream.readInt(2, "be"); - stream.position += segmentSize - 2; - stream.continueUntil(0xff); - break; - - // Continue through encoded data - case 0x00: // Byte stuffing - case 0xd0: // Restart - case 0xd1: // Restart - case 0xd2: // Restart - case 0xd3: // Restart - case 0xd4: // Restart - case 0xd5: // Restart - case 0xd6: // Restart - case 0xd7: // Restart - stream.continueUntil(0xff); - break; - - default: - stream.continueUntil(0xff); - break; - } - } - - throw new Error("Unable to parse JPEG successfully"); -} - - -/** - * Portable executable extractor. - * Assumes that the offset refers to an MZ header. - * - * @param {Uint8Array} bytes - * @param {number} offset - * @returns {Uint8Array} - */ -export function extractMZPE(bytes, offset) { - const stream = new Stream(bytes.slice(offset)); - - // Move to PE header pointer - stream.moveTo(0x3c); - const peAddress = stream.readInt(4, "le"); - - // Move to PE header - stream.moveTo(peAddress); - - // Get number of sections - stream.moveForwardsBy(6); - const numSections = stream.readInt(2, "le"); - - // Get optional header size - stream.moveForwardsBy(12); - const optionalHeaderSize = stream.readInt(2, "le"); - - // Move past optional header to section header - stream.moveForwardsBy(2 + optionalHeaderSize); - - // Move to final section header - stream.moveForwardsBy((numSections - 1) * 0x28); - - // Get raw data info - stream.moveForwardsBy(16); - const rawDataSize = stream.readInt(4, "le"); - const rawDataAddress = stream.readInt(4, "le"); - - // Move to end of final section - stream.moveTo(rawDataAddress + rawDataSize); - - return stream.carve(); -} - - -/** - * PDF extractor. - * - * @param {Uint8Array} bytes - * @param {number} offset - * @returns {Uint8Array} - */ -export function extractPDF(bytes, offset) { - const stream = new Stream(bytes.slice(offset)); - - // Find end-of-file marker (%%EOF) - stream.continueUntil([0x25, 0x25, 0x45, 0x4f, 0x46]); - stream.moveForwardsBy(5); - stream.consumeIf(0x0d); - stream.consumeIf(0x0a); - - return stream.carve(); -} - - -/** - * ZIP extractor. - * - * @param {Uint8Array} bytes - * @param {number} offset - * @returns {Uint8Array} - */ -export function extractZIP(bytes, offset) { - const stream = new Stream(bytes.slice(offset)); - - // Find End of central directory record - stream.continueUntil([0x50, 0x4b, 0x05, 0x06]); - - // Get comment length and consume - stream.moveForwardsBy(20); - const commentLength = stream.readInt(2, "le"); - stream.moveForwardsBy(commentLength); - - return stream.carve(); -} diff --git a/src/core/lib/FileType.mjs b/src/core/lib/FileType.mjs new file mode 100644 index 00000000..443bdd36 --- /dev/null +++ b/src/core/lib/FileType.mjs @@ -0,0 +1,808 @@ +/** + * File type functions + * + * @author n1474335 [n1474335@gmail.com] + * @copyright Crown Copyright 2018 + * @license Apache-2.0 + * + */ +import Stream from "./Stream"; + +/** + * A categorised table of file types, including signatures to identifying them and functions + * to extract them where possible. + */ +const FILE_SIGNATURES = { + "Pictures": [ + { + name: "JPEG Image", + extension: "jpg", + mime: "image/jpeg", + description: "", + signature: { + 0: 0xff, + 1: 0xd8, + 2: 0xff + }, + extractor: extractJPEG + }, + { + name: "GIF Image", + extension: "gif", + mime: "image/gif", + description: "", + signature: { + 0: 0x47, + 1: 0x49, + 2: 0x46 + }, + extractor: null + }, + { + name: "PNG Image", + extension: "png", + mime: "image/png", + description: "", + signature: { + 0: 0x89, + 1: 0x50, + 2: 0x4e, + 3: 0x47 + }, + extractor: null + }, + + ], + "Documents": [ + { + name: "Portable Document Format", + extension: "pdf", + mime: "application/pdf", + description: "", + signature: { + 0: 0x25, + 1: 0x50, + 2: 0x44, + 3: 0x46 + }, + extractor: extractPDF + }, + ], + "Applications": [ + { + name: "Windows Portable Executable", + extension: "exe", + mime: "application/x-msdownload", + description: "", + signature: { + 0: 0x4d, + 1: 0x5a + }, + extractor: extractMZPE + }, + ], + "Archives": [ + { + name: "ZIP", + extension: "zip", + mime: "application/zip", + description: "", + signature: { + 0: 0x50, + 1: 0x4b, + 2: [0x3, 0x5, 0x7], + 3: [0x4, 0x6, 0x8] + }, + extractor: extractZIP + }, + + ], +}; + + +/** + * Checks whether a signature matches a buffer. + * + * @param {Object} sig - A dictionary of offsets with values assigned to them. These + * values can be numbers for static checks, arrays of potential valid matches, or + * bespoke functions to check the validity of the buffer value at that offset. + * @param {Uint8Array} buf + * @returns {boolean} + */ +function signatureMatches(sig, buf) { + for (const offset in sig) { + switch (typeof sig[offset]) { + case "number": // Static check + if (buf[offset] !== sig[offset]) + return false; + break; + case "object": // Array of options + if (sig[offset].indexOf(buf[offset]) < 0) + return false; + break; + case "function": // More complex calculation + if (!sig[offset](buf[offset])) + return false; + break; + default: + throw new Error(`Unrecognised signature type at offset ${offset}`); + } + } + return true; +} + + +/** + * Given a buffer, detects magic byte sequences at specific positions and returns the + * extension and mime type. + * + * @param {Uint8Array} buf + * @returns {Object[]} type + * @returns {string} type.ext - File extension + * @returns {string} type.mime - Mime type + * @returns {string} [type.desc] - Description + */ +export function detectFileType(buf) { + if (!(buf && buf.length > 1)) { + return []; + } + + const matchingFiles = []; + + // TODO allow user to select which categories to check + for (const cat in FILE_SIGNATURES) { + const category = FILE_SIGNATURES[cat]; + + category.forEach(filetype => { + if (signatureMatches(filetype.signature, buf)) { + matchingFiles.push(filetype); + } + }); + } + return matchingFiles; + + // Delete all below this line once implemented in FILE_SIGNATURES above. + + + /* + if (buf[0] === 0xFF && buf[1] === 0xD8 && buf[2] === 0xFF) { + return { + ext: "jpg", + mime: "image/jpeg" + }; + } + + if (buf[0] === 0x89 && buf[1] === 0x50 && buf[2] === 0x4E && buf[3] === 0x47) { + return { + ext: "png", + mime: "image/png" + }; + } + + if (buf[0] === 0x47 && buf[1] === 0x49 && buf[2] === 0x46) { + return { + ext: "gif", + mime: "image/gif" + }; + } + + if (buf[8] === 0x57 && buf[9] === 0x45 && buf[10] === 0x42 && buf[11] === 0x50) { + return { + ext: "webp", + mime: "image/webp" + }; + } + + // needs to be before `tif` check + if (((buf[0] === 0x49 && buf[1] === 0x49 && buf[2] === 0x2A && buf[3] === 0x0) || (buf[0] === 0x4D && buf[1] === 0x4D && buf[2] === 0x0 && buf[3] === 0x2A)) && buf[8] === 0x43 && buf[9] === 0x52) { + return { + ext: "cr2", + mime: "image/x-canon-cr2" + }; + } + + if ((buf[0] === 0x49 && buf[1] === 0x49 && buf[2] === 0x2A && buf[3] === 0x0) || (buf[0] === 0x4D && buf[1] === 0x4D && buf[2] === 0x0 && buf[3] === 0x2A)) { + return { + ext: "tif", + mime: "image/tiff" + }; + } + + if (buf[0] === 0x42 && buf[1] === 0x4D) { + return { + ext: "bmp", + mime: "image/bmp" + }; + } + + if (buf[0] === 0x49 && buf[1] === 0x49 && buf[2] === 0xBC) { + return { + ext: "jxr", + mime: "image/vnd.ms-photo" + }; + } + + if (buf[0] === 0x38 && buf[1] === 0x42 && buf[2] === 0x50 && buf[3] === 0x53) { + return { + ext: "psd", + mime: "image/vnd.adobe.photoshop" + }; + } + + // needs to be before `zip` check + if (buf[0] === 0x50 && buf[1] === 0x4B && buf[2] === 0x3 && buf[3] === 0x4 && buf[30] === 0x6D && buf[31] === 0x69 && buf[32] === 0x6D && buf[33] === 0x65 && buf[34] === 0x74 && buf[35] === 0x79 && buf[36] === 0x70 && buf[37] === 0x65 && buf[38] === 0x61 && buf[39] === 0x70 && buf[40] === 0x70 && buf[41] === 0x6C && buf[42] === 0x69 && buf[43] === 0x63 && buf[44] === 0x61 && buf[45] === 0x74 && buf[46] === 0x69 && buf[47] === 0x6F && buf[48] === 0x6E && buf[49] === 0x2F && buf[50] === 0x65 && buf[51] === 0x70 && buf[52] === 0x75 && buf[53] === 0x62 && buf[54] === 0x2B && buf[55] === 0x7A && buf[56] === 0x69 && buf[57] === 0x70) { + return { + ext: "epub", + mime: "application/epub+zip" + }; + } + + if (buf[0] === 0x50 && buf[1] === 0x4B && (buf[2] === 0x3 || buf[2] === 0x5 || buf[2] === 0x7) && (buf[3] === 0x4 || buf[3] === 0x6 || buf[3] === 0x8)) { + return { + ext: "zip", + mime: "application/zip" + }; + } + + if (buf[257] === 0x75 && buf[258] === 0x73 && buf[259] === 0x74 && buf[260] === 0x61 && buf[261] === 0x72) { + return { + ext: "tar", + mime: "application/x-tar" + }; + } + + if (buf[0] === 0x52 && buf[1] === 0x61 && buf[2] === 0x72 && buf[3] === 0x21 && buf[4] === 0x1A && buf[5] === 0x7 && (buf[6] === 0x0 || buf[6] === 0x1)) { + return { + ext: "rar", + mime: "application/x-rar-compressed" + }; + } + + if (buf[0] === 0x1F && buf[1] === 0x8B && buf[2] === 0x8) { + return { + ext: "gz", + mime: "application/gzip" + }; + } + + if (buf[0] === 0x42 && buf[1] === 0x5A && buf[2] === 0x68) { + return { + ext: "bz2", + mime: "application/x-bzip2" + }; + } + + if (buf[0] === 0x37 && buf[1] === 0x7A && buf[2] === 0xBC && buf[3] === 0xAF && buf[4] === 0x27 && buf[5] === 0x1C) { + return { + ext: "7z", + mime: "application/x-7z-compressed" + }; + } + + if (buf[0] === 0x78 && buf[1] === 0x01) { + return { + ext: "dmg, zlib", + mime: "application/x-apple-diskimage, application/x-deflate" + }; + } + + if ((buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && (buf[3] === 0x18 || buf[3] === 0x20) && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70) || (buf[0] === 0x33 && buf[1] === 0x67 && buf[2] === 0x70 && buf[3] === 0x35) || (buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && buf[3] === 0x1C && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70 && buf[8] === 0x6D && buf[9] === 0x70 && buf[10] === 0x34 && buf[11] === 0x32 && buf[16] === 0x6D && buf[17] === 0x70 && buf[18] === 0x34 && buf[19] === 0x31 && buf[20] === 0x6D && buf[21] === 0x70 && buf[22] === 0x34 && buf[23] === 0x32 && buf[24] === 0x69 && buf[25] === 0x73 && buf[26] === 0x6F && buf[27] === 0x6D)) { + return { + ext: "mp4", + mime: "video/mp4" + }; + } + + if ((buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && buf[3] === 0x1C && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70 && buf[8] === 0x4D && buf[9] === 0x34 && buf[10] === 0x56)) { + return { + ext: "m4v", + mime: "video/x-m4v" + }; + } + + if (buf[0] === 0x4D && buf[1] === 0x54 && buf[2] === 0x68 && buf[3] === 0x64) { + return { + ext: "mid", + mime: "audio/midi" + }; + } + + // needs to be before the `webm` check + if (buf[31] === 0x6D && buf[32] === 0x61 && buf[33] === 0x74 && buf[34] === 0x72 && buf[35] === 0x6f && buf[36] === 0x73 && buf[37] === 0x6B && buf[38] === 0x61) { + return { + ext: "mkv", + mime: "video/x-matroska" + }; + } + + if (buf[0] === 0x1A && buf[1] === 0x45 && buf[2] === 0xDF && buf[3] === 0xA3) { + return { + ext: "webm", + mime: "video/webm" + }; + } + + if (buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && buf[3] === 0x14 && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70) { + return { + ext: "mov", + mime: "video/quicktime" + }; + } + + if (buf[0] === 0x52 && buf[1] === 0x49 && buf[2] === 0x46 && buf[3] === 0x46 && buf[8] === 0x41 && buf[9] === 0x56 && buf[10] === 0x49) { + return { + ext: "avi", + mime: "video/x-msvideo" + }; + } + + if (buf[0] === 0x30 && buf[1] === 0x26 && buf[2] === 0xB2 && buf[3] === 0x75 && buf[4] === 0x8E && buf[5] === 0x66 && buf[6] === 0xCF && buf[7] === 0x11 && buf[8] === 0xA6 && buf[9] === 0xD9) { + return { + ext: "wmv", + mime: "video/x-ms-wmv" + }; + } + + if (buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x1 && buf[3].toString(16)[0] === "b") { + return { + ext: "mpg", + mime: "video/mpeg" + }; + } + + if ((buf[0] === 0x49 && buf[1] === 0x44 && buf[2] === 0x33) || (buf[0] === 0xFF && buf[1] === 0xfb)) { + return { + ext: "mp3", + mime: "audio/mpeg" + }; + } + + if ((buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70 && buf[8] === 0x4D && buf[9] === 0x34 && buf[10] === 0x41) || (buf[0] === 0x4D && buf[1] === 0x34 && buf[2] === 0x41 && buf[3] === 0x20)) { + return { + ext: "m4a", + mime: "audio/m4a" + }; + } + + if (buf[0] === 0x4F && buf[1] === 0x67 && buf[2] === 0x67 && buf[3] === 0x53) { + return { + ext: "ogg", + mime: "audio/ogg" + }; + } + + if (buf[0] === 0x66 && buf[1] === 0x4C && buf[2] === 0x61 && buf[3] === 0x43) { + return { + ext: "flac", + mime: "audio/x-flac" + }; + } + + if (buf[0] === 0x52 && buf[1] === 0x49 && buf[2] === 0x46 && buf[3] === 0x46 && buf[8] === 0x57 && buf[9] === 0x41 && buf[10] === 0x56 && buf[11] === 0x45) { + return { + ext: "wav", + mime: "audio/x-wav" + }; + } + + if (buf[0] === 0x23 && buf[1] === 0x21 && buf[2] === 0x41 && buf[3] === 0x4D && buf[4] === 0x52 && buf[5] === 0x0A) { + return { + ext: "amr", + mime: "audio/amr" + }; + } + + if (buf[0] === 0x25 && buf[1] === 0x50 && buf[2] === 0x44 && buf[3] === 0x46) { + return { + ext: "pdf", + mime: "application/pdf" + }; + } + + if (buf[0] === 0x4D && buf[1] === 0x5A) { + return { + ext: "exe", + mime: "application/x-msdownload" + }; + } + + if ((buf[0] === 0x43 || buf[0] === 0x46) && buf[1] === 0x57 && buf[2] === 0x53) { + return { + ext: "swf", + mime: "application/x-shockwave-flash" + }; + } + + if (buf[0] === 0x7B && buf[1] === 0x5C && buf[2] === 0x72 && buf[3] === 0x74 && buf[4] === 0x66) { + return { + ext: "rtf", + mime: "application/rtf" + }; + } + + if (buf[0] === 0x77 && buf[1] === 0x4F && buf[2] === 0x46 && buf[3] === 0x46 && buf[4] === 0x00 && buf[5] === 0x01 && buf[6] === 0x00 && buf[7] === 0x00) { + return { + ext: "woff", + mime: "application/font-woff" + }; + } + + if (buf[0] === 0x77 && buf[1] === 0x4F && buf[2] === 0x46 && buf[3] === 0x32 && buf[4] === 0x00 && buf[5] === 0x01 && buf[6] === 0x00 && buf[7] === 0x00) { + return { + ext: "woff2", + mime: "application/font-woff" + }; + } + + if (buf[34] === 0x4C && buf[35] === 0x50 && ((buf[8] === 0x02 && buf[9] === 0x00 && buf[10] === 0x01) || (buf[8] === 0x01 && buf[9] === 0x00 && buf[10] === 0x00) || (buf[8] === 0x02 && buf[9] === 0x00 && buf[10] === 0x02))) { + return { + ext: "eot", + mime: "application/octet-stream" + }; + } + + if (buf[0] === 0x00 && buf[1] === 0x01 && buf[2] === 0x00 && buf[3] === 0x00 && buf[4] === 0x00) { + return { + ext: "ttf", + mime: "application/font-sfnt" + }; + } + + if (buf[0] === 0x4F && buf[1] === 0x54 && buf[2] === 0x54 && buf[3] === 0x4F && buf[4] === 0x00) { + return { + ext: "otf", + mime: "application/font-sfnt" + }; + } + + if (buf[0] === 0x00 && buf[1] === 0x00 && buf[2] === 0x01 && buf[3] === 0x00) { + return { + ext: "ico", + mime: "image/x-icon" + }; + } + + if (buf[0] === 0x46 && buf[1] === 0x4C && buf[2] === 0x56 && buf[3] === 0x01) { + return { + ext: "flv", + mime: "video/x-flv" + }; + } + + if (buf[0] === 0x25 && buf[1] === 0x21) { + return { + ext: "ps", + mime: "application/postscript" + }; + } + + if (buf[0] === 0xFD && buf[1] === 0x37 && buf[2] === 0x7A && buf[3] === 0x58 && buf[4] === 0x5A && buf[5] === 0x00) { + return { + ext: "xz", + mime: "application/x-xz" + }; + } + + if (buf[0] === 0x53 && buf[1] === 0x51 && buf[2] === 0x4C && buf[3] === 0x69) { + return { + ext: "sqlite", + mime: "application/x-sqlite3" + }; + } + */ + + /** + * + * Added by n1474335 [n1474335@gmail.com] from here on + * + */ + /* + if ((buf[0] === 0x1F && buf[1] === 0x9D) || (buf[0] === 0x1F && buf[1] === 0xA0)) { + return { + ext: "z, tar.z", + mime: "application/x-gtar" + }; + } + + if (buf[0] === 0x7F && buf[1] === 0x45 && buf[2] === 0x4C && buf[3] === 0x46) { + return { + ext: "none, axf, bin, elf, o, prx, puff, so", + mime: "application/x-executable", + desc: "Executable and Linkable Format file. No standard file extension." + }; + } + + if (buf[0] === 0xCA && buf[1] === 0xFE && buf[2] === 0xBA && buf[3] === 0xBE) { + return { + ext: "class", + mime: "application/java-vm" + }; + } + + if (buf[0] === 0xEF && buf[1] === 0xBB && buf[2] === 0xBF) { + return { + ext: "txt", + mime: "text/plain", + desc: "UTF-8 encoded Unicode byte order mark detected, commonly but not exclusively seen in text files." + }; + } + + // Must be before Little-endian UTF-16 BOM + if (buf[0] === 0xFF && buf[1] === 0xFE && buf[2] === 0x00 && buf[3] === 0x00) { + return { + ext: "UTF32LE", + mime: "charset/utf32le", + desc: "Little-endian UTF-32 encoded Unicode byte order mark detected." + }; + } + + if (buf[0] === 0xFF && buf[1] === 0xFE) { + return { + ext: "UTF16LE", + mime: "charset/utf16le", + desc: "Little-endian UTF-16 encoded Unicode byte order mark detected." + }; + } + + if ((buf[0x8001] === 0x43 && buf[0x8002] === 0x44 && buf[0x8003] === 0x30 && buf[0x8004] === 0x30 && buf[0x8005] === 0x31) || + (buf[0x8801] === 0x43 && buf[0x8802] === 0x44 && buf[0x8803] === 0x30 && buf[0x8804] === 0x30 && buf[0x8805] === 0x31) || + (buf[0x9001] === 0x43 && buf[0x9002] === 0x44 && buf[0x9003] === 0x30 && buf[0x9004] === 0x30 && buf[0x9005] === 0x31)) { + return { + ext: "iso", + mime: "application/octet-stream", + desc: "ISO 9660 CD/DVD image file" + }; + } + + if (buf[0] === 0xD0 && buf[1] === 0xCF && buf[2] === 0x11 && buf[3] === 0xE0 && buf[4] === 0xA1 && buf[5] === 0xB1 && buf[6] === 0x1A && buf[7] === 0xE1) { + return { + ext: "doc, xls, ppt", + mime: "application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint", + desc: "Microsoft Office documents" + }; + } + + if (buf[0] === 0x64 && buf[1] === 0x65 && buf[2] === 0x78 && buf[3] === 0x0A && buf[4] === 0x30 && buf[5] === 0x33 && buf[6] === 0x35 && buf[7] === 0x00) { + return { + ext: "dex", + mime: "application/octet-stream", + desc: "Dalvik Executable (Android)" + }; + } + + if (buf[0] === 0x4B && buf[1] === 0x44 && buf[2] === 0x4D) { + return { + ext: "vmdk", + mime: "application/vmdk, application/x-virtualbox-vmdk" + }; + } + + if (buf[0] === 0x43 && buf[1] === 0x72 && buf[2] === 0x32 && buf[3] === 0x34) { + return { + ext: "crx", + mime: "application/crx", + desc: "Google Chrome extension or packaged app" + }; + } + + if (buf[0] === 0x78 && (buf[1] === 0x01 || buf[1] === 0x9C || buf[1] === 0xDA || buf[1] === 0x5e)) { + return { + ext: "zlib", + mime: "application/x-deflate" + }; + } + + return null; + */ +} + + +/** + * Attempts to extract a file from a data stream given its offset and extractor function. + * + * @param {Uint8Array} bytes + * @param {Object} fileDetail + * @param {string} fileDetail.mime + * @param {string} fileDetail.extension + * @param {Function} fileDetail.extractor + * @param {number} offset + * @returns {File} + */ +export function extractFile(bytes, fileDetail, offset) { + if (fileDetail.extractor) { + const fileData = fileDetail.extractor(bytes, offset); + return new File([fileData], `extracted_at_0x${offset.toString(16)}.${fileDetail.extension}`); + } + + throw new Error(`No extraction algorithm available for "${fileDetail.mime}" files`); +} + + +/** + * JPEG extractor. + * + * @param {Uint8Array} bytes + * @param {number} offset + * @returns {Uint8Array} + */ +export function extractJPEG(bytes, offset) { + const stream = new Stream(bytes.slice(offset)); + + while (stream.hasMore()) { + const marker = stream.getBytes(2); + if (marker[0] !== 0xff) throw new Error("Invalid JPEG marker: " + marker); + + let segmentSize = 0; + switch (marker[1]) { + // No length + case 0xd8: // Start of Image + case 0x01: // For temporary use in arithmetic coding + break; + case 0xd9: // End found + return stream.carve(); + + // Variable size segment + case 0xc0: // Start of frame (Baseline DCT) + case 0xc1: // Start of frame (Extended sequential DCT) + case 0xc2: // Start of frame (Progressive DCT) + case 0xc3: // Start of frame (Lossless sequential) + case 0xc4: // Define Huffman Table + case 0xc5: // Start of frame (Differential sequential DCT) + case 0xc6: // Start of frame (Differential progressive DCT) + case 0xc7: // Start of frame (Differential lossless) + case 0xc8: // Reserved for JPEG extensions + case 0xc9: // Start of frame (Extended sequential DCT) + case 0xca: // Start of frame (Progressive DCT) + case 0xcb: // Start of frame (Lossless sequential) + case 0xcc: // Define arithmetic conditioning table + case 0xcd: // Start of frame (Differential sequential DCT) + case 0xce: // Start of frame (Differential progressive DCT) + case 0xcf: // Start of frame (Differential lossless) + case 0xdb: // Define Quantization Table + case 0xde: // Define hierarchical progression + case 0xe0: // Application-specific + case 0xe1: // Application-specific + case 0xe2: // Application-specific + case 0xe3: // Application-specific + case 0xe4: // Application-specific + case 0xe5: // Application-specific + case 0xe6: // Application-specific + case 0xe7: // Application-specific + case 0xe8: // Application-specific + case 0xe9: // Application-specific + case 0xea: // Application-specific + case 0xeb: // Application-specific + case 0xec: // Application-specific + case 0xed: // Application-specific + case 0xee: // Application-specific + case 0xef: // Application-specific + case 0xfe: // Comment + segmentSize = stream.readInt(2, "be"); + stream.position += segmentSize - 2; + break; + + // 1 byte + case 0xdf: // Expand reference image + stream.position++; + break; + + // 2 bytes + case 0xdc: // Define number of lines + case 0xdd: // Define restart interval + stream.position += 2; + break; + + // Start scan + case 0xda: // Start of scan + segmentSize = stream.readInt(2, "be"); + stream.position += segmentSize - 2; + stream.continueUntil(0xff); + break; + + // Continue through encoded data + case 0x00: // Byte stuffing + case 0xd0: // Restart + case 0xd1: // Restart + case 0xd2: // Restart + case 0xd3: // Restart + case 0xd4: // Restart + case 0xd5: // Restart + case 0xd6: // Restart + case 0xd7: // Restart + stream.continueUntil(0xff); + break; + + default: + stream.continueUntil(0xff); + break; + } + } + + throw new Error("Unable to parse JPEG successfully"); +} + + +/** + * Portable executable extractor. + * Assumes that the offset refers to an MZ header. + * + * @param {Uint8Array} bytes + * @param {number} offset + * @returns {Uint8Array} + */ +export function extractMZPE(bytes, offset) { + const stream = new Stream(bytes.slice(offset)); + + // Move to PE header pointer + stream.moveTo(0x3c); + const peAddress = stream.readInt(4, "le"); + + // Move to PE header + stream.moveTo(peAddress); + + // Get number of sections + stream.moveForwardsBy(6); + const numSections = stream.readInt(2, "le"); + + // Get optional header size + stream.moveForwardsBy(12); + const optionalHeaderSize = stream.readInt(2, "le"); + + // Move past optional header to section header + stream.moveForwardsBy(2 + optionalHeaderSize); + + // Move to final section header + stream.moveForwardsBy((numSections - 1) * 0x28); + + // Get raw data info + stream.moveForwardsBy(16); + const rawDataSize = stream.readInt(4, "le"); + const rawDataAddress = stream.readInt(4, "le"); + + // Move to end of final section + stream.moveTo(rawDataAddress + rawDataSize); + + return stream.carve(); +} + + +/** + * PDF extractor. + * + * @param {Uint8Array} bytes + * @param {number} offset + * @returns {Uint8Array} + */ +export function extractPDF(bytes, offset) { + const stream = new Stream(bytes.slice(offset)); + + // Find end-of-file marker (%%EOF) + stream.continueUntil([0x25, 0x25, 0x45, 0x4f, 0x46]); + stream.moveForwardsBy(5); + stream.consumeIf(0x0d); + stream.consumeIf(0x0a); + + return stream.carve(); +} + + +/** + * ZIP extractor. + * + * @param {Uint8Array} bytes + * @param {number} offset + * @returns {Uint8Array} + */ +export function extractZIP(bytes, offset) { + const stream = new Stream(bytes.slice(offset)); + + // Find End of central directory record + stream.continueUntil([0x50, 0x4b, 0x05, 0x06]); + + // Get comment length and consume + stream.moveForwardsBy(20); + const commentLength = stream.readInt(2, "le"); + stream.moveForwardsBy(commentLength); + + return stream.carve(); +} diff --git a/src/core/lib/Magic.mjs b/src/core/lib/Magic.mjs index b4d5a7b0..91472e21 100644 --- a/src/core/lib/Magic.mjs +++ b/src/core/lib/Magic.mjs @@ -2,6 +2,7 @@ import OperationConfig from "../config/OperationConfig.json"; import Utils from "../Utils"; import Recipe from "../Recipe"; import Dish from "../Dish"; +import {detectFileType} from "./FileType"; import chiSquared from "chi-squared"; /** @@ -92,7 +93,14 @@ class Magic { * @returns {string} [type.desc] - Description */ detectFileType() { - return Magic.magicFileType(this.inputBuffer); + const fileType = detectFileType(this.inputBuffer); + + if (!fileType.length) return null; + return { + ext: fileType[0].extension, + mime: fileType[0].mime, + desc: fileType[0].description + }; } /** @@ -784,452 +792,9 @@ class Magic { }[code]; } - - /** - * Given a buffer, detects magic byte sequences at specific positions and returns the - * extension and mime type. - * - * @param {Uint8Array} buf - * @returns {Object} type - * @returns {string} type.ext - File extension - * @returns {string} type.mime - Mime type - * @returns {string} [type.desc] - Description - */ - static magicFileType(buf) { - if (!(buf && buf.length > 1)) { - return null; - } - - if (buf[0] === 0xFF && buf[1] === 0xD8 && buf[2] === 0xFF) { - return { - ext: "jpg", - mime: "image/jpeg" - }; - } - - if (buf[0] === 0x89 && buf[1] === 0x50 && buf[2] === 0x4E && buf[3] === 0x47) { - return { - ext: "png", - mime: "image/png" - }; - } - - if (buf[0] === 0x47 && buf[1] === 0x49 && buf[2] === 0x46) { - return { - ext: "gif", - mime: "image/gif" - }; - } - - if (buf[8] === 0x57 && buf[9] === 0x45 && buf[10] === 0x42 && buf[11] === 0x50) { - return { - ext: "webp", - mime: "image/webp" - }; - } - - // needs to be before `tif` check - if (((buf[0] === 0x49 && buf[1] === 0x49 && buf[2] === 0x2A && buf[3] === 0x0) || (buf[0] === 0x4D && buf[1] === 0x4D && buf[2] === 0x0 && buf[3] === 0x2A)) && buf[8] === 0x43 && buf[9] === 0x52) { - return { - ext: "cr2", - mime: "image/x-canon-cr2" - }; - } - - if ((buf[0] === 0x49 && buf[1] === 0x49 && buf[2] === 0x2A && buf[3] === 0x0) || (buf[0] === 0x4D && buf[1] === 0x4D && buf[2] === 0x0 && buf[3] === 0x2A)) { - return { - ext: "tif", - mime: "image/tiff" - }; - } - - if (buf[0] === 0x42 && buf[1] === 0x4D) { - return { - ext: "bmp", - mime: "image/bmp" - }; - } - - if (buf[0] === 0x49 && buf[1] === 0x49 && buf[2] === 0xBC) { - return { - ext: "jxr", - mime: "image/vnd.ms-photo" - }; - } - - if (buf[0] === 0x38 && buf[1] === 0x42 && buf[2] === 0x50 && buf[3] === 0x53) { - return { - ext: "psd", - mime: "image/vnd.adobe.photoshop" - }; - } - - // needs to be before `zip` check - if (buf[0] === 0x50 && buf[1] === 0x4B && buf[2] === 0x3 && buf[3] === 0x4 && buf[30] === 0x6D && buf[31] === 0x69 && buf[32] === 0x6D && buf[33] === 0x65 && buf[34] === 0x74 && buf[35] === 0x79 && buf[36] === 0x70 && buf[37] === 0x65 && buf[38] === 0x61 && buf[39] === 0x70 && buf[40] === 0x70 && buf[41] === 0x6C && buf[42] === 0x69 && buf[43] === 0x63 && buf[44] === 0x61 && buf[45] === 0x74 && buf[46] === 0x69 && buf[47] === 0x6F && buf[48] === 0x6E && buf[49] === 0x2F && buf[50] === 0x65 && buf[51] === 0x70 && buf[52] === 0x75 && buf[53] === 0x62 && buf[54] === 0x2B && buf[55] === 0x7A && buf[56] === 0x69 && buf[57] === 0x70) { - return { - ext: "epub", - mime: "application/epub+zip" - }; - } - - if (buf[0] === 0x50 && buf[1] === 0x4B && (buf[2] === 0x3 || buf[2] === 0x5 || buf[2] === 0x7) && (buf[3] === 0x4 || buf[3] === 0x6 || buf[3] === 0x8)) { - return { - ext: "zip", - mime: "application/zip" - }; - } - - if (buf[257] === 0x75 && buf[258] === 0x73 && buf[259] === 0x74 && buf[260] === 0x61 && buf[261] === 0x72) { - return { - ext: "tar", - mime: "application/x-tar" - }; - } - - if (buf[0] === 0x52 && buf[1] === 0x61 && buf[2] === 0x72 && buf[3] === 0x21 && buf[4] === 0x1A && buf[5] === 0x7 && (buf[6] === 0x0 || buf[6] === 0x1)) { - return { - ext: "rar", - mime: "application/x-rar-compressed" - }; - } - - if (buf[0] === 0x1F && buf[1] === 0x8B && buf[2] === 0x8) { - return { - ext: "gz", - mime: "application/gzip" - }; - } - - if (buf[0] === 0x42 && buf[1] === 0x5A && buf[2] === 0x68) { - return { - ext: "bz2", - mime: "application/x-bzip2" - }; - } - - if (buf[0] === 0x37 && buf[1] === 0x7A && buf[2] === 0xBC && buf[3] === 0xAF && buf[4] === 0x27 && buf[5] === 0x1C) { - return { - ext: "7z", - mime: "application/x-7z-compressed" - }; - } - - if (buf[0] === 0x78 && buf[1] === 0x01) { - return { - ext: "dmg, zlib", - mime: "application/x-apple-diskimage, application/x-deflate" - }; - } - - if ((buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && (buf[3] === 0x18 || buf[3] === 0x20) && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70) || (buf[0] === 0x33 && buf[1] === 0x67 && buf[2] === 0x70 && buf[3] === 0x35) || (buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && buf[3] === 0x1C && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70 && buf[8] === 0x6D && buf[9] === 0x70 && buf[10] === 0x34 && buf[11] === 0x32 && buf[16] === 0x6D && buf[17] === 0x70 && buf[18] === 0x34 && buf[19] === 0x31 && buf[20] === 0x6D && buf[21] === 0x70 && buf[22] === 0x34 && buf[23] === 0x32 && buf[24] === 0x69 && buf[25] === 0x73 && buf[26] === 0x6F && buf[27] === 0x6D)) { - return { - ext: "mp4", - mime: "video/mp4" - }; - } - - if ((buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && buf[3] === 0x1C && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70 && buf[8] === 0x4D && buf[9] === 0x34 && buf[10] === 0x56)) { - return { - ext: "m4v", - mime: "video/x-m4v" - }; - } - - if (buf[0] === 0x4D && buf[1] === 0x54 && buf[2] === 0x68 && buf[3] === 0x64) { - return { - ext: "mid", - mime: "audio/midi" - }; - } - - // needs to be before the `webm` check - if (buf[31] === 0x6D && buf[32] === 0x61 && buf[33] === 0x74 && buf[34] === 0x72 && buf[35] === 0x6f && buf[36] === 0x73 && buf[37] === 0x6B && buf[38] === 0x61) { - return { - ext: "mkv", - mime: "video/x-matroska" - }; - } - - if (buf[0] === 0x1A && buf[1] === 0x45 && buf[2] === 0xDF && buf[3] === 0xA3) { - return { - ext: "webm", - mime: "video/webm" - }; - } - - if (buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x0 && buf[3] === 0x14 && buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70) { - return { - ext: "mov", - mime: "video/quicktime" - }; - } - - if (buf[0] === 0x52 && buf[1] === 0x49 && buf[2] === 0x46 && buf[3] === 0x46 && buf[8] === 0x41 && buf[9] === 0x56 && buf[10] === 0x49) { - return { - ext: "avi", - mime: "video/x-msvideo" - }; - } - - if (buf[0] === 0x30 && buf[1] === 0x26 && buf[2] === 0xB2 && buf[3] === 0x75 && buf[4] === 0x8E && buf[5] === 0x66 && buf[6] === 0xCF && buf[7] === 0x11 && buf[8] === 0xA6 && buf[9] === 0xD9) { - return { - ext: "wmv", - mime: "video/x-ms-wmv" - }; - } - - if (buf[0] === 0x0 && buf[1] === 0x0 && buf[2] === 0x1 && buf[3].toString(16)[0] === "b") { - return { - ext: "mpg", - mime: "video/mpeg" - }; - } - - if ((buf[0] === 0x49 && buf[1] === 0x44 && buf[2] === 0x33) || (buf[0] === 0xFF && buf[1] === 0xfb)) { - return { - ext: "mp3", - mime: "audio/mpeg" - }; - } - - if ((buf[4] === 0x66 && buf[5] === 0x74 && buf[6] === 0x79 && buf[7] === 0x70 && buf[8] === 0x4D && buf[9] === 0x34 && buf[10] === 0x41) || (buf[0] === 0x4D && buf[1] === 0x34 && buf[2] === 0x41 && buf[3] === 0x20)) { - return { - ext: "m4a", - mime: "audio/m4a" - }; - } - - if (buf[0] === 0x4F && buf[1] === 0x67 && buf[2] === 0x67 && buf[3] === 0x53) { - return { - ext: "ogg", - mime: "audio/ogg" - }; - } - - if (buf[0] === 0x66 && buf[1] === 0x4C && buf[2] === 0x61 && buf[3] === 0x43) { - return { - ext: "flac", - mime: "audio/x-flac" - }; - } - - if (buf[0] === 0x52 && buf[1] === 0x49 && buf[2] === 0x46 && buf[3] === 0x46 && buf[8] === 0x57 && buf[9] === 0x41 && buf[10] === 0x56 && buf[11] === 0x45) { - return { - ext: "wav", - mime: "audio/x-wav" - }; - } - - if (buf[0] === 0x23 && buf[1] === 0x21 && buf[2] === 0x41 && buf[3] === 0x4D && buf[4] === 0x52 && buf[5] === 0x0A) { - return { - ext: "amr", - mime: "audio/amr" - }; - } - - if (buf[0] === 0x25 && buf[1] === 0x50 && buf[2] === 0x44 && buf[3] === 0x46) { - return { - ext: "pdf", - mime: "application/pdf" - }; - } - - if (buf[0] === 0x4D && buf[1] === 0x5A) { - return { - ext: "exe", - mime: "application/x-msdownload" - }; - } - - if ((buf[0] === 0x43 || buf[0] === 0x46) && buf[1] === 0x57 && buf[2] === 0x53) { - return { - ext: "swf", - mime: "application/x-shockwave-flash" - }; - } - - if (buf[0] === 0x7B && buf[1] === 0x5C && buf[2] === 0x72 && buf[3] === 0x74 && buf[4] === 0x66) { - return { - ext: "rtf", - mime: "application/rtf" - }; - } - - if (buf[0] === 0x77 && buf[1] === 0x4F && buf[2] === 0x46 && buf[3] === 0x46 && buf[4] === 0x00 && buf[5] === 0x01 && buf[6] === 0x00 && buf[7] === 0x00) { - return { - ext: "woff", - mime: "application/font-woff" - }; - } - - if (buf[0] === 0x77 && buf[1] === 0x4F && buf[2] === 0x46 && buf[3] === 0x32 && buf[4] === 0x00 && buf[5] === 0x01 && buf[6] === 0x00 && buf[7] === 0x00) { - return { - ext: "woff2", - mime: "application/font-woff" - }; - } - - if (buf[34] === 0x4C && buf[35] === 0x50 && ((buf[8] === 0x02 && buf[9] === 0x00 && buf[10] === 0x01) || (buf[8] === 0x01 && buf[9] === 0x00 && buf[10] === 0x00) || (buf[8] === 0x02 && buf[9] === 0x00 && buf[10] === 0x02))) { - return { - ext: "eot", - mime: "application/octet-stream" - }; - } - - if (buf[0] === 0x00 && buf[1] === 0x01 && buf[2] === 0x00 && buf[3] === 0x00 && buf[4] === 0x00) { - return { - ext: "ttf", - mime: "application/font-sfnt" - }; - } - - if (buf[0] === 0x4F && buf[1] === 0x54 && buf[2] === 0x54 && buf[3] === 0x4F && buf[4] === 0x00) { - return { - ext: "otf", - mime: "application/font-sfnt" - }; - } - - if (buf[0] === 0x00 && buf[1] === 0x00 && buf[2] === 0x01 && buf[3] === 0x00) { - return { - ext: "ico", - mime: "image/x-icon" - }; - } - - if (buf[0] === 0x46 && buf[1] === 0x4C && buf[2] === 0x56 && buf[3] === 0x01) { - return { - ext: "flv", - mime: "video/x-flv" - }; - } - - if (buf[0] === 0x25 && buf[1] === 0x21) { - return { - ext: "ps", - mime: "application/postscript" - }; - } - - if (buf[0] === 0xFD && buf[1] === 0x37 && buf[2] === 0x7A && buf[3] === 0x58 && buf[4] === 0x5A && buf[5] === 0x00) { - return { - ext: "xz", - mime: "application/x-xz" - }; - } - - if (buf[0] === 0x53 && buf[1] === 0x51 && buf[2] === 0x4C && buf[3] === 0x69) { - return { - ext: "sqlite", - mime: "application/x-sqlite3" - }; - } - - /** - * - * Added by n1474335 [n1474335@gmail.com] from here on - * - */ - if ((buf[0] === 0x1F && buf[1] === 0x9D) || (buf[0] === 0x1F && buf[1] === 0xA0)) { - return { - ext: "z, tar.z", - mime: "application/x-gtar" - }; - } - - if (buf[0] === 0x7F && buf[1] === 0x45 && buf[2] === 0x4C && buf[3] === 0x46) { - return { - ext: "none, axf, bin, elf, o, prx, puff, so", - mime: "application/x-executable", - desc: "Executable and Linkable Format file. No standard file extension." - }; - } - - if (buf[0] === 0xCA && buf[1] === 0xFE && buf[2] === 0xBA && buf[3] === 0xBE) { - return { - ext: "class", - mime: "application/java-vm" - }; - } - - if (buf[0] === 0xEF && buf[1] === 0xBB && buf[2] === 0xBF) { - return { - ext: "txt", - mime: "text/plain", - desc: "UTF-8 encoded Unicode byte order mark detected, commonly but not exclusively seen in text files." - }; - } - - // Must be before Little-endian UTF-16 BOM - if (buf[0] === 0xFF && buf[1] === 0xFE && buf[2] === 0x00 && buf[3] === 0x00) { - return { - ext: "UTF32LE", - mime: "charset/utf32le", - desc: "Little-endian UTF-32 encoded Unicode byte order mark detected." - }; - } - - if (buf[0] === 0xFF && buf[1] === 0xFE) { - return { - ext: "UTF16LE", - mime: "charset/utf16le", - desc: "Little-endian UTF-16 encoded Unicode byte order mark detected." - }; - } - - if ((buf[0x8001] === 0x43 && buf[0x8002] === 0x44 && buf[0x8003] === 0x30 && buf[0x8004] === 0x30 && buf[0x8005] === 0x31) || - (buf[0x8801] === 0x43 && buf[0x8802] === 0x44 && buf[0x8803] === 0x30 && buf[0x8804] === 0x30 && buf[0x8805] === 0x31) || - (buf[0x9001] === 0x43 && buf[0x9002] === 0x44 && buf[0x9003] === 0x30 && buf[0x9004] === 0x30 && buf[0x9005] === 0x31)) { - return { - ext: "iso", - mime: "application/octet-stream", - desc: "ISO 9660 CD/DVD image file" - }; - } - - if (buf[0] === 0xD0 && buf[1] === 0xCF && buf[2] === 0x11 && buf[3] === 0xE0 && buf[4] === 0xA1 && buf[5] === 0xB1 && buf[6] === 0x1A && buf[7] === 0xE1) { - return { - ext: "doc, xls, ppt", - mime: "application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint", - desc: "Microsoft Office documents" - }; - } - - if (buf[0] === 0x64 && buf[1] === 0x65 && buf[2] === 0x78 && buf[3] === 0x0A && buf[4] === 0x30 && buf[5] === 0x33 && buf[6] === 0x35 && buf[7] === 0x00) { - return { - ext: "dex", - mime: "application/octet-stream", - desc: "Dalvik Executable (Android)" - }; - } - - if (buf[0] === 0x4B && buf[1] === 0x44 && buf[2] === 0x4D) { - return { - ext: "vmdk", - mime: "application/vmdk, application/x-virtualbox-vmdk" - }; - } - - if (buf[0] === 0x43 && buf[1] === 0x72 && buf[2] === 0x32 && buf[3] === 0x34) { - return { - ext: "crx", - mime: "application/crx", - desc: "Google Chrome extension or packaged app" - }; - } - - if (buf[0] === 0x78 && (buf[1] === 0x01 || buf[1] === 0x9C || buf[1] === 0xDA || buf[1] === 0x5e)) { - return { - ext: "zlib", - mime: "application/x-deflate" - }; - } - - return null; - } - } + /** * Byte frequencies of various languages generated from Wikipedia dumps taken in late 2017 and early 2018. * The Chi-Squared test cannot accept expected values of 0, so 0.0001 has been used to account for bytes diff --git a/src/core/lib/Stream.mjs b/src/core/lib/Stream.mjs index 5f84e13c..19903117 100644 --- a/src/core/lib/Stream.mjs +++ b/src/core/lib/Stream.mjs @@ -11,13 +11,6 @@ /** * A Stream can be used to traverse a binary blob, interpreting sections of it * as various data types. - * - * @param {Uint8Array} bytes - * @param {Object} fileDetail - * @param {string} fileDetail.mime - * @param {string} fileDetail.ext - * @param {number} fileDetail.offset - * @returns {File} */ export default class Stream { diff --git a/src/core/operations/DetectFileType.mjs b/src/core/operations/DetectFileType.mjs index 1d6897a0..bcb4d2a5 100644 --- a/src/core/operations/DetectFileType.mjs +++ b/src/core/operations/DetectFileType.mjs @@ -5,7 +5,7 @@ */ import Operation from "../Operation"; -import Magic from "../lib/Magic"; +import {detectFileType} from "../lib/FileType"; /** * Detect File Type operation @@ -34,17 +34,21 @@ class DetectFileType extends Operation { */ run(input, args) { const data = new Uint8Array(input), - type = Magic.magicFileType(data); + types = detectFileType(data); - if (!type) { + if (!types.length) { return "Unknown file type. Have you tried checking the entropy of this data to determine whether it might be encrypted or compressed?"; } else { - let output = "File extension: " + type.ext + "\n" + - "MIME type: " + type.mime; + let output; - if (type.desc && type.desc.length) { - output += "\nDescription: " + type.desc; - } + types.forEach(type => { + output += "File extension: " + type.extension + "\n" + + "MIME type: " + type.mime + "\n"; + + if (type.description && type.description.length) { + output += "\nDescription: " + type.description + "\n"; + } + }); return output; } diff --git a/src/core/operations/ExtractFiles.mjs b/src/core/operations/ExtractFiles.mjs index c213b256..3a87cd5e 100644 --- a/src/core/operations/ExtractFiles.mjs +++ b/src/core/operations/ExtractFiles.mjs @@ -6,9 +6,8 @@ import Operation from "../Operation"; // import OperationError from "../errors/OperationError"; -import Magic from "../lib/Magic"; import Utils from "../Utils"; -import {extractFile} from "../lib/FileExtraction"; +import {detectFileType, extractFile} from "../lib/FileType"; /** * Extract Files operation @@ -40,13 +39,13 @@ class ExtractFiles extends Operation { const bytes = new Uint8Array(input); // Scan for embedded files - const fileDetails = scanForEmbeddedFiles(bytes); + const detectedFiles = scanForEmbeddedFiles(bytes); // Extract each file that we support const files = []; - fileDetails.forEach(fileDetail => { + detectedFiles.forEach(detectedFile => { try { - files.push(extractFile(bytes, fileDetail)); + files.push(extractFile(bytes, detectedFile.fileDetails, detectedFile.offset)); } catch (err) {} }); @@ -70,22 +69,21 @@ class ExtractFiles extends Operation { * @param data */ function scanForEmbeddedFiles(data) { - let type; - const types = []; + const detectedFiles = []; for (let i = 0; i < data.length; i++) { - type = Magic.magicFileType(data.slice(i)); - if (type) { - types.push({ - offset: i, - ext: type.ext, - mime: type.mime, - desc: type.desc + const fileDetails = detectFileType(data.slice(i)); + if (fileDetails.length) { + fileDetails.forEach(match => { + detectedFiles.push({ + offset: i, + fileDetails: match, + }); }); } } - return types; + return detectedFiles; } export default ExtractFiles; diff --git a/src/core/operations/RenderImage.mjs b/src/core/operations/RenderImage.mjs index 7edd2072..4554c96b 100644 --- a/src/core/operations/RenderImage.mjs +++ b/src/core/operations/RenderImage.mjs @@ -9,7 +9,7 @@ import { fromHex } from "../lib/Hex"; import Operation from "../Operation"; import OperationError from "../errors/OperationError"; import Utils from "../Utils"; -import Magic from "../lib/Magic"; +import {detectFileType} from "../lib/FileType"; /** * Render Image operation @@ -72,8 +72,8 @@ class RenderImage extends Operation { } // Determine file type - const type = Magic.magicFileType(input); - if (!(type && type.mime.indexOf("image") === 0)) { + const types = detectFileType(input); + if (!(types.length && types[0].mime.indexOf("image") === 0)) { throw new OperationError("Invalid file type"); } @@ -92,9 +92,9 @@ class RenderImage extends Operation { let dataURI = "data:"; // Determine file type - const type = Magic.magicFileType(data); - if (type && type.mime.indexOf("image") === 0) { - dataURI += type.mime + ";"; + const types = detectFileType(data); + if (types.length && types[0].mime.indexOf("image") === 0) { + dataURI += types[0].mime + ";"; } else { throw new OperationError("Invalid file type"); } diff --git a/src/core/operations/ScanForEmbeddedFiles.mjs b/src/core/operations/ScanForEmbeddedFiles.mjs index a38c477f..41ea911b 100644 --- a/src/core/operations/ScanForEmbeddedFiles.mjs +++ b/src/core/operations/ScanForEmbeddedFiles.mjs @@ -6,7 +6,7 @@ import Operation from "../Operation"; import Utils from "../Utils"; -import Magic from "../lib/Magic"; +import {detectFileType} from "../lib/FileType"; /** * Scan for Embedded Files operation @@ -41,7 +41,7 @@ class ScanForEmbeddedFiles extends Operation { */ run(input, args) { let output = "Scanning data for 'magic bytes' which may indicate embedded files. The following results may be false positives and should not be treat as reliable. Any suffiently long file is likely to contain these magic bytes coincidentally.\n", - type, + types, numFound = 0, numCommonFound = 0; const ignoreCommon = args[0], @@ -49,20 +49,23 @@ class ScanForEmbeddedFiles extends Operation { data = new Uint8Array(input); for (let i = 0; i < data.length; i++) { - type = Magic.magicFileType(data.slice(i)); - if (type) { - if (ignoreCommon && commonExts.indexOf(type.ext) > -1) { - numCommonFound++; - continue; - } - numFound++; - output += "\nOffset " + i + " (0x" + Utils.hex(i) + "):\n" + - " File extension: " + type.ext + "\n" + - " MIME type: " + type.mime + "\n"; + types = detectFileType(data.slice(i)); + if (types.length) { + types.forEach(type => { + if (ignoreCommon && commonExts.indexOf(type.extension) > -1) { + numCommonFound++; + return; + } - if (type.desc && type.desc.length) { - output += " Description: " + type.desc + "\n"; - } + numFound++; + output += "\nOffset " + i + " (0x" + Utils.hex(i) + "):\n" + + " File extension: " + type.extension + "\n" + + " MIME type: " + type.mime + "\n"; + + if (type.description && type.description.length) { + output += " Description: " + type.description + "\n"; + } + }); } }