2019-09-04 20:37:02 +02:00
/ * *
2019-09-13 15:34:08 +02:00
* @ author n1474335 [ n1474335 @ gmail . com ]
2019-09-04 20:37:02 +02:00
* @ author mshwed [ m @ ttshwed . com ]
2020-03-27 14:12:08 +01:00
* @ author Matt C [ me @ mitt . dev ]
2019-09-04 20:37:02 +02:00
* @ copyright Crown Copyright 2019
* @ license Apache - 2.0
* /
import Operation from "../Operation.mjs" ;
import OperationError from "../errors/OperationError.mjs" ;
import { isImage } from "../lib/FileType.mjs" ;
2019-09-13 15:34:08 +02:00
import { toBase64 } from "../lib/Base64.mjs" ;
2019-09-04 20:37:02 +02:00
import { isWorkerEnvironment } from "../Utils.mjs" ;
import Tesseract from "tesseract.js" ;
2020-03-27 14:12:08 +01:00
const { createWorker } = Tesseract ;
2019-09-04 20:37:02 +02:00
2019-09-13 18:40:20 +02:00
import process from "process" ;
2019-09-04 20:37:02 +02:00
/**
 * Optical Character Recognition operation.
 *
 * Runs Tesseract over an image supplied as an ArrayBuffer and returns the
 * recognised text, optionally prefixed with the confidence value reported
 * by Tesseract.
 */
class OpticalCharacterRecognition extends Operation {

    /**
     * OpticalCharacterRecognition constructor
     */
    constructor() {
        super();

        this.name = "Optical Character Recognition";
        this.module = "OCR";
        this.description = "Optical character recognition or optical character reader (OCR) is the mechanical or electronic conversion of images of typed, handwritten or printed text into machine-encoded text.<br><br>Supported image formats: png, jpg, bmp, pbm.";
        this.infoURL = "https://wikipedia.org/wiki/Optical_character_recognition";
        this.inputType = "ArrayBuffer";
        this.outputType = "string";
        this.args = [
            {
                name: "Show confidence",
                type: "boolean",
                value: true
            }
        ];
    }

    /**
     * Recognises the text contained in an image.
     *
     * @param {ArrayBuffer} input - Raw bytes of the image to analyse.
     * @param {Object[]} args - Operation arguments: [showConfidence].
     * @returns {string} The recognised text, preceded by a "Confidence: N%"
     *     line when showConfidence is truthy.
     * @throws {OperationError} If not running in a worker environment, if the
     *     input is not detected as an image, or if any OCR step fails.
     */
    async run(input, args) {
        const [showConfidence] = args;

        if (!isWorkerEnvironment()) throw new OperationError("This operation only works in a browser");

        const type = isImage(input);
        if (!type) {
            throw new OperationError("Invalid File Type");
        }

        // NOTE(review): the guard above makes the non-worker branch unreachable
        // at present; it is kept so the asset path is already in place should
        // the guard ever be lifted.
        const assetDir = isWorkerEnvironment() ? `${self.docURL}/assets/` : `${process.cwd()}/src/core/vendor/`;

        // Declared outside the try block so the finally clause can release it.
        let worker;

        try {
            self.sendStatusMessage("Spinning up Tesseract worker...");
            const image = `data:${type};base64,${toBase64(input)}`;
            worker = createWorker({
                workerPath: `${assetDir}tesseract/worker.min.js`,
                langPath: `${assetDir}tesseract/lang-data`,
                corePath: `${assetDir}tesseract/tesseract-core.wasm.js`,
                logger: progress => {
                    // No environment check is needed here: run() has already
                    // thrown if we are not inside a worker.
                    self.sendStatusMessage(`Status: ${progress.status}${progress.status === "recognizing text" ? ` - ${(parseFloat(progress.progress) * 100).toFixed(2)}%` : ""}`);
                }
            });
            await worker.load();
            self.sendStatusMessage("Loading English language...");
            await worker.loadLanguage("eng");
            self.sendStatusMessage("Intialising Tesseract API...");
            await worker.initialize("eng");
            self.sendStatusMessage("Finding text...");
            const result = await worker.recognize(image);

            return showConfidence ?
                `Confidence: ${result.data.confidence}%\n\n${result.data.text}` :
                result.data.text;
        } catch (err) {
            throw new OperationError(`Error performing OCR on image. (${err})`);
        } finally {
            // Always shut the Tesseract worker down; otherwise every run of
            // this operation leaves a worker alive.
            if (worker) {
                try {
                    await worker.terminate();
                } catch (err) {
                    // Best-effort cleanup: a failure to terminate must not
                    // mask the OCR result or the original error.
                }
            }
        }
    }

}
2019-09-13 15:34:08 +02:00
export default OpticalCharacterRecognition ;