Example
This example OCRs all the images in a given folder and converts them to PDF documents. It uses multiple threads to maximize recognition performance, supports aborting, and continues on non-critical errors. This example supports converting images with any number of pages.
using Leadtools;
using Leadtools.Codecs;
using Leadtools.Forms.Ocr;
using Leadtools.Forms.DocumentWriters;
using Leadtools.Forms;
using Leadtools.WinForms;
/// <summary>
/// OCRs every TIF image found in the LEADTOOLS images directory and saves each
/// one as a PDF document, running one IOcrAutoRecognizeManager job per image on
/// the thread pool.
/// </summary>
public class RunJobExample
{
    // Number of documents that have been queued but have not finished yet
    private int _documentsPending;
    // Event signaled once every queued document has finished
    private AutoResetEvent _allDocumentsFinishedEvent;

    /// <summary>
    /// Queues one OCR job per TIF file and blocks until all of them have finished.
    /// Returns immediately when the images directory contains no TIF files.
    /// </summary>
    public void Start()
    {
        string imagesDirectory = LEAD_VARS.ImagesDir;
        string documentsDirectory = Path.Combine(LEAD_VARS.ImagesDir, "RunJobExample");

        // Create the output (documents) directory; this is a no-op if it already exists
        Directory.CreateDirectory(documentsDirectory);

        // Get all TIF files in input (images) directory
        string[] imageFileNames = Directory.GetFiles(imagesDirectory, "*.tif");
        if (imageFileNames.Length == 0)
        {
            Console.WriteLine("No images to OCR");
            return;
        }

        // Create a new OCR engine instance
        OcrEngineType engineType = OcrEngineType.Advantage;
        Console.WriteLine("Starting up {0} engine", engineType);
        using (IOcrEngine ocrEngine = OcrEngineManager.CreateEngine(engineType, false))
        {
            ocrEngine.Startup(null, null, null, LEAD_VARS.OcrAdvantageRuntimeDir);

            // Setup document PDF save options: Image/Text with CCITT G4 encoding for B/W.
            // A direct cast fails with a descriptive InvalidCastException instead of a
            // NullReferenceException on the next line if the options type is unexpected.
            DocumentWriter docWriter = ocrEngine.DocumentWriterInstance;
            PdfDocumentOptions pdfOptions = (PdfDocumentOptions)docWriter.GetOptions(DocumentFormat.Pdf);
            pdfOptions.ImageOverText = true;
            pdfOptions.DocumentType = PdfDocumentType.Pdf;
            pdfOptions.FontEmbedMode = DocumentFontEmbedMode.None;
            pdfOptions.OneBitImageCompression = OneBitImageCompressionType.FaxG4;
            docWriter.SetOptions(DocumentFormat.Pdf, pdfOptions);

            // We are going to use multiple-threads, so disable threading in
            // IOcrAutoRecognizeManager
            IOcrAutoRecognizeManager autoRecognizeManager = ocrEngine.AutoRecognizeManager;
            autoRecognizeManager.MaximumThreadsPerJob = 1;
            // Tell the recognize manager to continue on errors
            autoRecognizeManager.JobErrorMode = OcrAutoRecognizeManagerJobErrorMode.Continue;

            // JobStarted/JobCompleted are used for reporting (and for the abort check).
            // Completion itself is counted in RunJob, so a job that fails before the
            // engine raises JobCompleted still counts as finished and cannot leave
            // this method waiting forever.
            autoRecognizeManager.JobStarted += new EventHandler<OcrAutoRecognizeRunJobEventArgs>(autoRecognizeManager_JobStarted);
            autoRecognizeManager.JobCompleted += new EventHandler<OcrAutoRecognizeRunJobEventArgs>(autoRecognizeManager_JobCompleted);
            try
            {
                int count = imageFileNames.Length;
                _documentsPending = count;
                _allDocumentsFinishedEvent = new AutoResetEvent(false);

                for (int i = 0; i < count; i++)
                {
                    // Create the job data
                    string imageFileName = imageFileNames[i];
                    string name = "Document " + (i + 1).ToString();
                    Console.WriteLine("Queuing {0} file {1}", name, imageFileName);

                    JobData data = new JobData();
                    data.AutoRecognizeManager = autoRecognizeManager;
                    data.ImageFileName = imageFileName;
                    data.DocumentFileName = Path.Combine(documentsDirectory, Path.GetFileNameWithoutExtension(imageFileName) + ".pdf");
                    data.JobName = name;

                    // Queue this job
                    ThreadPool.QueueUserWorkItem(new WaitCallback(RunJob), data);
                }

                // Wait for all documents to finish. The event is closed only after the
                // wait succeeds, i.e. once no worker can still call Set() on it.
                _allDocumentsFinishedEvent.WaitOne();
                _allDocumentsFinishedEvent.Close();
            }
            finally
            {
                // Always detach the handlers, even if queuing or waiting failed
                autoRecognizeManager.JobStarted -= new EventHandler<OcrAutoRecognizeRunJobEventArgs>(autoRecognizeManager_JobStarted);
                autoRecognizeManager.JobCompleted -= new EventHandler<OcrAutoRecognizeRunJobEventArgs>(autoRecognizeManager_JobCompleted);
            }

            Console.WriteLine("All documents finished, check the result files in {0}", documentsDirectory);
        }
    }

    /// <summary>
    /// Reports that a job has started and aborts all jobs when AbortJobs says so.
    /// </summary>
    private void autoRecognizeManager_JobStarted(object sender, OcrAutoRecognizeRunJobEventArgs e)
    {
        // This is not strictly needed in this example, we will
        // use it to show information
        Console.WriteLine("{0} started...", e.Job.JobData.JobName);

        // Check if we need to abort
        if (AbortJobs(e.Job))
        {
            // Yes, abort all jobs
            e.Job.AutoRecognizeManager.AbortAllJobs();
        }
    }

    /// <summary>
    /// Reports the outcome of a job on the console and, when the job collected
    /// errors, writes them to a text file next to the output document.
    /// </summary>
    private void autoRecognizeManager_JobCompleted(object sender, OcrAutoRecognizeRunJobEventArgs e)
    {
        IOcrAutoRecognizeJob job = e.Job;
        string message = string.Format("{0} completed ", job.JobData.JobName);

        // Show any errors
        if (job.Errors.Count == 0)
        {
            message += "successfully...";
        }
        else
        {
            message += "with errors, first error is " + job.Errors[0].Exception.Message;
            // And save the errors to a text file in the document directory
            WriteErrorReport(job);
        }

        Console.WriteLine(message);
    }

    /// <summary>
    /// Writes the job data and every collected error to
    /// "&lt;document name&gt;_errors.txt" in the output document's directory.
    /// </summary>
    private static void WriteErrorReport(IOcrAutoRecognizeJob job)
    {
        string documentFileName = job.JobData.DocumentFileName;
        string textPathName = Path.Combine(Path.GetDirectoryName(documentFileName), Path.GetFileNameWithoutExtension(documentFileName) + "_errors.txt");

        using (StreamWriter writer = File.CreateText(textPathName))
        {
            writer.WriteLine(job.JobData.JobName);
            writer.WriteLine("Data:");
            writer.WriteLine(" Image file name: " + job.JobData.ImageFileName);
            writer.WriteLine(" First page number: " + job.JobData.FirstPageNumber);
            writer.WriteLine(" Last page number: " + job.JobData.LastPageNumber);
            writer.WriteLine(" Format:" + job.JobData.Format);
            writer.WriteLine(" Document file name: " + job.JobData.DocumentFileName);
            writer.WriteLine("Errors:");
            foreach (OcrAutoRecognizeManagerJobError error in job.Errors)
            {
                writer.WriteLine(" Page: {0} during {1}. Error: {2}", error.ImagePageNumber, error.Operation, error.Exception.Message);
            }
        }
    }

    // Data handed to each thread-pool work item
    private class JobData
    {
        public IOcrAutoRecognizeManager AutoRecognizeManager;
        public string ImageFileName;
        public string DocumentFileName;
        public string JobName;
    }

    /// <summary>
    /// Thread-pool callback: creates and runs the OCR job described by
    /// <paramref name="state"/> (a JobData instance) and then records the document
    /// as finished, whether the job succeeded, failed or was never created.
    /// </summary>
    private void RunJob(object state)
    {
        string jobName = "Job";
        try
        {
            JobData data = state as JobData;
            if (data == null)
            {
                Console.WriteLine("Work item was queued without job data, skipping");
                return;
            }

            jobName = data.JobName;
            Console.WriteLine("Running {0}", jobName);

            // Run it. RunJob is the synchronous variant per the LEADTOOLS documentation,
            // so the job (including its JobCompleted notification) is done when it returns.
            OcrAutoRecognizeJobData jobData = new OcrAutoRecognizeJobData(data.ImageFileName, DocumentFormat.Pdf, data.DocumentFileName);
            jobData.JobName = jobName;
            IOcrAutoRecognizeJob job = data.AutoRecognizeManager.CreateJob(jobData);
            data.AutoRecognizeManager.RunJob(job);
        }
        catch (Exception ex)
        {
            // An exception escaping a thread-pool callback would terminate the process,
            // so report it here and let the remaining documents continue.
            Console.WriteLine("{0} failed: {1}", jobName, ex.Message);
        }
        finally
        {
            SignalDocumentFinished();
        }
    }

    /// <summary>
    /// Decrements the pending-document counter and wakes up the thread blocked in
    /// Start when the last document has finished.
    /// </summary>
    private void SignalDocumentFinished()
    {
        // Called from multiple thread-pool threads, so the decrement must be atomic
        int pending = Interlocked.Decrement(ref _documentsPending);

        // If we are the last document, wake up the main thread
        if (pending == 0)
        {
            _allDocumentsFinishedEvent.Set();
        }
    }

    /// <summary>
    /// Decides whether all jobs should be aborted; always returns false in this example.
    /// </summary>
    private bool AbortJobs(IOcrAutoRecognizeJob ocrJob)
    {
        // In your application, you can check if abortion is required, for example, if the user
        // has pressed the Cancel button on a progress bar or if your service is shutting down.
        // In this example, we will never abort, but you can change this code to return true
        // upon any condition (or when a specific job is about to start)
        // and the engine will abort all current and pending jobs
        return false;
    }
}
/// <summary>
/// Fixed installation paths used by the LEADTOOLS examples.
/// </summary>
internal static class LEAD_VARS
{
    /// <summary>Directory passed to the OCR engine's Startup call as the Advantage runtime location.</summary>
    public const string OcrAdvantageRuntimeDir = @"C:\LEADTOOLS 19\Bin\Common\OcrAdvantageRuntime";

    /// <summary>Directory that holds the sample images the examples read from.</summary>
    public const string ImagesDir = @"C:\Users\Public\Documents\LEADTOOLS Images";
}
Imports Leadtools
Imports Leadtools.Codecs
Imports Leadtools.Forms.Ocr
Imports Leadtools.Forms.DocumentWriters
Imports Leadtools.Forms
Imports Leadtools.WinForms
''' <summary>
''' OCRs every TIF image found in the LEADTOOLS images directory and saves each
''' one as a PDF document, running one IOcrAutoRecognizeManager job per image on
''' the thread pool.
''' </summary>
Public Class RunJobExample
    ' Number of documents that have been queued but have not finished yet
    Private _documentsPending As Integer
    ' Event signaled once every queued document has finished
    Private _allDocumentsFinishedEvent As AutoResetEvent

    ''' <summary>
    ''' Queues one OCR job per TIF file and blocks until all of them have finished.
    ''' Returns immediately when the images directory contains no TIF files.
    ''' </summary>
    Public Sub Start()
        Dim imagesDirectory As String = LEAD_VARS.ImagesDir
        Dim documentsDirectory As String = Path.Combine(LEAD_VARS.ImagesDir, "RunJobExample")

        ' Create the output (documents) directory; this is a no-op if it already exists
        Directory.CreateDirectory(documentsDirectory)

        ' Get all TIF files in input (images) directory
        Dim imageFileNames As String() = Directory.GetFiles(imagesDirectory, "*.tif")
        If imageFileNames.Length = 0 Then
            Console.WriteLine("No images to OCR")
            Return
        End If

        ' Create a new OCR engine instance
        Dim engineType As OcrEngineType = OcrEngineType.Advantage
        Console.WriteLine("Starting up {0} engine", engineType)
        Using ocrEngine As IOcrEngine = OcrEngineManager.CreateEngine(engineType, False)
            ocrEngine.Startup(Nothing, Nothing, Nothing, LEAD_VARS.OcrAdvantageRuntimeDir)

            ' Setup document PDF save options: Image/Text with CCITT G4 encoding for B/W.
            ' DirectCast fails with a descriptive InvalidCastException instead of a
            ' NullReferenceException on the next line if the options type is unexpected.
            Dim docWriter As DocumentWriter = ocrEngine.DocumentWriterInstance
            Dim pdfOptions As PdfDocumentOptions = DirectCast(docWriter.GetOptions(DocumentFormat.Pdf), PdfDocumentOptions)
            pdfOptions.ImageOverText = True
            pdfOptions.DocumentType = PdfDocumentType.Pdf
            pdfOptions.FontEmbedMode = DocumentFontEmbedMode.None
            pdfOptions.OneBitImageCompression = OneBitImageCompressionType.FaxG4
            docWriter.SetOptions(DocumentFormat.Pdf, pdfOptions)

            ' We are going to use multiple-threads, so disable threading in
            ' IOcrAutoRecognizeManager
            Dim autoRecognizeManager As IOcrAutoRecognizeManager = ocrEngine.AutoRecognizeManager
            autoRecognizeManager.MaximumThreadsPerJob = 1
            ' Tell the recognize manager to continue on errors
            autoRecognizeManager.JobErrorMode = OcrAutoRecognizeManagerJobErrorMode.[Continue]

            ' JobStarted/JobCompleted are used for reporting (and for the abort check).
            ' Completion itself is counted in RunJob, so a job that fails before the
            ' engine raises JobCompleted still counts as finished and cannot leave
            ' this method waiting forever.
            AddHandler autoRecognizeManager.JobStarted, AddressOf autoRecognizeManager_JobStarted
            AddHandler autoRecognizeManager.JobCompleted, AddressOf autoRecognizeManager_JobCompleted
            Try
                Dim count As Integer = imageFileNames.Length
                _documentsPending = count
                _allDocumentsFinishedEvent = New AutoResetEvent(False)

                For i As Integer = 0 To count - 1
                    ' Create the job data
                    Dim imageFileName As String = imageFileNames(i)
                    Dim name As String = "Document " & (i + 1).ToString()
                    Console.WriteLine("Queuing {0} file {1}", name, imageFileName)

                    Dim data As New JobData()
                    data.AutoRecognizeManager = autoRecognizeManager
                    data.ImageFileName = imageFileName
                    data.DocumentFileName = Path.Combine(documentsDirectory, Path.GetFileNameWithoutExtension(imageFileName) & ".pdf")
                    data.JobName = name

                    ' Queue this job
                    ThreadPool.QueueUserWorkItem(New WaitCallback(AddressOf RunJob), data)
                Next

                ' Wait for all documents to finish. The event is closed only after the
                ' wait succeeds, i.e. once no worker can still call Set() on it.
                _allDocumentsFinishedEvent.WaitOne()
                _allDocumentsFinishedEvent.Close()
            Finally
                ' Always detach the handlers, even if queuing or waiting failed
                RemoveHandler autoRecognizeManager.JobStarted, AddressOf autoRecognizeManager_JobStarted
                RemoveHandler autoRecognizeManager.JobCompleted, AddressOf autoRecognizeManager_JobCompleted
            End Try

            Console.WriteLine("All documents finished, check the result files in {0}", documentsDirectory)
        End Using
    End Sub

    ''' <summary>
    ''' Reports that a job has started and aborts all jobs when AbortJobs says so.
    ''' </summary>
    Private Sub autoRecognizeManager_JobStarted(sender As Object, e As OcrAutoRecognizeRunJobEventArgs)
        ' This is not strictly needed in this example, we will
        ' use it to show information
        Console.WriteLine("{0} started...", e.Job.JobData.JobName)

        ' Check if we need to abort
        If AbortJobs(e.Job) Then
            ' Yes, abort all jobs
            e.Job.AutoRecognizeManager.AbortAllJobs()
        End If
    End Sub

    ''' <summary>
    ''' Reports the outcome of a job on the console and, when the job collected
    ''' errors, writes them to a text file next to the output document.
    ''' </summary>
    Private Sub autoRecognizeManager_JobCompleted(sender As Object, e As OcrAutoRecognizeRunJobEventArgs)
        Dim job As IOcrAutoRecognizeJob = e.Job
        Dim message As String = String.Format("{0} completed ", job.JobData.JobName)

        ' Show any errors
        If job.Errors.Count = 0 Then
            message &= "successfully..."
        Else
            message &= "with errors, first error is " & job.Errors(0).Exception.Message
            ' And save the errors to a text file in the document directory
            WriteErrorReport(job)
        End If

        Console.WriteLine(message)
    End Sub

    ''' <summary>
    ''' Writes the job data and every collected error to
    ''' "&lt;document name&gt;_errors.txt" in the output document's directory.
    ''' </summary>
    Private Shared Sub WriteErrorReport(job As IOcrAutoRecognizeJob)
        Dim documentFileName As String = job.JobData.DocumentFileName
        Dim textPathName As String = Path.Combine(Path.GetDirectoryName(documentFileName), Path.GetFileNameWithoutExtension(documentFileName) & "_errors.txt")

        Using writer As StreamWriter = File.CreateText(textPathName)
            writer.WriteLine(job.JobData.JobName)
            writer.WriteLine("Data:")
            writer.WriteLine(" Image file name: " & job.JobData.ImageFileName)
            writer.WriteLine(" First page number: " & job.JobData.FirstPageNumber)
            writer.WriteLine(" Last page number: " & job.JobData.LastPageNumber)
            ' ToString() writes the enum member name; "&" alone would write its numeric value
            writer.WriteLine(" Format:" & job.JobData.Format.ToString())
            writer.WriteLine(" Document file name: " & job.JobData.DocumentFileName)
            writer.WriteLine("Errors:")
            For Each [error] As OcrAutoRecognizeManagerJobError In job.Errors
                writer.WriteLine(" Page: {0} during {1}. Error: {2}", [error].ImagePageNumber, [error].Operation, [error].Exception.Message)
            Next
        End Using
    End Sub

    ' Data handed to each thread-pool work item
    Private Class JobData
        Public AutoRecognizeManager As IOcrAutoRecognizeManager
        Public ImageFileName As String
        Public DocumentFileName As String
        Public JobName As String
    End Class

    ''' <summary>
    ''' Thread-pool callback: creates and runs the OCR job described by
    ''' <paramref name="state"/> (a JobData instance) and then records the document
    ''' as finished, whether the job succeeded, failed or was never created.
    ''' </summary>
    Private Sub RunJob(state As Object)
        Dim jobName As String = "Job"
        Try
            Dim data As JobData = TryCast(state, JobData)
            If data Is Nothing Then
                Console.WriteLine("Work item was queued without job data, skipping")
                Return
            End If

            jobName = data.JobName
            Console.WriteLine("Running {0}", jobName)

            ' Run it. RunJob is the synchronous variant per the LEADTOOLS documentation,
            ' so the job (including its JobCompleted notification) is done when it returns.
            Dim jobData As New OcrAutoRecognizeJobData(data.ImageFileName, DocumentFormat.Pdf, data.DocumentFileName)
            jobData.JobName = jobName
            Dim job As IOcrAutoRecognizeJob = data.AutoRecognizeManager.CreateJob(jobData)
            data.AutoRecognizeManager.RunJob(job)
        Catch ex As Exception
            ' An exception escaping a thread-pool callback would terminate the process,
            ' so report it here and let the remaining documents continue.
            Console.WriteLine("{0} failed: {1}", jobName, ex.Message)
        Finally
            SignalDocumentFinished()
        End Try
    End Sub

    ''' <summary>
    ''' Decrements the pending-document counter and wakes up the thread blocked in
    ''' Start when the last document has finished.
    ''' </summary>
    Private Sub SignalDocumentFinished()
        ' Called from multiple thread-pool threads, so the decrement must be atomic
        Dim pending As Integer = Interlocked.Decrement(_documentsPending)

        ' If we are the last document, wake up the main thread
        If pending = 0 Then
            _allDocumentsFinishedEvent.Set()
        End If
    End Sub

    ''' <summary>
    ''' Decides whether all jobs should be aborted; always returns False in this example.
    ''' </summary>
    Private Function AbortJobs(ocrJob As IOcrAutoRecognizeJob) As Boolean
        ' In your application, you can check if abortion is required, for example, if the user
        ' has pressed the Cancel button on a progress bar or if your service is shutting down.
        ' In this example, we will never abort, but you can change this code to return true
        ' upon any condition (or when a specific job is about to start)
        ' and the engine will abort all current and pending jobs
        Return False
    End Function
End Class
''' <summary>
''' Fixed installation paths used by the LEADTOOLS examples.
''' </summary>
Public NotInheritable Class LEAD_VARS
    ''' <summary>Directory passed to the OCR engine's Startup call as the Advantage runtime location.</summary>
    Public Const OcrAdvantageRuntimeDir As String = "C:\LEADTOOLS 19\Bin\Common\OcrAdvantageRuntime"

    ''' <summary>Directory that holds the sample images the examples read from.</summary>
    Public Const ImagesDir As String = "C:\Users\Public\Documents\LEADTOOLS Images"
End Class