''' <summary>
''' Recognizes Ocr1.tif, saves the untouched recognition results as Ocr1_Original.pdf,
''' then rewrites selected words in the results ("the" becomes "THE", every "a" is removed,
''' "color" becomes "water") and saves the edited results as Ocr1_Modified.pdf.
''' </summary>
Private Sub OcrUpdateWordExample()
    Dim imagesDir As String = LEAD_VARS.ImagesDir
    Dim sourceImagePath As String = Path.Combine(imagesDir, "Ocr1.tif")
    Dim originalPdfPath As String = Path.Combine(imagesDir, "Ocr1_Original.pdf")
    Dim modifiedPdfPath As String = Path.Combine(imagesDir, "Ocr1_Modified.pdf")

    Using engine As IOcrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Professional, False)
        engine.Startup(Nothing, Nothing, Nothing, Nothing)

        Using document As IOcrDocument = engine.DocumentManager.CreateDocument()
            ' Load the TIFF as a page and run recognition on it
            Dim page As IOcrPage = document.Pages.AddPage(sourceImagePath, Nothing)
            page.Recognize(Nothing)

            ' Keep a copy of the unmodified results so the two PDFs can be compared
            document.Save(originalPdfPath, DocumentFormat.Pdf, Nothing)

            ' Walk the recognized characters zone by zone
            Dim recognized As IOcrPageCharacters = page.GetRecognizedCharacters()
            For Each zone As IOcrZoneCharacters In recognized
                Dim zoneWords As IList(Of OcrWord) = zone.GetWords(page.DpiX, page.DpiY, LogicalUnit.Pixel)

                ' The word list is never changed while it is being enumerated. Instead, each pass
                ' asks FindWord for the next match, updates that single word, and searches again
                ' from the start until FindWord reports -1.
                ' NOTE(review): this relies on UpdateWord also refreshing zoneWords, otherwise the
                ' same index would be found forever - confirm against the LEADTOOLS documentation.

                ' Upper-case every "the". The search is case-sensitive, so "THE" is not matched again.
                Dim hit As Integer = FindWord(zoneWords, "the", False)
                While hit <> -1
                    recognized.UpdateWord(zoneWords, zone.ZoneIndex, hit, "THE")
                    hit = FindWord(zoneWords, "the", False)
                End While

                ' Remove every "a" (any casing) by passing Nothing as the new value
                hit = FindWord(zoneWords, "a", True)
                While hit <> -1
                    recognized.UpdateWord(zoneWords, zone.ZoneIndex, hit, Nothing)
                    hit = FindWord(zoneWords, "a", True)
                End While

                ' Swap every "color" (any casing) for "water"
                hit = FindWord(zoneWords, "color", True)
                While hit <> -1
                    recognized.UpdateWord(zoneWords, zone.ZoneIndex, hit, "water")
                    hit = FindWord(zoneWords, "color", True)
                End While
            Next

            ' Push the edited characters back into the page
            page.SetRecognizedCharacters(recognized)

            ' Save the modified results
            document.Save(modifiedPdfPath, DocumentFormat.Pdf, Nothing)
        End Using
    End Using
End Sub
''' <summary>
''' Returns the index of the first word in <paramref name="words"/> whose Value equals
''' <paramref name="value"/> according to String.Compare(String, String, Boolean).
''' </summary>
''' <param name="words">Words to search; may be Nothing or empty.</param>
''' <param name="value">Text to look for.</param>
''' <param name="ignoreCase">True to compare without regard to case.</param>
''' <returns>The zero-based index of the first match, or -1 when there is none.</returns>
Private Shared Function FindWord(ByVal words As IList(Of OcrWord), ByVal value As String, ByVal ignoreCase As Boolean) As Integer
    ' A missing list is treated exactly like an empty one: nothing to scan
    Dim total As Integer = 0
    If words IsNot Nothing Then
        total = words.Count
    End If

    Dim position As Integer = 0
    While position < total
        If String.Compare(words(position).Value, value, ignoreCase) = 0 Then
            ' First match wins
            Return position
        End If
        position += 1
    End While

    ' Scanned everything without a match
    Return -1
End Function
''' <summary>
''' Holder for constants shared by the example code.
''' </summary>
Public NotInheritable Class LEAD_VARS
''' <summary>
''' Directory the example reads its input image from and writes its output PDF files to.
''' </summary>
Public Const ImagesDir As String = "C:\Users\Public\Documents\LEADTOOLS Images"
End Class
/// <summary>
/// Recognizes Ocr1.tif, saves the untouched recognition results as Ocr1_Original.pdf,
/// then rewrites selected words in the results ("the" becomes "THE", every "a" is removed,
/// "color" becomes "water") and saves the edited results as Ocr1_Modified.pdf.
/// </summary>
private void OcrUpdateWordExample()
{
    string imagesDir = LEAD_VARS.ImagesDir;
    string sourceImagePath = Path.Combine(imagesDir, "Ocr1.tif");
    string originalPdfPath = Path.Combine(imagesDir, "Ocr1_Original.pdf");
    string modifiedPdfPath = Path.Combine(imagesDir, "Ocr1_Modified.pdf");

    using (IOcrEngine engine = OcrEngineManager.CreateEngine(OcrEngineType.Professional, false))
    {
        engine.Startup(null, null, null, null);

        using (IOcrDocument document = engine.DocumentManager.CreateDocument())
        {
            // Load the TIFF as a page and run recognition on it
            IOcrPage page = document.Pages.AddPage(sourceImagePath, null);
            page.Recognize(null);

            // Keep a copy of the unmodified results so the two PDFs can be compared
            document.Save(originalPdfPath, DocumentFormat.Pdf, null);

            // Walk the recognized characters zone by zone
            IOcrPageCharacters recognized = page.GetRecognizedCharacters();
            foreach (IOcrZoneCharacters zone in recognized)
            {
                IList<OcrWord> zoneWords = zone.GetWords(page.DpiX, page.DpiY, LogicalUnit.Pixel);

                // The word list is never changed while it is being enumerated. Instead, each pass
                // asks FindWord for the next match, updates that single word, and searches again
                // from the start until FindWord reports -1.
                // NOTE(review): this relies on UpdateWord also refreshing zoneWords, otherwise the
                // same index would be found forever - confirm against the LEADTOOLS documentation.

                // Upper-case every "the". The search is case-sensitive, so "THE" is not matched again.
                int hit = FindWord(zoneWords, "the", false);
                while (hit != -1)
                {
                    recognized.UpdateWord(zoneWords, zone.ZoneIndex, hit, "THE");
                    hit = FindWord(zoneWords, "the", false);
                }

                // Remove every "a" (any casing) by passing null as the new value
                hit = FindWord(zoneWords, "a", true);
                while (hit != -1)
                {
                    recognized.UpdateWord(zoneWords, zone.ZoneIndex, hit, null);
                    hit = FindWord(zoneWords, "a", true);
                }

                // Swap every "color" (any casing) for "water"
                hit = FindWord(zoneWords, "color", true);
                while (hit != -1)
                {
                    recognized.UpdateWord(zoneWords, zone.ZoneIndex, hit, "water");
                    hit = FindWord(zoneWords, "color", true);
                }
            }

            // Push the edited characters back into the page
            page.SetRecognizedCharacters(recognized);

            // Save the modified results
            document.Save(modifiedPdfPath, DocumentFormat.Pdf, null);
        }
    }
}
/// <summary>
/// Returns the index of the first word in <paramref name="words"/> whose Value equals
/// <paramref name="value"/> according to string.Compare(string, string, bool).
/// </summary>
/// <param name="words">Words to search; may be null or empty.</param>
/// <param name="value">Text to look for.</param>
/// <param name="ignoreCase">true to compare without regard to case.</param>
/// <returns>The zero-based index of the first match, or -1 when there is none.</returns>
private static int FindWord(IList<OcrWord> words, string value, bool ignoreCase)
{
    // A missing list is treated exactly like an empty one: nothing to scan
    int total = (words == null) ? 0 : words.Count;

    int position = 0;
    while (position < total)
    {
        if (string.Compare(words[position].Value, value, ignoreCase) == 0)
        {
            // First match wins
            return position;
        }

        position++;
    }

    // Scanned everything without a match
    return -1;
}
/// <summary>
/// Holder for constants shared by the example code.
/// </summary>
static class LEAD_VARS
{
/// <summary>
/// Directory the example reads its input image from and writes its output PDF files to.
/// </summary>
public const string ImagesDir = @"C:\Users\Public\Documents\LEADTOOLS Images";
}