words
A list of OcrWord items. In most cases, this is the same list obtained from IOcrZoneCharacters.GetWords.
zoneIndex
The 0-based zone index of the words.
wordIndex
The 0-based index of the word in words to update.
newValue
The value of the new word. Use null to delete the word.
You can use UpdateWord to modify the OCR recognition results by updating or deleting words before optionally saving the results to the final output document. Both the C# OCR Edit Demo and the example below use this technique.
This example recognizes a document, then capitalizes every "the", deletes every "a", and replaces every instance of "color" with "water".
using Leadtools;
using Leadtools.Codecs;
using Leadtools.Ocr;
using Leadtools.Forms.Common;
using Leadtools.Document.Writer;
using Leadtools.WinForms;
using Leadtools.Drawing;
using Leadtools.ImageProcessing;
using Leadtools.ImageProcessing.Color;
/// <summary>
/// Recognizes Ocr1.tif, saves the untouched results to one PDF, then edits the recognized
/// words (upper-cases "the", deletes "a", swaps "color" for "water") and saves the edited
/// results to a second PDF so the two outputs can be compared.
/// </summary>
public void OcrUpdateWordExample()
{
   string sourceImagePath = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1.tif");
   string originalPdfPath = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Original.pdf");
   string modifiedPdfPath = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Modified.pdf");

   using (IOcrEngine ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.LEAD))
   {
      ocrEngine.Startup(null, null, null, LEAD_VARS.OcrLEADRuntimeDir);

      using (IOcrDocument ocrDocument = ocrEngine.DocumentManager.CreateDocument())
      {
         // Load the TIFF as a page and run recognition on it
         IOcrPage ocrPage = ocrDocument.Pages.AddPage(sourceImagePath, null);
         ocrPage.Recognize(null);

         // Keep a copy of the unmodified results as the baseline for comparison
         ocrDocument.Save(originalPdfPath, DocumentFormat.Pdf, null);

         // Walk the recognized characters zone by zone
         IOcrPageCharacters pageCharacters = ocrPage.GetRecognizedCharacters();
         foreach (IOcrZoneCharacters zoneCharacters in pageCharacters)
         {
            IList<OcrWord> zoneWords = zoneCharacters.GetWords();

            // The word list is never enumerated while it is being edited. Instead, each pass
            // asks FindWord for the next match, updates that one word, and searches again
            // until FindWord reports -1.
            int matchIndex;

            // Upper-case every "the". The search is case-sensitive, so a word already
            // replaced by "THE" is not picked up again.
            while ((matchIndex = FindWord(zoneWords, "the", false)) != -1)
            {
               pageCharacters.UpdateWord(zoneWords, zoneCharacters.ZoneIndex, matchIndex, "THE");
            }

            // Remove every "a" (any casing); a null new value deletes the word
            while ((matchIndex = FindWord(zoneWords, "a", true)) != -1)
            {
               pageCharacters.UpdateWord(zoneWords, zoneCharacters.ZoneIndex, matchIndex, null);
            }

            // Swap every "color" (any casing) for "water"
            while ((matchIndex = FindWord(zoneWords, "color", true)) != -1)
            {
               pageCharacters.UpdateWord(zoneWords, zoneCharacters.ZoneIndex, matchIndex, "water");
            }
         }

         // Push the edited characters back into the page
         ocrPage.SetRecognizedCharacters(pageCharacters);

         // Write out the edited results
         ocrDocument.Save(modifiedPdfPath, DocumentFormat.Pdf, null);
      }
   }
}
/// <summary>
/// Searches <paramref name="words"/> from the start for the first word whose Value matches
/// <paramref name="value"/>.
/// </summary>
/// <param name="words">The words to search. May be null or empty.</param>
/// <param name="value">The text to look for.</param>
/// <param name="ignoreCase">Passed to string.Compare; true to match regardless of casing.</param>
/// <returns>The 0-based index of the first matching word, or -1 when there is none.</returns>
private static int FindWord(IList<OcrWord> words, string value, bool ignoreCase)
{
   // A missing list has nothing to search; an empty one simply never enters the loop
   if (words != null)
   {
      int position = 0;
      while (position < words.Count)
      {
         bool isMatch = string.Compare(words[position].Value, value, ignoreCase) == 0;
         if (isMatch)
         {
            return position;
         }

         position++;
      }
   }

   // No word matched
   return -1;
}
/// <summary>
/// Fixed file-system locations used by the example.
/// </summary>
static class LEAD_VARS
{
   /// <summary>Directory passed to the OCR engine's Startup call as the LEAD runtime location.</summary>
   public const string OcrLEADRuntimeDir = @"C:\LEADTOOLS23\Bin\Common\OcrLEADRuntime";

   /// <summary>Directory the example reads its input image from and writes its PDFs to.</summary>
   public const string ImagesDir = @"C:\LEADTOOLS23\Resources\Images";
}