Leadtools.Forms.Ocr Namespace > IOcrPageCharacters Interface : UpdateWord Method |
'Declaration Sub UpdateWord( _ ByVal words As IList(Of OcrWord), _ ByVal zoneIndex As Integer, _ ByVal wordIndex As Integer, _ ByVal newValue As String _ )
'Usage Dim instance As IOcrPageCharacters Dim words As IList(Of OcrWord) Dim zoneIndex As Integer Dim wordIndex As Integer Dim newValue As String instance.UpdateWord(words, zoneIndex, wordIndex, newValue)
void UpdateWord(
   Windows.Foundation.Collections.IVector<OcrWord> words, // In WinRT the IList interface is replaced by IVector
   int zoneIndex,
   int wordIndex,
   string newValue
)
You can use UpdateWord to modify the OCR recognition results by updating or deleting words before optionally saving the results to the final output document. To delete a word, pass null (Nothing in VB) as newValue, as the "Delete all" step of the example below does. The C# and VB OCR Edit Demos use this technique, as does the example below.
Imports System.Collections.Generic
Imports System.IO
Imports Leadtools
Imports Leadtools.Codecs
Imports Leadtools.Forms.Ocr
Imports Leadtools.Forms
Imports Leadtools.Forms.DocumentWriters
Imports Leadtools.WinForms
Imports Leadtools.ImageProcessing.Core
Imports Leadtools.Drawing

''' <summary>
''' Recognizes Ocr1.tif, saves the untouched results to Ocr1_Original.pdf, then
''' edits the recognized words of every zone (upper-cases "the", deletes "a",
''' replaces "color" with "water") and saves the edited results to
''' Ocr1_Modified.pdf so the two files can be compared.
''' </summary>
Private Sub OcrUpdateWordExample()
   Dim tifFileName As String = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1.tif")
   Dim pdfFileName1 As String = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Original.pdf")
   Dim pdfFileName2 As String = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Modified.pdf")

   Using ocrEngine As IOcrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, False)
      ocrEngine.Startup(Nothing, Nothing, Nothing, LEAD_VARS.OcrAdvantageRuntimeDir)

      Using ocrDocument As IOcrDocument = ocrEngine.DocumentManager.CreateDocument()
         ' Recognize the TIFF file
         Dim ocrPage As IOcrPage = ocrDocument.Pages.AddPage(tifFileName, Nothing)
         ocrPage.Recognize(Nothing)

         ' Save the original recognition results to compare with the results
         ' we will modify
         ocrDocument.Save(pdfFileName1, DocumentFormat.Pdf, Nothing)

         ' Get the recognized characters and edit them one zone at a time
         Dim pageCharacters As IOcrPageCharacters = ocrPage.GetRecognizedCharacters()

         For Each zoneCharacters As IOcrZoneCharacters In pageCharacters
            Dim words As IList(Of OcrWord) = zoneCharacters.GetWords(ocrPage.DpiX, ocrPage.DpiY, LogicalUnit.Pixel)
            Dim zoneIndex As Integer = zoneCharacters.ZoneIndex

            ' Capitalize all "the" (case-sensitive search, so "THE" is not found again)
            ReplaceAllWords(pageCharacters, words, zoneIndex, "the", False, "THE")

            ' Delete all "a" (Nothing as the new value removes the word)
            ReplaceAllWords(pageCharacters, words, zoneIndex, "a", True, Nothing)

            ' Replace all "color" with "water"
            ReplaceAllWords(pageCharacters, words, zoneIndex, "color", True, "water")
         Next

         ' We are done, update the page recognized results
         ocrPage.SetRecognizedCharacters(pageCharacters)

         ' Save the modified results
         ocrDocument.Save(pdfFileName2, DocumentFormat.Pdf, Nothing)
      End Using
   End Using
End Sub

''' <summary>
''' Replaces (or, when <paramref name="newValue"/> is Nothing, deletes) every word
''' in <paramref name="words"/> that matches <paramref name="value"/>.
''' </summary>
''' <param name="pageCharacters">Page characters object whose UpdateWord method is called.</param>
''' <param name="words">Words of the zone being edited.</param>
''' <param name="zoneIndex">Index of the zone the words belong to.</param>
''' <param name="value">Word to search for.</param>
''' <param name="ignoreCase">True to match regardless of case.</param>
''' <param name="newValue">Replacement text, or Nothing to delete the word.</param>
Private Shared Sub ReplaceAllWords(ByVal pageCharacters As IOcrPageCharacters, ByVal words As IList(Of OcrWord), ByVal zoneIndex As Integer, ByVal value As String, ByVal ignoreCase As Boolean, ByVal newValue As String)
   ' If the replacement still matches the search term, every update would be
   ' found again by the next search and the loop below would never end.
   If newValue IsNot Nothing AndAlso String.Compare(newValue, value, ignoreCase) = 0 Then
      Return
   End If

   ' Do not enumerate words while changing it: look up one index at a time and
   ' search again after each update, repeating until no more matches are found.
   ' NOTE(review): termination assumes UpdateWord reflects the change in words - confirm.
   Dim index As Integer = FindWord(words, value, ignoreCase)
   While index <> -1
      pageCharacters.UpdateWord(words, zoneIndex, index, newValue)
      index = FindWord(words, value, ignoreCase)
   End While
End Sub

''' <summary>
''' Returns the index of the first word whose Value equals <paramref name="value"/>,
''' or -1 when the list is Nothing, empty, or contains no match.
''' </summary>
''' <param name="words">Words to search.</param>
''' <param name="value">Text to look for.</param>
''' <param name="ignoreCase">True to compare regardless of case.</param>
Private Shared Function FindWord(ByVal words As IList(Of OcrWord), ByVal value As String, ByVal ignoreCase As Boolean) As Integer
   If words Is Nothing OrElse words.Count = 0 Then
      Return -1
   End If

   For i As Integer = 0 To words.Count - 1
      If String.Compare(words(i).Value, value, ignoreCase) = 0 Then
         ' Found it
         Return i
      End If
   Next

   ' Not found
   Return -1
End Function

''' <summary>
''' Paths used by the example: the sample images folder and the OCR Advantage runtime folder.
''' </summary>
Public NotInheritable Class LEAD_VARS
   Public Const ImagesDir As String = "C:\Users\Public\Documents\LEADTOOLS Images"
   Public Const OcrAdvantageRuntimeDir As String = "C:\LEADTOOLS 18\Bin\Common\OcrAdvantageRuntime"
End Class
using System.Collections.Generic;
using System.IO;
using Leadtools;
using Leadtools.Codecs;
using Leadtools.Forms.Ocr;
using Leadtools.Forms;
using Leadtools.Forms.DocumentWriters;
using Leadtools.WinForms;
using Leadtools.Drawing;

/// <summary>
/// Recognizes Ocr1.tif, saves the untouched results to Ocr1_Original.pdf, then
/// edits the recognized words of every zone (upper-cases "the", deletes "a",
/// replaces "color" with "water") and saves the edited results to
/// Ocr1_Modified.pdf so the two files can be compared.
/// </summary>
private void OcrUpdateWordExample()
{
    string tifFileName = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1.tif");
    string pdfFileName1 = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Original.pdf");
    string pdfFileName2 = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Modified.pdf");

    using (IOcrEngine ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, false))
    {
        ocrEngine.Startup(null, null, null, LEAD_VARS.OcrAdvantageRuntimeDir);

        using (IOcrDocument ocrDocument = ocrEngine.DocumentManager.CreateDocument())
        {
            // Recognize the TIFF file
            IOcrPage ocrPage = ocrDocument.Pages.AddPage(tifFileName, null);
            ocrPage.Recognize(null);

            // Save the original recognition results to compare with the results
            // we will modify
            ocrDocument.Save(pdfFileName1, DocumentFormat.Pdf, null);

            // Get the recognized characters and edit them one zone at a time
            IOcrPageCharacters pageCharacters = ocrPage.GetRecognizedCharacters();

            foreach (IOcrZoneCharacters zoneCharacters in pageCharacters)
            {
                IList<OcrWord> words = zoneCharacters.GetWords(ocrPage.DpiX, ocrPage.DpiY, LogicalUnit.Pixel);
                int zoneIndex = zoneCharacters.ZoneIndex;

                // Capitalize all "the" (case-sensitive search, so "THE" is not found again)
                ReplaceAllWords(pageCharacters, words, zoneIndex, "the", false, "THE");

                // Delete all "a" (null as the new value removes the word)
                ReplaceAllWords(pageCharacters, words, zoneIndex, "a", true, null);

                // Replace all "color" with "water"
                ReplaceAllWords(pageCharacters, words, zoneIndex, "color", true, "water");
            }

            // We are done, update the page recognized results
            ocrPage.SetRecognizedCharacters(pageCharacters);

            // Save the modified results
            ocrDocument.Save(pdfFileName2, DocumentFormat.Pdf, null);
        }
    }
}

/// <summary>
/// Replaces (or, when <paramref name="newValue"/> is null, deletes) every word
/// in <paramref name="words"/> that matches <paramref name="value"/>.
/// </summary>
/// <param name="pageCharacters">Page characters object whose UpdateWord method is called.</param>
/// <param name="words">Words of the zone being edited.</param>
/// <param name="zoneIndex">Index of the zone the words belong to.</param>
/// <param name="value">Word to search for.</param>
/// <param name="ignoreCase">True to match regardless of case.</param>
/// <param name="newValue">Replacement text, or null to delete the word.</param>
private static void ReplaceAllWords(IOcrPageCharacters pageCharacters, IList<OcrWord> words, int zoneIndex, string value, bool ignoreCase, string newValue)
{
    // If the replacement still matches the search term, every update would be
    // found again by the next search and the loop below would never end.
    if (newValue != null && string.Compare(newValue, value, ignoreCase) == 0)
    {
        return;
    }

    // Do not enumerate words while changing it: look up one index at a time and
    // search again after each update, repeating until no more matches are found.
    // NOTE(review): termination assumes UpdateWord reflects the change in words - confirm.
    int index = FindWord(words, value, ignoreCase);
    while (index != -1)
    {
        pageCharacters.UpdateWord(words, zoneIndex, index, newValue);
        index = FindWord(words, value, ignoreCase);
    }
}

/// <summary>
/// Returns the index of the first word whose Value equals <paramref name="value"/>,
/// or -1 when the list is null, empty, or contains no match.
/// </summary>
/// <param name="words">Words to search.</param>
/// <param name="value">Text to look for.</param>
/// <param name="ignoreCase">True to compare regardless of case.</param>
private static int FindWord(IList<OcrWord> words, string value, bool ignoreCase)
{
    if (words == null || words.Count == 0)
    {
        return -1;
    }

    for (int i = 0; i < words.Count; i++)
    {
        if (string.Compare(words[i].Value, value, ignoreCase) == 0)
        {
            // Found it
            return i;
        }
    }

    // Not found
    return -1;
}

/// <summary>
/// Paths used by the example: the sample images folder and the OCR Advantage runtime folder.
/// </summary>
static class LEAD_VARS
{
    public const string ImagesDir = @"C:\Users\Public\Documents\LEADTOOLS Images";
    public const string OcrAdvantageRuntimeDir = @"C:\LEADTOOLS 18\Bin\Common\OcrAdvantageRuntime";
}
IOcrPageCharacters Interface
IOcrPageCharacters Members
IOcrPage.SetRecognizedCharacters
IOcrPage.GetRecognizedCharacters
IOcrPage.Recognize
IOcrPage.IsRecognized
OcrCharacter Structure
IOcrPageCharacters Interface
IOcrZoneCharacters Interface
IOcrPageCollection Interface
IOcrZoneCollection Interface
OcrZone Structure
Programming with the LEADTOOLS .NET OCR