LEADTOOLS OCR (Leadtools.Forms.Ocr assembly)
LEAD Technologies, Inc

UpdateWord Method

Example 





A list of OcrWord. In most cases, the same list obtained from IOcrZoneCharacters.GetWords.
The 0-based zone index of the words.
The 0-based index of the word in words to update.
The value of the new word. Use null to delete the word.
Updates or deletes a word in the recognized words list. .NET support
Syntax
void UpdateWord( 
   IList<OcrWord> words,
   int zoneIndex,
   int wordIndex,
   string newValue
)
function Leadtools.Forms.Ocr.IOcrPageCharacters.UpdateWord( 
   words ,
   zoneIndex ,
   wordIndex ,
   newValue 
)
void UpdateWord( 
   IList<OcrWord>^ words,
   int zoneIndex,
   int wordIndex,
   String^ newValue
) 
'Declaration
 
Sub UpdateWord( _
   ByVal words As IList(Of OcrWord), _
   ByVal zoneIndex As Integer, _
   ByVal wordIndex As Integer, _
   ByVal newValue As String _
) 
 
'Usage
 
Dim instance As IOcrPageCharacters
Dim words As IList(Of OcrWord)
Dim zoneIndex As Integer
Dim wordIndex As Integer
Dim newValue As String
 
instance.UpdateWord(words, zoneIndex, wordIndex, newValue)

Parameters

words
A list of OcrWord. In most cases, the same list obtained from IOcrZoneCharacters.GetWords.
zoneIndex
The 0-based zone index of the words.
wordIndex
The 0-based index of the word in words to update.
newValue
The value of the new word. Use null to delete the word.
Remarks

You can use UpdateWord to modify the OCR recognition results by updating or deleting words before optionally saving the results to the final output document. The C# and VB OCR Edit Demos use this technique, as does the example below.

Example
Copy CodeCopy Code  
''' <summary>
''' Recognizes Ocr1.tif, saves the unmodified recognition results as a PDF, then edits the
''' recognized words and saves the modified results as a second PDF for comparison.
''' </summary>
Private Sub OcrUpdateWordExample()
      Dim tifFileName As String = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1.tif")
      Dim pdfFileName1 As String = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Original.pdf")
      Dim pdfFileName2 As String = Path.Combine(LEAD_VARS.ImagesDir, "Ocr1_Modified.pdf")
      Using ocrEngine As IOcrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Professional, False)
         ocrEngine.Startup(Nothing, Nothing, Nothing, Nothing)
         Using ocrDocument As IOcrDocument = ocrEngine.DocumentManager.CreateDocument()
            ' Recognize the TIFF file
            Dim ocrPage As IOcrPage = ocrDocument.Pages.AddPage(tifFileName, Nothing)
            ocrPage.Recognize(Nothing)

            ' Save the original recognition results so they can be compared with the
            ' results we are about to modify
            ocrDocument.Save(pdfFileName1, DocumentFormat.Pdf, Nothing)

            ' Get the recognized words, one zone at a time
            Dim pageCharacters As IOcrPageCharacters = ocrPage.GetRecognizedCharacters()
            For Each zoneCharacters As IOcrZoneCharacters In pageCharacters
               Dim words As IList(Of OcrWord) = zoneCharacters.GetWords(ocrPage.DpiX, ocrPage.DpiY, LogicalUnit.Pixel)

               ' Note, we do not enumerate words while changing it. Instead, ReplaceAllWords
               ' repeatedly looks up the index of the next match and updates that single entry.

               ' Capitalize all "the" (case-sensitive search, so "THE" is not matched again)
               ReplaceAllWords(pageCharacters, words, zoneCharacters.ZoneIndex, "the", False, "THE")

               ' Delete all "a" (Nothing as the new value deletes the word)
               ReplaceAllWords(pageCharacters, words, zoneCharacters.ZoneIndex, "a", True, Nothing)

               ' Replace all "color" with "water"
               ReplaceAllWords(pageCharacters, words, zoneCharacters.ZoneIndex, "color", True, "water")
            Next

            ' We are done, update the page recognized results
            ocrPage.SetRecognizedCharacters(pageCharacters)

            ' Save the new results
            ocrDocument.Save(pdfFileName2, DocumentFormat.Pdf, Nothing)
         End Using
      End Using
   End Sub

   ''' <summary>
   ''' Updates (or deletes, when <paramref name="newValue"/> is Nothing) every word in
   ''' <paramref name="words"/> that FindWord matches against <paramref name="oldValue"/>.
   ''' </summary>
   ''' <param name="pageCharacters">The page characters whose UpdateWord method is called.</param>
   ''' <param name="words">The word list of the zone being edited.</param>
   ''' <param name="zoneIndex">The 0-based index of the zone that owns <paramref name="words"/>.</param>
   ''' <param name="oldValue">The word to search for.</param>
   ''' <param name="ignoreCase">True to search case-insensitively.</param>
   ''' <param name="newValue">The replacement word, or Nothing to delete each match.</param>
   ''' <remarks>
   ''' The loop ends only when FindWord no longer finds a match. This assumes UpdateWord reflects
   ''' each change in <paramref name="words"/>, and it requires that <paramref name="newValue"/>
   ''' does not itself match <paramref name="oldValue"/> under the same case rule.
   ''' </remarks>
   Private Shared Sub ReplaceAllWords(ByVal pageCharacters As IOcrPageCharacters, ByVal words As IList(Of OcrWord), ByVal zoneIndex As Integer, ByVal oldValue As String, ByVal ignoreCase As Boolean, ByVal newValue As String)
      Dim index As Integer = FindWord(words, oldValue, ignoreCase)
      While index <> -1
         ' We have one, update or delete it, then look for the next match
         pageCharacters.UpdateWord(words, zoneIndex, index, newValue)
         index = FindWord(words, oldValue, ignoreCase)
      End While
   End Sub

   ''' <summary>
   ''' Returns the 0-based index of the first word whose Value equals <paramref name="value"/>,
   ''' or -1 when <paramref name="words"/> is Nothing, is empty, or contains no match.
   ''' </summary>
   ''' <param name="words">The list of words to search.</param>
   ''' <param name="value">The word text to look for.</param>
   ''' <param name="ignoreCase">True to compare case-insensitively.</param>
   Private Shared Function FindWord(ByVal words As IList(Of OcrWord), ByVal value As String, ByVal ignoreCase As Boolean) As Integer
      ' Current-culture comparison, i.e. the same rules String.Compare(a, b, ignoreCase) applies
      Dim comparison As System.StringComparison = System.StringComparison.CurrentCulture
      If ignoreCase Then
         comparison = System.StringComparison.CurrentCultureIgnoreCase
      End If

      Dim position As Integer = -1
      If words IsNot Nothing Then
         Dim candidate As Integer = 0
         ' Stop at the first match or at the end of the list
         While position = -1 AndAlso candidate < words.Count
            If String.Equals(words(candidate).Value, value, comparison) Then
               position = candidate
            End If
            candidate += 1
         End While
      End If

      Return position
   End Function

''' <summary>
''' Holds the image directory constant used by OcrUpdateWordExample.
''' </summary>
Public NotInheritable Class LEAD_VARS
   ''' <summary>Directory that OcrUpdateWordExample combines with its input and output file names.</summary>
   Public Const ImagesDir As String = "C:\Users\Public\Documents\LEADTOOLS Images"
End Class
/// <summary>
/// Recognizes Ocr1.tif, saves the unmodified recognition results as a PDF, then edits the
/// recognized words and saves the modified results as a second PDF for comparison.
/// </summary>
private void OcrUpdateWordExample()
   {
      string tifFileName = Path.Combine(LEAD_VARS.ImagesDir,"Ocr1.tif");
      string pdfFileName1 = Path.Combine(LEAD_VARS.ImagesDir,"Ocr1_Original.pdf");
      string pdfFileName2 = Path.Combine(LEAD_VARS.ImagesDir,"Ocr1_Modified.pdf");
      using(IOcrEngine ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Professional, false))
      {
         ocrEngine.Startup(null, null, null, null);
         using(IOcrDocument ocrDocument = ocrEngine.DocumentManager.CreateDocument())
         {
            // Recognize the TIFF file
            IOcrPage ocrPage = ocrDocument.Pages.AddPage(tifFileName, null);
            ocrPage.Recognize(null);

            // Save the original recognition results so they can be compared with the
            // results we are about to modify
            ocrDocument.Save(pdfFileName1, DocumentFormat.Pdf, null);

            // Get the recognized words, one zone at a time
            IOcrPageCharacters pageCharacters = ocrPage.GetRecognizedCharacters();
            foreach(IOcrZoneCharacters zoneCharacters in pageCharacters)
            {
               IList<OcrWord> words = zoneCharacters.GetWords(ocrPage.DpiX, ocrPage.DpiY, LogicalUnit.Pixel);

               // Note, we do not enumerate words while changing it. Instead, ReplaceAllWords
               // repeatedly looks up the index of the next match and updates that single entry.

               // Capitalize all "the" (case-sensitive search, so "THE" is not matched again)
               ReplaceAllWords(pageCharacters, words, zoneCharacters.ZoneIndex, "the", false, "THE");

               // Delete all "a" (null as the new value deletes the word)
               ReplaceAllWords(pageCharacters, words, zoneCharacters.ZoneIndex, "a", true, null);

               // Replace all "color" with "water"
               ReplaceAllWords(pageCharacters, words, zoneCharacters.ZoneIndex, "color", true, "water");
            }

            // We are done, update the page recognized results
            ocrPage.SetRecognizedCharacters(pageCharacters);

            // Save the new results
            ocrDocument.Save(pdfFileName2, DocumentFormat.Pdf, null);
         }
      }
   }

   /// <summary>
   /// Updates (or deletes, when <paramref name="newValue"/> is null) every word in
   /// <paramref name="words"/> that FindWord matches against <paramref name="oldValue"/>.
   /// </summary>
   /// <param name="pageCharacters">The page characters whose UpdateWord method is called.</param>
   /// <param name="words">The word list of the zone being edited.</param>
   /// <param name="zoneIndex">The 0-based index of the zone that owns <paramref name="words"/>.</param>
   /// <param name="oldValue">The word to search for.</param>
   /// <param name="ignoreCase">true to search case-insensitively.</param>
   /// <param name="newValue">The replacement word, or null to delete each match.</param>
   /// <remarks>
   /// The loop ends only when FindWord no longer finds a match. This assumes UpdateWord reflects
   /// each change in <paramref name="words"/>, and it requires that <paramref name="newValue"/>
   /// does not itself match <paramref name="oldValue"/> under the same case rule.
   /// </remarks>
   private static void ReplaceAllWords(IOcrPageCharacters pageCharacters, IList<OcrWord> words, int zoneIndex, string oldValue, bool ignoreCase, string newValue)
   {
      int index;
      while((index = FindWord(words, oldValue, ignoreCase)) != -1)
      {
         // We have one, update or delete it, then look for the next match
         pageCharacters.UpdateWord(words, zoneIndex, index, newValue);
      }
   }

   /// <summary>
   /// Returns the 0-based index of the first word whose Value equals <paramref name="value"/>,
   /// or -1 when <paramref name="words"/> is null, is empty, or contains no match.
   /// </summary>
   /// <param name="words">The list of words to search.</param>
   /// <param name="value">The word text to look for.</param>
   /// <param name="ignoreCase">true to compare case-insensitively.</param>
   private static int FindWord(IList<OcrWord> words, string value, bool ignoreCase)
   {
      // Current-culture comparison, i.e. the same rules string.Compare(a, b, ignoreCase) applies
      System.StringComparison comparison = ignoreCase
         ? System.StringComparison.CurrentCultureIgnoreCase
         : System.StringComparison.CurrentCulture;

      int position = -1;
      if(words != null)
      {
         // Stop at the first match or at the end of the list
         for(int candidate = 0; position == -1 && candidate < words.Count; candidate++)
         {
            if(string.Equals(words[candidate].Value, value, comparison))
            {
               position = candidate;
            }
         }
      }

      return position;
   }

/// <summary>
/// Holds the image directory constant used by OcrUpdateWordExample.
/// </summary>
static class LEAD_VARS
{
   /// <summary>Directory that OcrUpdateWordExample combines with its input and output file names.</summary>
   public const string ImagesDir = @"C:\Users\Public\Documents\LEADTOOLS Images";
}
Requirements

Target Platforms: Windows 7, Windows Vista SP1 or later, Windows XP SP3, Windows Server 2008 (Server Core not supported), Windows Server 2008 R2 (Server Core supported with SP1 or later), Windows Server 2003 SP2

See Also

Reference

IOcrPageCharacters Interface
IOcrPageCharacters Members
IOcrPage.SetRecognizedCharacters
IOcrPage.GetRecognizedCharacters
IOcrPage.Recognize
IOcrPage.IsRecognized
OcrCharacter Structure
IOcrPageCharacters Interface
IOcrZoneCharacters Interface
IOcrPageCollection Interface
IOcrZoneCollection Interface
OcrZone Structure
Programming with the LEADTOOLS .NET OCR

 

 


Products | Support | Contact Us | Copyright Notices

© 2006-2012 All Rights Reserved. LEAD Technologies, Inc.

UpdateWord requires an OCR module license and unlock key. For more information, refer to: Imaging Pro/Document/Medical Features