L_OcrPage_GetRecognizedCharacters

#include "ltocr.h"

L_LTOCR_API L_INT EXT_FUNCTION L_OcrPage_GetRecognizedCharacters(page, pageCharacters)

Gets the last recognized character data of this L_OcrPage.

Parameters

L_OcrPage page

Handle to the OCR page.

L_OcrPageCharacters* pageCharacters

Address of an L_OcrPageCharacters structure to be updated with the page's recognized characters. You should call L_OcrPage_FreePageCharacters on the 'pageCharacters' parameter to free its allocated memory when it is no longer needed.

Returns

Value Meaning
SUCCESS The function was successful.
< 1 An error occurred. Refer to Return Codes.

Comments

You must call this function after the L_OcrPage has been recognized with the L_OcrPage_Recognize function. If L_OcrPage_IsRecognized reports L_FALSE for this page, calling this function will return SUCCESS and the 'pageCharacters' parameter won't be updated.

You can use L_OcrPage_GetRecognizedCharacters to examine the recognized character data. This data contains information about the character codes, their confidence, guess codes, location and position in the page, as well as font information. For more information, refer to L_OcrCharacter.

If you wish to modify the recognition data and then apply it back to the page, use L_OcrPage_SetRecognizedCharacters.

Use L_OcrPage_GetZoneWords to get the recognized words of a zone.

Note: The LEADTOOLS OCR Module - LEAD Engine will not return any space characters when using the L_OcrPage_GetRecognizedCharacters method.

The L_OcrPage_SetRecognizedCharacters method will accept space characters in the LEADTOOLS LEAD engine. However, these space characters will be used when generating the final document (PDF) and might affect the final output. Therefore, it is not recommended that you insert space characters when using the LEADTOOLS LEAD engine.

Note: You should call L_OcrPage_FreePageCharacters on the 'pageCharacters' parameter to free its allocated memory when no longer needed.

Required DLLs and Libraries

See Also

Functions

Topics

Example

/*
 * Example for L_OcrPage_GetRecognizedCharacters.
 *
 * Draws three lines of text onto a new bitmap, recognizes the bitmap as an
 * OCR page, dumps the recognized words and characters of every zone to
 * std::wcout, capitalizes the first letter of each word, marks bold
 * characters as italic and underlined, writes the modified characters back
 * to the page and finally saves the page as a PDF/A document.
 *
 * Returns SUCCESS when every step succeeded; otherwise the return code of
 * the first LEADTOOLS call that failed. Every resource acquired up to the
 * point of failure is released before returning.
 */
L_INT L_OcrPage_GetRecognizedCharactersExample() 
{ 
   // All function-scope variables are declared up front so that the 
   // "goto cleanup" error path never jumps over an initialization. 
   BITMAPHANDLE bitmap = { 0 }; 
   L_OcrEngine ocrEngine = NULL; 
   L_OcrPage ocrPage = NULL; 
   L_OcrPageCharacters ocrPageCharacters = { 0 }; 
   L_OcrDocumentManager ocrDocumentManager = NULL; 
   L_OcrDocument ocrDocument = NULL; 
   L_OcrPageSortedZonesIndexMapOptions mapOptions = { 0 }; 
   DOCWRTPDFOPTIONS pdfOptions = { 0 }; 
   L_UINT* map = NULL; 
   L_UINT mapSize = 0; 
   L_UINT zoneCount = 0; 
   L_HDC LeadDC = NULL; 
   L_INT StartGDIX = 0;   /* Drawing coordinates */ 
   L_INT StartGDIY = 0; 
   L_INT EndGDIX = 0; 
   L_INT EndGDIY = 0; 
   bool havePageCharacters = false; 
 
   // Create an image to write text on 
   L_INT retCode = L_CreateBitmap(&bitmap, sizeof(BITMAPHANDLE), TYPE_CONV, 640, 200, 24, ORDER_BGR, NULL, TOP_LEFT, NULL, 0); 
   if(retCode != SUCCESS) 
      return retCode; 
 
   EndGDIX = BITMAPWIDTH(&bitmap); 
   EndGDIY = BITMAPHEIGHT(&bitmap); 
 
   // Create a device context to write with 
   LeadDC = L_CreateLeadDC(&bitmap); 
   if(LeadDC != NULL) 
   { 
      RECT drawArea; 
      HFONT hFont; 
      HFONT hNextFont; 
      // Remember the font the DC started with so it can be restored before 
      // the fonts created here are deleted 
      HGDIOBJ hOriginalFont = GetCurrentObject(LeadDC, OBJ_FONT); 
 
      // Correct viewer coordinates if necessary 
      if (bitmap.ViewPerspective != TOP_LEFT) 
      { 
         L_PointToBitmap(&bitmap, TOP_LEFT, &StartGDIX, &StartGDIY); 
         L_PointToBitmap(&bitmap, TOP_LEFT, &EndGDIX, &EndGDIY); 
      } 
 
      SelectObject(LeadDC, GetStockObject(WHITE_PEN)); 
      SelectObject(LeadDC, GetStockObject(NULL_BRUSH)); 
 
      SetRect(&drawArea, StartGDIX, StartGDIY, EndGDIX, EndGDIY); 
 
      // Make the image white. A stock brush is used so there is no GDI 
      // object to delete afterwards. 
      FillRect(LeadDC, &drawArea, (HBRUSH)GetStockObject(WHITE_BRUSH)); 
 
      // Set font properties for drawing 
      hFont = CreateFont(20, 0, 0, 0, FW_NORMAL, FALSE, FALSE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS, 
         CLIP_DEFAULT_PRECIS, DEFAULT_QUALITY, VARIABLE_PITCH, TEXT("Arial")); 
      SelectObject(LeadDC, hFont); 
 
      // Now write some text 
      SetRect(&drawArea, 0, 0, 100, 20); 
      int numChars = 11; 
      DrawText(LeadDC, TEXT("Normal line"), numChars, &drawArea, DT_TOP | DT_LEFT); 
 
      // Change font properties; the previous font is deleted only after it 
      // has been deselected from the DC 
      hNextFont = CreateFont(20, 0, 0, 0, FW_BOLD, TRUE, TRUE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS, 
         CLIP_DEFAULT_PRECIS, CLEARTYPE_QUALITY, VARIABLE_PITCH, TEXT("Arial")); 
      SelectObject(LeadDC, hNextFont); 
      DeleteObject(hFont); 
      hFont = hNextFont; 
 
      // Write a second line 
      SetRect(&drawArea, 0, 40, 200, 100); 
      numChars = 26; 
      DrawText(LeadDC, TEXT("Bold, italic and underline"), numChars, &drawArea, DT_TOP | DT_LEFT); 
 
      // Change font properties again 
      hNextFont = CreateFont(20, 0, 0, 0, FW_DONTCARE, FALSE, FALSE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS, 
         CLIP_DEFAULT_PRECIS, ANTIALIASED_QUALITY, VARIABLE_PITCH, TEXT("Courier New")); 
      SelectObject(LeadDC, hNextFont); 
      DeleteObject(hFont); 
      hFont = hNextFont; 
 
      // Write a third line 
      SetRect(&drawArea, 0, 80, 160, 100); 
      numChars = 15; 
      DrawText(LeadDC, TEXT("Monospaced line"), numChars, &drawArea, DT_TOP | DT_LEFT); 
 
      // Deselect our last font before deleting it 
      if(hOriginalFont != NULL) 
         SelectObject(LeadDC, hOriginalFont); 
      DeleteObject(hFont); 
 
      // We don't need this context anymore, so free it 
      L_DeleteLeadDC(LeadDC); 
      LeadDC = NULL; 
   } 
 
   // Create an instance of the engine 
   retCode = L_OcrEngineManager_CreateEngine(L_OcrEngineType_Advantage, &ocrEngine); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Start the engine using default parameters 
   retCode = L_OcrEngine_Startup(ocrEngine, NULL, OCR_ADVANTAGE_RUNTIME_DIR); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Add this image to an OCR page 
   retCode = L_OcrPage_FromBitmap(ocrEngine, &ocrPage, &bitmap, L_OcrBitmapSharingMode_AutoFree, NULL, NULL); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Ownership of the bitmap was transferred to the page (AutoFree), so make 
   // sure the cleanup code below does not free it a second time 
   bitmap.Flags.Allocated = 0; 
 
   // Recognize this page 
   retCode = L_OcrPage_Recognize(ocrPage, NULL, NULL); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Get the recognized characters so they can be dumped to standard output 
   ocrPageCharacters.StructSize = sizeof(L_OcrPageCharacters); 
   retCode = L_OcrPage_GetRecognizedCharacters(ocrPage, &ocrPageCharacters); 
   if(retCode != SUCCESS) 
      goto cleanup; 
   havePageCharacters = true; 
 
   mapOptions.StructSize = sizeof(L_OcrPageSortedZonesIndexMapOptions); 
   mapOptions.Flags = L_OcrPageSortedZonesIndexMapFlags_TableCellsAsOne; 
 
   retCode = L_OcrPage_GetSortedZonesIndexMap(ocrPage, &mapOptions, &map, &mapSize); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   retCode = L_OcrPage_GetZoneCount(ocrPage, &zoneCount); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Nothing to iterate over if the page carries no character data 
   if(ocrPageCharacters.ZoneCharacters == NULL) 
      zoneCount = 0; 
 
   for(L_UINT zoneNum = 0; zoneNum < zoneCount; zoneNum++) 
   { 
      // Resolve the zone for this position in the sorted order. The same 
      // index is used for the words and for the characters so both outputs 
      // describe the same zone. 
      L_UINT zoneIndex = (map != NULL && zoneNum < mapSize) ? map[zoneNum] : zoneNum; 
 
      // Get the recognized words 
      L_OcrWords ocrWords = { 0 }; 
      ocrWords.StructSize = sizeof(L_OcrWords); 
      retCode = L_OcrPage_GetZoneWords(&ocrPageCharacters, zoneIndex, &ocrWords); 
      if(retCode != SUCCESS) 
         goto cleanup; 
 
      std::wcout << L"Words in zone " << zoneIndex << L":\n"; 
 
      for(L_UINT wordIndex = 0; wordIndex < ocrWords.WordCount; wordIndex++) 
      { 
         const L_OcrWord& ocrWord = ocrWords.Words[wordIndex]; 
 
         // Output word info 
         std::wcout << L"Word: " << ocrWord.Buffer << L", at ("  
            << ocrWord.Bounds.left << L", " << ocrWord.Bounds.top 
            << L", " << ocrWord.Bounds.right << L", "  
            << ocrWord.Bounds.bottom << L"), characters index from "  
            << ocrWord.FirstCharacterIndex << L" to "  
            << ocrWord.LastCharacterIndex << std::endl; 
      } 
 
      // Get the data on the individual characters of THIS zone 
      L_OcrZoneCharacters* zoneChars = &ocrPageCharacters.ZoneCharacters[zoneIndex]; 
 
      bool nextCharacterIsNewWord = true; 
      for(L_UINT charIndex = 0; charIndex < zoneChars->CharacterCount; charIndex++) 
      { 
         // Work directly on our copy of the page data 
         L_OcrCharacter* ocrCharacter = &zoneChars->Characters[charIndex]; 
 
         // Capitalize the first letter if this is a new word. toupper() is 
         // only defined for values representable as unsigned char, so 
         // restrict it to ASCII and leave every other code untouched. 
         if (nextCharacterIsNewWord && ocrCharacter->Code < 0x80) 
            ocrCharacter->Code = (L_WCHAR)toupper((int)ocrCharacter->Code); 
 
         // Output individual character information 
         std::wcout << L"Code: "             << ocrCharacter->Code  
            << L", Confidence: "     << ocrCharacter->Confidence  
            << L", WordIsCertain: "  << ocrCharacter->WordIsCertain 
            << L", Bounds: ("        << ocrCharacter->Bounds.left << L", " << ocrCharacter->Bounds.top << L", " 
            << ocrCharacter->Bounds.right << L", " << ocrCharacter->Bounds.bottom 
            << L") , Position: "       << ocrCharacter->Positions 
            << L", FontSize: "       << ocrCharacter->FontSize 
            << L", FontStyle: "      << ocrCharacter->FontStyles  
            << std::endl; 
 
         // If the character is bold, also make it italic and underlined 
         if ((ocrCharacter->FontStyles & L_OcrCharacterFontStyles_Bold) == L_OcrCharacterFontStyles_Bold) 
         { 
            ocrCharacter->FontStyles |= L_OcrCharacterFontStyles_Italic; 
            ocrCharacter->FontStyles |= L_OcrCharacterFontStyles_Underline; 
         } 
 
         // The next character starts a new word if this one ends a word or a line 
         nextCharacterIsNewWord = 
            (ocrCharacter->Positions & L_OcrCharacterPositions_EndOfWord) == L_OcrCharacterPositions_EndOfWord || 
            (ocrCharacter->Positions & L_OcrCharacterPositions_EndOfLine) == L_OcrCharacterPositions_EndOfLine; 
      } 
 
      // For output spacing 
      std::wcout << std::endl; 
 
      // Free this now that we are done with it 
      L_OcrPage_FreeWords(&ocrWords); 
   } 
 
   // Update the engine with our character changes 
   retCode = L_OcrPage_SetRecognizedCharacters(ocrPage, &ocrPageCharacters); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Release the data 
   L_OcrPage_FreePageCharacters(&ocrPageCharacters); 
   havePageCharacters = false; 
 
   // Get the document manager used to configure and create documents 
   retCode = L_OcrEngine_GetDocumentManager(ocrEngine, &ocrDocumentManager); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Set the PDF options to save as PDF/A text only 
   pdfOptions.Options.uStructSize = sizeof(DOCWRTPDFOPTIONS); 
   retCode = L_OcrDocumentManager_GetFormatOptions(ocrDocumentManager, DOCUMENTFORMAT_PDF, &pdfOptions.Options); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Set the specific PDF options we want 
   pdfOptions.FontEmbed = DOCWRTFONTEMBED_AUTO; 
   pdfOptions.bImageOverText = false; 
   pdfOptions.PdfProfile = DOCWRTPDFPROFILE_PDFA; 
 
   // Give the engine our updated PDF options 
   retCode = L_OcrDocumentManager_SetFormatOptions(ocrDocumentManager, DOCUMENTFORMAT_PDF, &pdfOptions.Options); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Create an OCR document 
   retCode = L_OcrDocumentManager_CreateDocument(ocrDocumentManager, &ocrDocument, L_OcrCreateDocumentOptions_AutoDeleteFile, NULL); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // In Document File Mode, add OcrPage to OcrDocument after recognition 
   retCode = L_OcrDocument_AddPage(ocrDocument, ocrPage); 
   if(retCode != SUCCESS) 
      goto cleanup; 
 
   // Free the page now that we are done with it 
   L_OcrPage_Destroy(ocrPage); 
   ocrPage = NULL; 
 
   // Save the output 
   retCode = L_OcrDocument_Save(ocrDocument, MAKE_IMAGE_PATH(L_TEXT("MyImageWithTest.pdf")), DOCUMENTFORMAT_PDF, NULL, NULL); 
 
   // Open and check the result file, it should contain the following text 
   // "Normal Line" 
   // "Bold, Italic And Underline" 
   // "Monospaced Line" 
   // With the bold characters of the second line also marked italic and underlined 
 
cleanup: 
   // Release the character data if an error skipped the regular release 
   if(havePageCharacters) 
      L_OcrPage_FreePageCharacters(&ocrPageCharacters); 
 
   // Destroy the page if an error skipped the regular destroy 
   if(ocrPage != NULL) 
      L_OcrPage_Destroy(ocrPage); 
 
   // Only still allocated here if ownership was never handed to a page 
   if(bitmap.Flags.Allocated) 
      L_FreeBitmap(&bitmap); 
 
   // Free allocated sorted zones map buffer 
   if(map != NULL) 
      L_OcrMemory_Free(map); 
 
   // Destroy the document 
   if(ocrDocument != NULL) 
      L_OcrDocument_Destroy(ocrDocument); 
 
   // Shutdown the engine 
   if(ocrEngine != NULL) 
      L_OcrEngine_Destroy(ocrEngine); 
 
   return retCode; 
} 
Help Version 21.0.2021.7.2
Products | Support | Contact Us | Intellectual Property Notices
© 1991-2021 LEAD Technologies, Inc. All Rights Reserved.

LEADTOOLS OCR Module - LEAD Engine C API Help
Products | Support | Contact Us | Intellectual Property Notices
© 1991-2021 LEAD Technologies, Inc. All Rights Reserved.