BuildWords Method

Summary

Build a list of the words found in the document page.

Syntax

C#

C++/CLI

Java

Python

public void BuildWords()

public:  
   void BuildWords()

public void buildWords()

def BuildWords(self):

Remarks

The text words are created from the characters found in the document, based on the IsEndOfWord value returned by the document parsing engine. Whenever an "end of word" is found, the last set of characters is grouped together and stored as an item in the Words list. This is not performed automatically; instead, you must call BuildWords to populate the Words list from the Characters.

The following explains how this method works. If the page text consists of the string "Hello World", then the text parser engine will populate Characters as follows (ignoring Bounds):

Index	Code	IsEndOfWord	IsEndOfLine
0	H	false	false
1	e	false	false
2	l	false	false
3	l	false	false
4	o	true	false
5	W	false	false
6	o	false	false
7	r	false	false
8	l	false	false
9	d	true	true

BuildWords loops through the characters until it reaches an item whose IsEndOfWord is true. It then creates a word from the characters found so far, in this case from index 0 to 4. The characters are combined into DocumentWord.Value (the string Hello), and the union of these characters' bounds (DocumentCharacter.Bounds) is set into DocumentWord.Bounds. The first index (0) and the last index (4) are set into DocumentWord.FirstCharacterIndex and DocumentWord.LastCharacterIndex.

The method then continues to the next character (index 5) and repeats the operation, this time using indices 5 through 9. The result is a DocumentWord with Value set to World, FirstCharacterIndex set to 5, and LastCharacterIndex set to 9.

For more information, refer to Parsing Text with the Document Library.

Example

C#

Java

using Leadtools; 
using Leadtools.Codecs; 
using Leadtools.Document.Writer; 
 
using Leadtools.Document; 
using Leadtools.Caching; 
using Leadtools.Annotations.Engine; 
using Leadtools.Ocr; 
using Leadtools.Barcode; 
using Leadtools.Document.Converter; 
 
/// <summary>
/// Loads "Leadtools.doc" from the images directory, prints the text of its first page,
/// then calls BuildWords and prints every word with its bounds and character indices.
/// </summary>
public void DocumentPageTextExample()
{
   var loadOptions = new LoadDocumentOptions();
   string documentPath = Path.Combine(LEAD_VARS.ImagesDir, "Leadtools.doc");

   using (var document = DocumentFactory.LoadFromFile(documentPath, loadOptions))
   {
      // Work with the first page of the document
      var firstPage = document.Pages[0];

      // Print every DocumentTextExtractionMode value (DocumentTextExtractionMode reference)
      foreach (DocumentTextExtractionMode mode in Enum.GetValues(typeof(DocumentTextExtractionMode)))
      {
         Console.WriteLine($"Text extraction mode: {mode}");
      }

      // Text extraction mode. Auto is default
      document.Text.TextExtractionMode = DocumentTextExtractionMode.Auto;

      // DocumentPageText reference
      var pageText = firstPage.GetText();
      if (pageText == null)
      {
         // Nothing to parse; leaving the using block still disposes the document
         Console.WriteLine("Failed!");
         return;
      }

      // Build the page text string before reading it
      pageText.BuildText();
      var characters = pageText.Characters;
      var text = pageText.Text;

      Console.WriteLine(text);
      Console.WriteLine($"Total number of characters: {characters.Count}");

      // Words are only populated after an explicit BuildWords call
      pageText.BuildWords();
      Console.WriteLine($"Total number of words: {pageText.Words.Count}");

      // Report each word
      foreach (DocumentWord word in pageText.Words)
      {
         string wordInfo = $"Bounds: {word.Bounds} | First character index: {word.FirstCharacterIndex} " +
            $"| Last character index: {word.LastCharacterIndex} | Value: {word.Value}";
         Console.WriteLine(wordInfo);
      }
   }
}
 
/// <summary>
/// Shared constants used by the example code.
/// </summary>
static class LEAD_VARS 
{ 
   // Directory that the example combines with the sample document file name.
   public const string ImagesDir = @"C:\LEADTOOLS23\Resources\Images"; 
}

 
import java.io.File; 
import java.io.FileOutputStream; 
import java.io.IOException; 
import java.net.MalformedURLException; 
import java.net.URI; 
import java.net.URISyntaxException; 
import java.net.URL; 
import java.nio.file.Files; 
import java.nio.file.Paths; 
import java.util.ArrayList; 
import java.util.Calendar; 
import java.util.List; 
import java.util.concurrent.Callable; 
import java.util.concurrent.ExecutorService; 
import java.util.concurrent.Executors; 
import java.util.concurrent.Future; 
import java.util.regex.Pattern; 
 
import org.junit.*; 
import org.junit.runner.JUnitCore; 
import org.junit.runner.Result; 
import org.junit.runner.notification.Failure; 
import static org.junit.Assert.*; 
 
import leadtools.*; 
import leadtools.annotations.engine.*; 
import leadtools.barcode.*; 
import leadtools.caching.*; 
import leadtools.codecs.*; 
import leadtools.document.*; 
import leadtools.document.DocumentMimeTypes.UserGetDocumentStatusHandler; 
import leadtools.document.converter.*; 
import leadtools.document.writer.*; 
import leadtools.ocr.*; 
 
 
/**
 * Loads "Leadtools.pdf" from the images directory, prints the text of its first page,
 * then calls {@code buildWords} and prints every word with its bounds and character indices.
 *
 * <p>NOTE(review): the loaded document is never explicitly released in this example;
 * confirm whether {@code LEADDocument} requires an explicit dispose/close call.
 */
public void documentPageTextExample() {
   final String imagesDir = "C:\\LEADTOOLS23\\Resources\\Images";
   LoadDocumentOptions loadOptions = new LoadDocumentOptions();
   String documentPath = combine(imagesDir, "Leadtools.pdf");
   LEADDocument document = DocumentFactory.loadFromFile(documentPath, loadOptions);

   // Work with the first page of the document
   DocumentPage firstPage = document.getPages().get(0);

   // Print every DocumentTextExtractionMode value (DocumentTextExtractionMode reference)
   for (DocumentTextExtractionMode mode : DocumentTextExtractionMode.values()) {
      System.out.println("Text extraction mode: " + mode);
   }

   // Text extraction mode. Auto is default
   document.getText().setTextExtractionMode(DocumentTextExtractionMode.AUTO);

   // DocumentPageText reference; the assertion fails the example when no page text is returned,
   // so everything below runs only with a non-null pageText
   DocumentPageText pageText = firstPage.getText();
   assertTrue(pageText != null);

   // Build the page text string before reading it
   pageText.buildText();
   List<DocumentCharacter> characters = pageText.getCharacters();
   String text = pageText.getText();

   System.out.println(text);
   System.out.println("Total number of characters: " + characters.size());

   // Words are only populated after an explicit buildWords call
   pageText.buildWords();
   System.out.println("Total number of words: " + pageText.getWords().size());

   // Report each word
   for (DocumentWord word : pageText.getWords()) {
      String wordInfo = "Bounds: " + word.getBounds()
            + " | First character index: " + word.getFirstCharacterIndex()
            + " | Last character index: " + word.getLastCharacterIndex()
            + " | Value: " + word.getValue();
      System.out.println(wordInfo);
   }
}

Requirements

Target Platforms

Reference

DocumentPageText Class

DocumentPageText Members

Leadtools.Document Namespace

Download our FREE evaluation

Help Version 23.0.2024.2.29

Leadtools.Document Assembly

Introduction

Getting Started

Namespaces

Leadtools.Document Namespace

Assemblies