TextExtractionMode Property

Summary

Mode to use when extracting text from this document.

Syntax

C#

C++/CLI

Java

Python

public DocumentTextExtractionMode TextExtractionMode { get; set; }

public:  
   property DocumentTextExtractionMode^ TextExtractionMode 
   { 
      DocumentTextExtractionMode^ get() 
      void set(DocumentTextExtractionMode^ value) 
   }

public DocumentTextExtractionMode getTextExtractionMode() 
public void setTextExtractionMode(DocumentTextExtractionMode value)

TextExtractionMode # get and set (DocumentText)

Property Value

The mode to use when extracting text from this document. Default value is DocumentTextExtractionMode.Auto.

Remarks

TextExtractionMode is used to control how DocumentPage.GetText extracts the text from the page.

If the value of TextExtractionMode is DocumentTextExtractionMode.Auto or DocumentTextExtractionMode.OcrOnly then an OCR engine instance is required in order to extract the text correctly. This instance must be set in the OcrEngine property prior to calling DocumentPage.GetText.

DocumentImages.IsSvgSupported is used to determine whether the document supports extracting text using SVG.

For more information, refer to Parsing Text with the Document Library.

Note that setting this property also applies the same value to each child document.

Example

C#

Java

using Leadtools; 
using Leadtools.Codecs; 
using Leadtools.Document.Writer; 
 
using Leadtools.Document; 
using Leadtools.Caching; 
using Leadtools.Annotations.Engine; 
using Leadtools.Ocr; 
using Leadtools.Barcode; 
using Leadtools.Document.Converter; 
 
/// <summary>
/// Loads "Leadtools.pdf" with the cache returned by GetCache(), prints the links of the
/// first page, then turns on link parsing with DocumentTextExtractionMode.Auto, extracts
/// the page text and prints the links a second time.
/// </summary>
public void DocumentPageGetLinksExample()
{
   var documentCache = GetCache();
   var loadOptions = new LoadDocumentOptions();
   loadOptions.Cache = documentCache;

   using (var document = DocumentFactory.LoadFromFile(Path.Combine(LEAD_VARS.ImagesDir, "Leadtools.pdf"), loadOptions))
   {
      // The document must be writable because links are set on the page below
      document.IsReadOnly = false;

      // First pass: the links as they are before any text has been extracted
      Console.WriteLine("Before get text");
      Console.WriteLine("---------");
      var firstPage = document.Pages[0];
      firstPage.SetLinks(firstPage.GetLinks());
      ShowLinks(firstPage);

      // List every member of the DocumentPageFitType enumeration
      foreach (DocumentPageFitType fitType in Enum.GetValues(typeof(DocumentPageFitType)))
      {
         Console.WriteLine($"Page fit type: {fitType}");
      }

      // Configure the DocumentText options: parse hyperlinks out of the extracted text
      // and use the Auto extraction mode
      var textOptions = document.Text;
      textOptions.AutoParseLinks = true;
      textOptions.TextExtractionMode = DocumentTextExtractionMode.Auto;

      // Print the patterns that link parsing matches against
      Console.WriteLine("Parsing links from the text using these regular expressions:");
      foreach (var pattern in DocumentText.LinkPatterns)
      {
         Console.WriteLine(pattern.ToString());
      }

      // Extracting the text is what triggers the link parsing
      firstPage.GetText();

      firstPage.IsLinksModified = false;

      // Second pass: the links after the text has been parsed. This should list the
      // original links plus any URLs found in the text
      Console.WriteLine("After get text");
      Console.WriteLine("---------");
      ShowLinks(firstPage);
   }
}
 
/// <summary>
/// Writes every link of <paramref name="page"/> to the console: a running index, the
/// bounds, the link type, and either the link value or the details of the link target.
/// Prints nothing when the page returns no link collection.
/// </summary>
/// <param name="page">The page whose links are listed.</param>
private static void ShowLinks(Leadtools.Document.DocumentPage page)
{
   // DocumentLink reference
   var pageLinks = page.GetLinks();
   if (pageLinks == null)
   {
      return;
   }

   Console.WriteLine("Page " + page.PageNumber);

   int position = 0;
   foreach (var current in pageLinks)
   {
      Console.WriteLine(position);
      position++;

      Console.WriteLine("  Bounds:" + current.Bounds);
      Console.WriteLine("  LinkType:" + current.LinkType);

      if (current.LinkType != DocumentLinkType.Value)
      {
         // DocumentLinkTarget reference
         var target = current.Target;
         Console.WriteLine("  Target.PageFitType:" + target.PageFitType);
         Console.WriteLine("  Target.PageNumber:" + target.PageNumber);
         Console.WriteLine("  Target.Position:" + target.Position);
         Console.WriteLine("  Target.ZoomPercent:" + target.ZoomPercent);
      }
      else
      {
         Console.WriteLine("  Value:" + current.Value);
      }

      // Blank line between entries
      Console.WriteLine();
   }
}
 
/// <summary>
/// Constants shared by the example code.
/// </summary>
static class LEAD_VARS 
{ 
   /// <summary>
   /// Directory that the example combines with "Leadtools.pdf" to locate its input file.
   /// </summary>
   public const string ImagesDir = @"C:\LEADTOOLS23\Resources\Images"; 
}

 
import java.io.File; 
import java.io.FileOutputStream; 
import java.io.IOException; 
import java.net.MalformedURLException; 
import java.net.URI; 
import java.net.URISyntaxException; 
import java.net.URL; 
import java.nio.file.Files; 
import java.nio.file.Paths; 
import java.util.ArrayList; 
import java.util.Calendar; 
import java.util.List; 
import java.util.concurrent.Callable; 
import java.util.concurrent.ExecutorService; 
import java.util.concurrent.Executors; 
import java.util.concurrent.Future; 
import java.util.regex.Pattern; 
 
import org.junit.*; 
import org.junit.runner.JUnitCore; 
import org.junit.runner.Result; 
import org.junit.runner.notification.Failure; 
import static org.junit.Assert.*; 
 
import leadtools.*; 
import leadtools.annotations.engine.*; 
import leadtools.barcode.*; 
import leadtools.caching.*; 
import leadtools.codecs.*; 
import leadtools.document.*; 
import leadtools.document.DocumentMimeTypes.UserGetDocumentStatusHandler; 
import leadtools.document.converter.*; 
import leadtools.document.writer.*; 
import leadtools.ocr.*; 
 
 
/**
 * Loads "Leadtools.pdf" with the cache returned by getCache(), prints the links of the
 * first page, then turns on link parsing with DocumentTextExtractionMode.AUTO, extracts
 * the page text and prints the links a second time. Finally asserts that the page
 * returns a non-null link array.
 */
public void documentPageGetLinksExample() {
   final String imagesDir = "C:\\LEADTOOLS23\\Resources\\Images";

   FileCache documentCache = getCache();
   LoadDocumentOptions loadOptions = new LoadDocumentOptions();
   loadOptions.setCache(documentCache);

   LEADDocument document = DocumentFactory.loadFromFile(combine(imagesDir, "Leadtools.pdf"), loadOptions);
   // The document must be writable because links are set on the page below
   document.setReadOnly(false);

   // First pass: the links as they are before any text has been extracted
   System.out.println("Before get text");
   System.out.println("---------");
   DocumentPage firstPage = document.getPages().get(0);
   firstPage.setLinks(firstPage.getLinks());
   showLinks(firstPage);

   // List every member of the DocumentPageFitType enumeration
   for (DocumentPageFitType fitType : DocumentPageFitType.values()) {
      System.out.println("Page fit type: " + fitType);
   }

   // Configure the DocumentText options: parse hyperlinks out of the extracted text
   // and use the AUTO extraction mode
   DocumentText textOptions = document.getText();
   textOptions.setAutoParseLinks(true);
   textOptions.setTextExtractionMode(DocumentTextExtractionMode.AUTO);

   // Print the patterns that link parsing matches against
   System.out.println("Parsing links from the text using these regular expressions:");
   for (Pattern linkPattern : DocumentText.getLinkPatterns()) {
      System.out.println(linkPattern.toString());
   }

   // Extracting the text is what triggers the link parsing
   firstPage.getText();

   // NOTE(review): the C# version of this example sets IsLinksModified; confirm that
   // setLinkedModified is the intended Java setter name
   firstPage.setLinkedModified(false);

   // Second pass: the links after the text has been parsed. This should list the
   // original links plus any URLs found in the text
   System.out.println("After get text");
   System.out.println("---------");
   showLinks(firstPage);
   assertTrue(firstPage.getLinks() != null);
}
 
/**
 * Prints every link of the given page to standard output: its index, bounds and link
 * type, followed by either the link value or the details of the link target.
 * Prints nothing when the page returns a null link array.
 *
 * @param page the page whose links are listed
 */
private void showLinks(DocumentPage page) {
   // DocumentLink reference
   DocumentLink[] pageLinks = page.getLinks();
   if (pageLinks == null) {
      return;
   }

   System.out.println("Page " + page.getPageNumber());

   for (int i = 0; i < pageLinks.length; i++) {
      DocumentLink current = pageLinks[i];

      System.out.println("Index: " + i);
      System.out.println("  Bounds:" + current.getBounds());
      System.out.println("  LinkType:" + current.getLinkType());

      if (current.getLinkType() != DocumentLinkType.VALUE) {
         // DocumentLinkTarget reference
         System.out.println("  Target.PageFitType:" + current.getTarget().getPageFitType());
         System.out.println("  Target.PageNumber:" + current.getTarget().getPageNumber());
         System.out.println("  Target.Position:" + current.getTarget().getPosition());
         System.out.println("  Target.ZoomPercent:" + current.getTarget().getZoomPercent());
      } else {
         System.out.println("  Value:" + current.getValue());
      }
   }
}

Requirements

Target Platforms

Reference

DocumentText Class

DocumentText Members

Leadtools.Document Namespace

Download our FREE evaluation

Help Version 23.0.2024.2.29

Leadtools.Document Assembly

Introduction

Getting Started

Namespaces

Leadtools.Document Namespace

Assemblies