Leadtools.Forms.DocumentReaders Namespace : DocumentReader Class |
public abstract class DocumentReader : System.IDisposable
'Declaration Public MustInherit Class DocumentReader Implements System.IDisposable
'Usage Dim instance As DocumentReader
public ref class DocumentReader abstract : public System.IDisposable
The DocumentReader class allows reading images, thumbnails, text data and metadata from any of the supported types using a uniform set of methods and properties, regardless of the document type.
The current implementation of the LEADTOOLS Document Readers supports reading the following document types:
DocumentReaderType.Pdf: This is the document reader responsible for parsing PDF documents. PDF document text is parsed without the need of an OCR engine. PDF support is provided through the Leadtools.Forms.DocumentReaders.Pdf assembly.
DocumentReaderType.Xps: This is the document reader responsible for parsing XPS documents. XPS document text is parsed without the need of an OCR engine. XPS support is provided through the Leadtools.Forms.DocumentReaders.Xps assembly.
DocumentReaderType.Raster: This is the document reader responsible for parsing everything else, such as TIFF and JPEG documents. An OCR engine is required to parse the text of the document (by passing a started object of type Leadtools.Forms.Ocr.IOcrEngine to BeginParse). Raster support is provided through the Leadtools.Forms.DocumentReaders.Raster assembly.
LEADTOOLS will add more document readers and functionality in the near future for documents such as DICOM, DOC/DOCX(2007/2010), XLS/XLSX(2007/2010) and RTF. More object types such as images, bookmarks, hyperlinks and annotations will also be added in the near future. Currently, support for these formats is provided by the Raster document reader (with text parsing supported by an external OCR engine).
DocumentReader is an abstract class and cannot be instantiated directly. The derived classes to support PDF, XPS and the various other formats are internal to LEADTOOLS. Instead, get a DocumentReader object by using the DocumentReader.Create static (Shared in Visual Basic) method. This method tries to load the document using the supported readers and, if successful, returns an instance of DocumentReader that is ready to use.
Once you obtain a valid instance of a DocumentReader object with a document loaded into it, you can use the following features:
Use the Pages property to access the pages of the document.
The MimeType property and GetProperties method can be used to obtain the metadata of the document.
The methods of the ImageManager property can be used to get a rendered raster image or a thumbnail of any page in the document.
The methods of the ObjectManager property can be used to parse the objects found in any page in the document such as text items and font properties.
The DocumentReader class implements the System.IDisposable interface. You must call the System.IDisposable.Dispose method when the reader is no longer used.
Imports Leadtools
Imports Leadtools.Codecs
Imports Leadtools.Forms
Imports Leadtools.Forms.DocumentReaders
Imports Leadtools.WinForms
Imports Leadtools.Forms.Ocr
Imports Leadtools.Drawing

''' <summary>
''' Prompts the user for a document file, loads it with DocumentReader.Create,
''' then shows the document properties, the size of every page and finally the
''' parsed text of each page until the user chooses to stop.
''' </summary>
''' <remarks>
''' The reader, the OCR engine and the BeginParse/EndParse pair are released in
''' Using/Finally blocks so they are cleaned up even when an exception is thrown.
''' </remarks>
Public Sub DocumentReaderExample()
   Dim documentFileName As String
   Using dlg As New OpenFileDialog()
      If dlg.ShowDialog() <> System.Windows.Forms.DialogResult.OK Then
         Return
      End If
      documentFileName = dlg.FileName
   End Using

   ' Load the document using default options; Using guarantees reader.Dispose()
   Using reader As DocumentReader = DocumentReader.Create(documentFileName, Nothing)
      ' Show the document properties.
      ' Visual Basic string literals have no "\n" escape sequence, so line breaks
      ' are appended explicitly with AppendLine().
      Dim sb As New StringBuilder()
      sb.AppendFormat("Reader used: {0}", reader.ReaderType).AppendLine()
      sb.AppendFormat("Document has {0} pages", reader.Pages.Count).AppendLine()

      ' Get the properties (meta data)
      Dim props As IDictionary(Of String, String) = reader.GetProperties()
      For Each prop As KeyValuePair(Of String, String) In props
         sb.AppendFormat("{0}: {1}", prop.Key, prop.Value).AppendLine()
      Next
      MessageBox.Show(sb.ToString())

      ' Now show the page sizes
      sb = New StringBuilder()
      For Each page As DocumentReaderPage In reader.Pages
         sb.AppendFormat("Page: {0} size: {1}", page.PageNumber, page.Size).AppendLine()
      Next
      MessageBox.Show(sb.ToString())

      ' Now loop and show the text for each page until the user cancels
      ' If this is a Raster document such as TIFF or JPEG, we must use an OCR engine
      Dim ocrEngine As IOcrEngine = Nothing
      Try
         If reader.ReaderType = DocumentReaderType.Raster Then
            ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, False)
            ocrEngine.Startup(Nothing, Nothing, Nothing, LEAD_VARS.OcrAdvantageRuntimeDir)
         End If

         reader.ObjectManager.BeginParse(ocrEngine)
         Try
            For Each page As DocumentReaderPage In reader.Pages
               ' Parse this page
               Dim pageText As DocumentPageText = reader.ObjectManager.ParsePageText(page)
               Dim text As String = pageText.BuildText()

               If MessageBox.Show(text, String.Format("Page {0} text, continue to next page?", _
                  page.PageNumber), MessageBoxButtons.YesNo) = _
                  System.Windows.Forms.DialogResult.No Then
                  Exit For
               End If
            Next
         Finally
            ' Always pair BeginParse with EndParse, even if parsing a page fails
            reader.ObjectManager.EndParse()
         End Try
      Finally
         ' Release the OCR engine (only created for Raster documents)
         If ocrEngine IsNot Nothing Then
            ocrEngine.Dispose()
         End If
      End Try
   End Using
End Sub
using Leadtools;
using Leadtools.Codecs;
using Leadtools.Forms;
using Leadtools.Forms.DocumentReaders;
using Leadtools.WinForms;
using Leadtools.Forms.Ocr;
using Leadtools.Drawing;

/// <summary>
/// Prompts the user for a document file, loads it with DocumentReader.Create,
/// then shows the document properties, the size of every page and finally the
/// parsed text of each page until the user chooses to stop.
/// </summary>
/// <remarks>
/// The reader, the OCR engine and the BeginParse/EndParse pair are released in
/// using/finally blocks so they are cleaned up even when an exception is thrown.
/// </remarks>
public void DocumentReaderExample()
{
   string documentFileName;
   using(OpenFileDialog dlg = new OpenFileDialog())
   {
      if(dlg.ShowDialog() != DialogResult.OK)
      {
         return;
      }
      documentFileName = dlg.FileName;
   }

   // Load the document using default options; using guarantees reader.Dispose()
   using(DocumentReader reader = DocumentReader.Create(documentFileName, null))
   {
      // Show the document properties
      StringBuilder sb = new StringBuilder();
      sb.AppendFormat("Reader used: {0}\n", reader.ReaderType);
      sb.AppendFormat("Document has {0} pages\n", reader.Pages.Count);

      // Get the properties (meta data)
      IDictionary<string, string> props = reader.GetProperties();
      foreach(KeyValuePair<string, string> prop in props)
      {
         sb.AppendFormat("{0}: {1}\n", prop.Key, prop.Value);
      }
      MessageBox.Show(sb.ToString());

      // Now show the page sizes
      sb = new StringBuilder();
      foreach(DocumentReaderPage page in reader.Pages)
      {
         sb.AppendFormat("Page: {0} size: {1}\n", page.PageNumber, page.Size);
      }
      MessageBox.Show(sb.ToString());

      // Now loop and show the text for each page until the user cancels
      // If this is a Raster document such as TIFF or JPEG, we must use an OCR engine
      IOcrEngine ocrEngine = null;
      try
      {
         if(reader.ReaderType == DocumentReaderType.Raster)
         {
            ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, false);
            ocrEngine.Startup(null, null, null, LEAD_VARS.OcrAdvantageRuntimeDir);
         }

         reader.ObjectManager.BeginParse(ocrEngine);
         try
         {
            foreach(DocumentReaderPage page in reader.Pages)
            {
               // Parse this page
               DocumentPageText pageText = reader.ObjectManager.ParsePageText(page);
               string text = pageText.BuildText();

               if(MessageBox.Show(text, string.Format("Page {0} text, continue to next page?", page.PageNumber), MessageBoxButtons.YesNo) == DialogResult.No)
               {
                  break;
               }
            }
         }
         finally
         {
            // Always pair BeginParse with EndParse, even if parsing a page fails
            reader.ObjectManager.EndParse();
         }
      }
      finally
      {
         // Release the OCR engine (only created for Raster documents)
         if(ocrEngine != null)
         {
            ocrEngine.Dispose();
         }
      }
   }
}