Leadtools.Forms.DocumentReaders Namespace : DocumentPageText Class |
[SerializableAttribute()] public class DocumentPageText
'Declaration <SerializableAttribute()> Public Class DocumentPageText
'Usage Dim instance As DocumentPageText
[SerializableAttribute()] public ref class DocumentPageText
The text of a document page can be read by using the DocumentObjectManager.ParsePageText method. The text characters found in the page will be set in the Characters property of the returned DocumentPageText object.
The text words are created from the characters found in the document based on the DocumentCharacter.IsEndOfWord value returned by the document reader engine. Whenever an "end of word" is found, the last set of characters is grouped together and stored as an item in the DocumentPageText.Words list.
The overall text string (with no extra properties) can be obtained using the DocumentPageText.BuildText method.
''' <summary>
''' Prompts for a document, parses the text of its first page, draws that text
''' one word at a time onto a white bitmap, and optionally saves the bitmap as PNG.
''' </summary>
''' <remarks>
''' An OCR engine is started only when the reader reports a raster document.
''' Parsing is always ended, and the OCR engine and reader are always disposed,
''' even if rendering or saving throws.
''' </remarks>
Public Sub DocumentPageTextExample()
   Dim documentFileName As String
   Using dlg As New OpenFileDialog()
      If dlg.ShowDialog() <> System.Windows.Forms.DialogResult.OK Then
         Return
      End If
      documentFileName = dlg.FileName
   End Using

   ' Load the document at 200 DPI
   Dim loadOptions As New DocumentReaderLoadOptions()
   loadOptions.Resolution = 200

   Dim ocrEngine As IOcrEngine = Nothing
   Dim reader As DocumentReader = DocumentReader.Create(documentFileName, loadOptions)
   Try
      ' If this is a Raster document such as TIFF or JPEG, we must use an OCR engine
      If reader.ReaderType = DocumentReaderType.Raster Then
         ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, False)
         ocrEngine.Startup(Nothing, Nothing, Nothing, LEAD_VARS.OcrAdvantageRuntimeDir)
      End If

      reader.ObjectManager.BeginParse(ocrEngine)
      Try
         ' Get the text of the first page
         Dim page As DocumentReaderPage = reader.Pages(0)
         Dim pageText As DocumentPageText = reader.ObjectManager.ParsePageText(page)

         ' Create the bitmap to draw the objects to
         Using btmp As New Bitmap(page.PixelWidth, page.PixelHeight)
            btmp.SetResolution(CType(page.DpiX, Single), CType(page.DpiY, Single))

            Using g As Graphics = Graphics.FromImage(btmp)
               g.Clear(Color.White)

               ' Render the objects
               ' Text is a word at a time
               Dim textRect As LogicalRectangle = LogicalRectangle.Empty
               Dim textFontHeight As Double = 0
               Dim textWord As New StringBuilder()

               For Each character As DocumentCharacter In pageText.Characters
                  ' Add the text code and rects together
                  textWord.Append(character.Code)

                  If textRect.IsEmpty Then
                     textRect = character.Bounds
                  Else
                     textRect = LogicalRectangle.Union(textRect, character.Bounds)
                  End If

                  textFontHeight = Math.Max(textFontHeight, character.FontSize)

                  ' If this is the last object in a word, render it
                  If character.IsEndOfWord OrElse character.IsEndOfLine OrElse character.IsEndOfParagraph OrElse character.IsEndOfPage Then
                     RenderText(g, pageText, textWord.ToString(), textRect, character, textFontHeight)

                     ' Start the next word from a clean state. The font height must be
                     ' reset too, otherwise every later word is drawn with the largest
                     ' font size seen so far on the page.
                     textWord.Length = 0
                     textRect = LogicalRectangle.Empty
                     textFontHeight = 0
                  End If
               Next

               ' Save the result as PNG
               Using saveDlg As New SaveFileDialog()
                  saveDlg.Filter = "PNG files|*.png"
                  If saveDlg.ShowDialog() = System.Windows.Forms.DialogResult.OK Then
                     btmp.Save(saveDlg.FileName, System.Drawing.Imaging.ImageFormat.Png)
                  End If
               End Using
            End Using
         End Using
      Finally
         ' Always pair BeginParse with EndParse, even on failure
         reader.ObjectManager.EndParse()
      End Try
   Finally
      ' Release the OCR engine (if one was created) and then the reader
      If ocrEngine IsNot Nothing Then
         ocrEngine.Dispose()
      End If
      reader.Dispose()
   End Try
End Sub

''' <summary>
''' Draws one word centered inside its bounding rectangle.
''' </summary>
''' <param name="g">Graphics surface to draw on.</param>
''' <param name="pageText">Page text whose Fonts list is indexed by the character's FontIndex.</param>
''' <param name="text">The word to draw.</param>
''' <param name="textRect">Union of the bounds of the word's characters.</param>
''' <param name="character">The character that ended the word; supplies the font index.</param>
''' <param name="textFontHeight">Largest FontSize among the word's characters.</param>
Private Shared Sub RenderText(ByVal g As Graphics, ByVal pageText As DocumentPageText, _
                              ByVal text As String, ByVal textRect As LogicalRectangle, _
                              ByVal character As DocumentCharacter, ByVal textFontHeight _
                              As Double)
   ' Create the font
   Dim font As DocumentFont = pageText.Fonts(character.FontIndex)
   Dim faceName As String = font.FaceName
   If String.IsNullOrEmpty(faceName) Then
      ' Could be an embedded font, use Arial
      faceName = "Arial"
   End If

   ' Translate the document font style flags into GDI+ font style flags.
   ' The local is named "style" so it does not shadow the FontStyle type.
   Dim style As FontStyle = FontStyle.Regular
   If (font.FontStyle And DocumentFontStyle.Bold) = DocumentFontStyle.Bold Then
      style = style Or FontStyle.Bold
   End If
   If (font.FontStyle And DocumentFontStyle.Italic) = DocumentFontStyle.Italic Then
      style = style Or FontStyle.Italic
   End If
   If (font.FontStyle And DocumentFontStyle.Underline) = DocumentFontStyle.Underline Then
      style = style Or FontStyle.Underline
   End If

   ' NOTE(review): the 72 / DpiY scaling presumably converts a pixel height to points - confirm
   Using f As New Font(faceName, CType(textFontHeight * 72 / g.DpiY, Single), style)
      Dim rect As New Rectangle(CType(textRect.X, Integer), CType(textRect.Y, Integer), _
                                CType(textRect.Width, Integer), CType(textRect.Height, Integer))

      Using sf As New StringFormat()
         sf.Alignment = StringAlignment.Center
         sf.LineAlignment = StringAlignment.Center
         ' Let the word overflow its rectangle rather than be clipped or wrapped
         sf.FormatFlags = sf.FormatFlags Or StringFormatFlags.NoClip Or StringFormatFlags.NoWrap

         g.DrawString(text, f, Brushes.Black, rect, sf)
      End Using
   End Using
End Sub
/// <summary>
/// Prompts for a document, parses the text of its first page, draws that text
/// one word at a time onto a white bitmap, and optionally saves the bitmap as PNG.
/// </summary>
/// <remarks>
/// An OCR engine is started only when the reader reports a raster document.
/// Parsing is always ended, and the OCR engine and reader are always disposed,
/// even if rendering or saving throws.
/// </remarks>
public void DocumentPageTextExample()
{
   string documentFileName;
   using (OpenFileDialog dlg = new OpenFileDialog())
   {
      if (dlg.ShowDialog() != DialogResult.OK)
      {
         return;
      }

      documentFileName = dlg.FileName;
   }

   // Load the document at 200 DPI
   DocumentReaderLoadOptions loadOptions = new DocumentReaderLoadOptions();
   loadOptions.Resolution = 200;

   IOcrEngine ocrEngine = null;
   DocumentReader reader = DocumentReader.Create(documentFileName, loadOptions);
   try
   {
      // If this is a Raster document such as TIFF or JPEG, we must use an OCR engine
      if (reader.ReaderType == DocumentReaderType.Raster)
      {
         ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, false);
         ocrEngine.Startup(null, null, null, LEAD_VARS.OcrAdvantageRuntimeDir);
      }

      reader.ObjectManager.BeginParse(ocrEngine);
      try
      {
         // Get the text of the first page
         DocumentReaderPage page = reader.Pages[0];
         DocumentPageText pageText = reader.ObjectManager.ParsePageText(page);

         // Create the bitmap to draw the objects to
         using (Bitmap btmp = new Bitmap(page.PixelWidth, page.PixelHeight))
         {
            btmp.SetResolution((float)page.DpiX, (float)page.DpiY);

            using (Graphics g = Graphics.FromImage(btmp))
            {
               g.Clear(Color.White);

               // Render the objects
               // Text is a word at a time
               LogicalRectangle textRect = LogicalRectangle.Empty;
               double textFontHeight = 0;
               StringBuilder textWord = new StringBuilder();

               foreach (DocumentCharacter character in pageText.Characters)
               {
                  // Add the text code and rects together
                  textWord.Append(character.Code);

                  if (textRect.IsEmpty)
                  {
                     textRect = character.Bounds;
                  }
                  else
                  {
                     textRect = LogicalRectangle.Union(textRect, character.Bounds);
                  }

                  textFontHeight = Math.Max(textFontHeight, character.FontSize);

                  // If this is the last object in a word, render it
                  if (character.IsEndOfWord || character.IsEndOfLine || character.IsEndOfParagraph || character.IsEndOfPage)
                  {
                     RenderText(g, pageText, textWord.ToString(), textRect, character, textFontHeight);

                     // Start the next word from a clean state. The font height must be
                     // reset too, otherwise every later word is drawn with the largest
                     // font size seen so far on the page.
                     textWord.Length = 0;
                     textRect = LogicalRectangle.Empty;
                     textFontHeight = 0;
                  }
               }

               // Save the result as PNG
               using (SaveFileDialog saveDlg = new SaveFileDialog())
               {
                  saveDlg.Filter = "PNG files|*.png";
                  if (saveDlg.ShowDialog() == DialogResult.OK)
                  {
                     btmp.Save(saveDlg.FileName, System.Drawing.Imaging.ImageFormat.Png);
                  }
               }
            }
         }
      }
      finally
      {
         // Always pair BeginParse with EndParse, even on failure
         reader.ObjectManager.EndParse();
      }
   }
   finally
   {
      // Release the OCR engine (if one was created) and then the reader
      if (ocrEngine != null)
      {
         ocrEngine.Dispose();
      }

      reader.Dispose();
   }
}

/// <summary>
/// Draws one word centered inside its bounding rectangle.
/// </summary>
/// <param name="g">Graphics surface to draw on.</param>
/// <param name="pageText">Page text whose Fonts list is indexed by the character's FontIndex.</param>
/// <param name="text">The word to draw.</param>
/// <param name="textRect">Union of the bounds of the word's characters.</param>
/// <param name="character">The character that ended the word; supplies the font index.</param>
/// <param name="textFontHeight">Largest FontSize among the word's characters.</param>
private static void RenderText(Graphics g, DocumentPageText pageText, string text, LogicalRectangle textRect, DocumentCharacter character, double textFontHeight)
{
   // Create the font
   DocumentFont font = pageText.Fonts[character.FontIndex];
   string faceName = font.FaceName;
   if (string.IsNullOrEmpty(faceName))
   {
      // Could be an embedded font, use Arial
      faceName = "Arial";
   }

   // Translate the document font style flags into GDI+ font style flags
   FontStyle fontStyle = FontStyle.Regular;
   if ((font.FontStyle & DocumentFontStyle.Bold) == DocumentFontStyle.Bold)
   {
      fontStyle |= FontStyle.Bold;
   }
   if ((font.FontStyle & DocumentFontStyle.Italic) == DocumentFontStyle.Italic)
   {
      fontStyle |= FontStyle.Italic;
   }
   if ((font.FontStyle & DocumentFontStyle.Underline) == DocumentFontStyle.Underline)
   {
      fontStyle |= FontStyle.Underline;
   }

   // NOTE(review): the 72 / DpiY scaling presumably converts a pixel height to points - confirm
   using (Font f = new Font(faceName, (float)textFontHeight * 72 / g.DpiY, fontStyle))
   {
      Rectangle rect = new Rectangle((int)textRect.X, (int)textRect.Y, (int)textRect.Width, (int)textRect.Height);

      using (StringFormat sf = new StringFormat())
      {
         sf.Alignment = StringAlignment.Center;
         sf.LineAlignment = StringAlignment.Center;
         // Let the word overflow its rectangle rather than be clipped or wrapped
         sf.FormatFlags |= StringFormatFlags.NoClip | StringFormatFlags.NoWrap;

         g.DrawString(text, f, Brushes.Black, rect, sf);
      }
   }
}
Target Platforms: Windows 7, Windows Vista SP1 or later, Windows XP SP3, Windows Server 2008 (Server Core not supported), Windows Server 2008 R2 (Server Core supported with SP1 or later), Windows Server 2003 SP2