Contains the text characters and words found in a document page.
[SerializableAttribute()]
[DataContractAttribute(Name="DocumentPageText")]
public class DocumentPageText
<SerializableAttribute()>
Public Class DocumentPageText
[SerializableAttribute()]
public ref class DocumentPageText
The text of a document page can be read by using the DocumentObjectManager.ParsePageText method. The text characters found in the page will be set in the Characters property of the returned DocumentPageText object.
The text words are created from the characters found in the document based on the DocumentCharacter.IsEndOfWord value returned by the document reader engine. Whenever an "end of word" is found, the last set of characters is grouped together and stored as an item in the DocumentPageText.Words list.
The overall text string (with no extra properties) can be obtained using the DocumentPageText.BuildText method.
''' <summary>
''' Prompts for a document file, parses the text of its first page and draws that text,
''' one word at a time, onto a white bitmap which the user may then save as a PNG file.
''' </summary>
Public Sub DocumentPageTextExample()
   Dim documentFileName As String
   Using dlg As New OpenFileDialog()
      If dlg.ShowDialog() <> System.Windows.Forms.DialogResult.OK Then
         Return
      End If
      documentFileName = dlg.FileName
   End Using

   ' Load the document at 200 DPI
   Dim loadOptions As New DocumentReaderLoadOptions()
   loadOptions.Resolution = 200
   Dim reader As DocumentReader = DocumentReader.Create(documentFileName, loadOptions)

   ' If this is a Raster document such as TIFF or JPEG, we must use an OCR engine
   Dim ocrEngine As IOcrEngine = Nothing
   Try
      If reader.ReaderType = DocumentReaderType.Raster Then
         ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, False)
         ocrEngine.Startup(Nothing, Nothing, Nothing, LEAD_VARS.OcrAdvantageRuntimeDir)
      End If

      reader.ObjectManager.BeginParse(ocrEngine)
      Try
         ' Get the text of the first page
         Dim page As DocumentReaderPage = reader.Pages(0)
         Dim pageText As DocumentPageText = reader.ObjectManager.ParsePageText(page)

         ' Create the bitmap to draw the objects to
         Using btmp As New Bitmap(page.PixelWidth, page.PixelHeight)
            btmp.SetResolution(CType(page.DpiX, Single), CType(page.DpiY, Single))
            Using g As Graphics = Graphics.FromImage(btmp)
               g.Clear(Color.White)

               ' Render the objects
               ' Text is a word at a time: accumulate the characters, their combined
               ' bounds and the largest font size seen until an end marker is reached
               Dim textRect As LogicalRectangle = LogicalRectangle.Empty
               Dim textFontHeight As Double = 0
               Dim textWord As New StringBuilder()
               For Each character As DocumentCharacter In pageText.Characters
                  ' Add the text code and rects together
                  textWord.Append(character.Code)
                  If textRect.IsEmpty Then
                     textRect = character.Bounds
                  Else
                     textRect = LogicalRectangle.Union(textRect, character.Bounds)
                  End If
                  textFontHeight = Math.Max(textFontHeight, character.FontSize)

                  ' If this is the last object in a word, render it
                  If character.IsEndOfWord OrElse character.IsEndOfLine OrElse character.IsEndOfParagraph OrElse character.IsEndOfPage Then
                     RenderText(g, pageText, textWord.ToString(), textRect, character, textFontHeight)

                     ' Start the next word from a clean state. The font height must be
                     ' reset as well, otherwise the largest size seen so far on the page
                     ' would be applied to every following word.
                     textWord = New StringBuilder()
                     textRect = LogicalRectangle.Empty
                     textFontHeight = 0
                  End If
               Next

               ' Save the result as PNG
               Using saveDlg As New SaveFileDialog()
                  saveDlg.Filter = "PNG files|*.png"
                  If saveDlg.ShowDialog() = System.Windows.Forms.DialogResult.OK Then
                     btmp.Save(saveDlg.FileName, System.Drawing.Imaging.ImageFormat.Png)
                  End If
               End Using
            End Using
         End Using
      Finally
         ' Always close the parse session opened by BeginParse, even on failure
         reader.ObjectManager.EndParse()
      End Try
   Finally
      ' Release the OCR engine (if one was created) and the reader, even on failure
      If ocrEngine IsNot Nothing Then
         ocrEngine.Dispose()
      End If
      reader.Dispose()
   End Try
End Sub
''' <summary>
''' Draws a single word centered inside its bounding rectangle, using the font
''' referenced by the supplied character.
''' </summary>
''' <param name="g">Graphics surface to draw on; its DpiY is used to scale the font size.</param>
''' <param name="pageText">Page text whose Fonts list is indexed by the character's FontIndex.</param>
''' <param name="text">The word to draw.</param>
''' <param name="textRect">Bounds of the word; its X, Y, Width and Height are truncated to integers.</param>
''' <param name="character">Character whose FontIndex selects the font (the caller passes the last character of the word).</param>
''' <param name="textFontHeight">Font height of the word (assumed to be in pixels - TODO confirm).</param>
Private Shared Sub RenderText(ByVal g As Graphics, ByVal pageText As DocumentPageText, _
                              ByVal text As String, ByVal textRect As LogicalRectangle, _
                              ByVal character As DocumentCharacter, ByVal textFontHeight _
                              As Double)
   ' Create the font
   Dim documentFont As DocumentFont = pageText.Fonts(character.FontIndex)
   Dim faceName As String = documentFont.FaceName
   If String.IsNullOrEmpty(faceName) Then
      ' Could be an embedded font, use Arial
      faceName = "Arial"
   End If

   ' Translate the document font style flags to their System.Drawing equivalents.
   ' The local is deliberately not called "fontStyle": Visual Basic is case-insensitive,
   ' so that name would hide the FontStyle type and turn every FontStyle.Xxx below into
   ' a shared-member access through an instance.
   Dim style As FontStyle = FontStyle.Regular
   If (documentFont.FontStyle And DocumentFontStyle.Bold) = DocumentFontStyle.Bold Then
      style = style Or FontStyle.Bold
   End If
   If (documentFont.FontStyle And DocumentFontStyle.Italic) = DocumentFontStyle.Italic Then
      style = style Or FontStyle.Italic
   End If
   If (documentFont.FontStyle And DocumentFontStyle.Underline) = DocumentFontStyle.Underline Then
      style = style Or FontStyle.Underline
   End If

   ' This Font constructor takes its size in points, hence the 72 / DpiY scaling
   Using f As New Font(faceName, CType(textFontHeight * 72 / g.DpiY, Single), style)
      Dim rect As New Rectangle(CType(textRect.X, Integer), CType(textRect.Y, Integer), _
                                CType(textRect.Width, Integer), CType(textRect.Height, Integer))
      Using sf As New StringFormat()
         ' Center the word in its rectangle and never clip or wrap it
         sf.Alignment = StringAlignment.Center
         sf.LineAlignment = StringAlignment.Center
         sf.FormatFlags = sf.FormatFlags Or StringFormatFlags.NoClip Or StringFormatFlags.NoWrap
         g.DrawString(text, f, Brushes.Black, rect, sf)
      End Using
   End Using
End Sub
/// <summary>
/// Prompts for a document file, parses the text of its first page and draws that text,
/// one word at a time, onto a white bitmap which the user may then save as a PNG file.
/// </summary>
public void DocumentPageTextExample()
{
   string documentFileName;
   using(OpenFileDialog dlg = new OpenFileDialog())
   {
      if(dlg.ShowDialog() != DialogResult.OK)
      {
         return;
      }
      documentFileName = dlg.FileName;
   }

   // Load the document at 200 DPI
   DocumentReaderLoadOptions loadOptions = new DocumentReaderLoadOptions();
   loadOptions.Resolution = 200;
   DocumentReader reader = DocumentReader.Create(documentFileName, loadOptions);

   // If this is a Raster document such as TIFF or JPEG, we must use an OCR engine
   IOcrEngine ocrEngine = null;
   try
   {
      if(reader.ReaderType == DocumentReaderType.Raster)
      {
         ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, false);
         ocrEngine.Startup(null, null, null, LEAD_VARS.OcrAdvantageRuntimeDir);
      }

      reader.ObjectManager.BeginParse(ocrEngine);
      try
      {
         // Get the text of the first page
         DocumentReaderPage page = reader.Pages[0];
         DocumentPageText pageText = reader.ObjectManager.ParsePageText(page);

         // Create the bitmap to draw the objects to
         using(Bitmap btmp = new Bitmap(page.PixelWidth, page.PixelHeight))
         {
            btmp.SetResolution((float)page.DpiX, (float)page.DpiY);
            using(Graphics g = Graphics.FromImage(btmp))
            {
               g.Clear(Color.White);

               // Render the objects
               // Text is a word at a time: accumulate the characters, their combined
               // bounds and the largest font size seen until an end marker is reached
               LogicalRectangle textRect = LogicalRectangle.Empty;
               double textFontHeight = 0;
               StringBuilder textWord = new StringBuilder();
               foreach(DocumentCharacter character in pageText.Characters)
               {
                  // Add the text code and rects together
                  textWord.Append(character.Code);
                  if(textRect.IsEmpty)
                  {
                     textRect = character.Bounds;
                  }
                  else
                  {
                     textRect = LogicalRectangle.Union(textRect, character.Bounds);
                  }
                  textFontHeight = Math.Max(textFontHeight, character.FontSize);

                  // If this is the last object in a word, render it
                  if(character.IsEndOfWord || character.IsEndOfLine || character.IsEndOfParagraph
                     || character.IsEndOfPage)
                  {
                     RenderText(g, pageText, textWord.ToString(), textRect, character, textFontHeight);

                     // Start the next word from a clean state. The font height must be
                     // reset as well, otherwise the largest size seen so far on the page
                     // would be applied to every following word.
                     textWord = new StringBuilder();
                     textRect = LogicalRectangle.Empty;
                     textFontHeight = 0;
                  }
               }

               // Save the result as PNG
               using(SaveFileDialog saveDlg = new SaveFileDialog())
               {
                  saveDlg.Filter = "PNG files|*.png";
                  if(saveDlg.ShowDialog() == DialogResult.OK)
                  {
                     btmp.Save(saveDlg.FileName, System.Drawing.Imaging.ImageFormat.Png);
                  }
               }
            }
         }
      }
      finally
      {
         // Always close the parse session opened by BeginParse, even on failure
         reader.ObjectManager.EndParse();
      }
   }
   finally
   {
      // Release the OCR engine (if one was created) and the reader, even on failure
      if(ocrEngine != null)
      {
         ocrEngine.Dispose();
      }
      reader.Dispose();
   }
}
/// <summary>
/// Draws a single word centered inside its bounding rectangle, using the font
/// referenced by the supplied character.
/// </summary>
/// <param name="g">Graphics surface to draw on; its DpiY is used to scale the font size.</param>
/// <param name="pageText">Page text whose Fonts list is indexed by the character's FontIndex.</param>
/// <param name="text">The word to draw.</param>
/// <param name="textRect">Bounds of the word; X, Y, Width and Height are truncated to integers.</param>
/// <param name="character">Character whose FontIndex selects the font.</param>
/// <param name="textFontHeight">Font height of the word (assumed to be in pixels - TODO confirm).</param>
private static void RenderText(Graphics g, DocumentPageText pageText, string text,
   LogicalRectangle textRect, DocumentCharacter character,
   double textFontHeight)
{
   DocumentFont documentFont = pageText.Fonts[character.FontIndex];

   // An empty face name could mean an embedded font; Arial is used in that case
   string family = documentFont.FaceName;
   if(string.IsNullOrEmpty(family))
   {
      family = "Arial";
   }

   // Map each document style flag that is set onto the matching System.Drawing flag
   DocumentFontStyle sourceStyle = documentFont.FontStyle;
   FontStyle drawingStyle = FontStyle.Regular;
   if((sourceStyle & DocumentFontStyle.Bold) == DocumentFontStyle.Bold)
   {
      drawingStyle = drawingStyle | FontStyle.Bold;
   }
   if((sourceStyle & DocumentFontStyle.Italic) == DocumentFontStyle.Italic)
   {
      drawingStyle = drawingStyle | FontStyle.Italic;
   }
   if((sourceStyle & DocumentFontStyle.Underline) == DocumentFontStyle.Underline)
   {
      drawingStyle = drawingStyle | FontStyle.Underline;
   }

   // This Font constructor takes its size in points, hence the 72 / DpiY scaling
   float emSize = (float)textFontHeight * 72 / g.DpiY;

   using(Font wordFont = new Font(family, emSize, drawingStyle))
   {
      // Integer pixel rectangle the word is laid out in
      Rectangle target = new Rectangle(
         (int)textRect.X,
         (int)textRect.Y,
         (int)textRect.Width,
         (int)textRect.Height);

      using(StringFormat layout = new StringFormat())
      {
         // Center in both directions and never clip or wrap the word
         layout.Alignment = StringAlignment.Center;
         layout.LineAlignment = StringAlignment.Center;
         layout.FormatFlags = layout.FormatFlags | StringFormatFlags.NoClip | StringFormatFlags.NoWrap;
         g.DrawString(text, wordFont, Brushes.Black, target, layout);
      }
   }
}
Target Platforms: Windows 7, Windows Vista SP1 or later, Windows XP SP3, Windows Server 2008 (Server Core not supported), Windows Server 2008 R2 (Server Core supported with SP1 or later), Windows Server 2003 SP2
Raster .NET | C API | C++ Class Library | JavaScript HTML5
Document .NET | C API | C++ Class Library | JavaScript HTML5
Medical .NET | C API | C++ Class Library | JavaScript HTML5
Medical Web Viewer .NET