/// <summary>
/// Loads a document, obtains the text of each page through the LEAD OCR engine and
/// passes every page's words to <c>SplitPageWords</c>.
/// </summary>
/// <param name="_documentFile">Path of the document file to process.</param>
void ExtractDocumentText(string _documentFile)
{
    // The engine is created first so that it outlives the document that references it.
    using (IOcrEngine _ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.LEAD))
    {
        _ocrEngine.Startup(null, null, null, @"C:\LEADTOOLS22\Bin\Common\OcrLEADRuntime");
        try
        {
            // One codecs instance is enough for querying the information of every page.
            using (RasterCodecs infoCodecs = new RasterCodecs())
            using (LEADDocument _document = DocumentFactory.LoadFromFile(_documentFile, new LoadDocumentOptions()))
            {
                _document.Text.OcrEngine = _ocrEngine;

                int pageNumber = 0;
                foreach (DocumentPage _page in _document.Pages)
                {
                    pageNumber++; // pages are 1-based

                    DocumentPageText _pageText = _page.GetText();
                    _pageText.BuildWords();

                    double sizeFactor;
                    using (CodecsImageInfo pageInfo = infoCodecs.GetInformation(_documentFile, false, pageNumber))
                    {
                        // Page height in pixels when rendered at 300 dpi, divided by the page
                        // height in document units: the factor that scales word bounds to pixels.
                        double sizeAt300 = pageInfo.Height * 300.0 / pageInfo.YResolution;
                        sizeFactor = sizeAt300 / _page.Size.Height;
                    }

                    SplitPageWords(_documentFile, pageNumber, sizeFactor, _pageText.Words);
                }
            }
        }
        finally
        {
            // Pair every successful Startup with a Shutdown, even when a page fails.
            _ocrEngine.Shutdown();
        }
    }
}
/// <summary>
/// Loads one page of <paramref name="inputFile"/> at 300 dpi and writes two PDF files
/// next to it: one holding the words whose bounds end above the vertical middle of the
/// page ("_Top") and one holding the remaining words ("_Bot"). In each output the
/// image area outside the kept words is filled with white and the word text is drawn
/// into an EMF overlay.
/// </summary>
/// <param name="inputFile">Source file; also used as the prefix of the output file names.</param>
/// <param name="pageNumber">1-based number of the page to load.</param>
/// <param name="sizeFactor">Multiplier that converts word bounds to pixels of the loaded image.</param>
/// <param name="words">Words of the page; null is treated as an empty list.</param>
void SplitPageWords(string inputFile, int pageNumber, double sizeFactor, IList<DocumentWord> words)
{
    using (RasterCodecs codecs = new RasterCodecs())
    {
        codecs.Options.RasterizeDocument.Load.Resolution = 300;
        codecs.Options.Pdf.Load.DisplayDepth = 24;

        using (RasterImage img = codecs.Load(inputFile, pageNumber))
        using (RasterImage imgTop = img.Clone())
        using (RasterImage imgBot = img.Clone())
        using (MemoryStream msTop = new MemoryStream())
        using (MemoryStream msBot = new MemoryStream())
        using (Graphics gTempTop = CreateGraphics())
        using (Graphics gTempBot = CreateGraphics())
        {
            // The device contexts must be released even if anything below throws.
            var hdcTop = gTempTop.GetHdc();
            try
            {
                var hdcBot = gTempBot.GetHdc();
                try
                {
                    using (Metafile mfTop = new Metafile(msTop, hdcTop))
                    using (Metafile mfBot = new Metafile(msBot, hdcBot))
                    {
                        // Start with the whole image selected; word rectangles are removed
                        // from this region below so that only non-word areas get erased.
                        imgTop.AddRectangleToRegion(null, new LeadRect(0, 0, imgTop.Width, imgTop.Height), RasterRegionCombineMode.Set);
                        imgBot.AddRectangleToRegion(null, new LeadRect(0, 0, imgBot.Width, imgBot.Height), RasterRegionCombineMode.Set);

                        using (Graphics gEmfTop = Graphics.FromImage(mfTop))
                        using (Graphics gEmfBot = Graphics.FromImage(mfBot))
                        using (Font baseFont = SystemFonts.DefaultFont)
                        {
                            gEmfTop.DrawRectangle(Pens.White, 0, 0, imgTop.Width, imgTop.Height);
                            gEmfBot.DrawRectangle(Pens.White, 0, 0, imgBot.Width, imgBot.Height);

                            if (words != null)
                            {
                                foreach (var word in words)
                                {
                                    LeadRect wordBounds = new LeadRect((int)(word.Bounds.X * sizeFactor), (int)(word.Bounds.Y * sizeFactor), (int)(word.Bounds.Width * sizeFactor), (int)(word.Bounds.Height * sizeFactor));
                                    if (wordBounds.Width <= 0 || wordBounds.Height <= 0)
                                    {
                                        // Nothing to keep or draw for a degenerate rectangle.
                                        continue;
                                    }

                                    bool isTopHalf = wordBounds.Bottom < img.Height / 2;
                                    RasterImage targetImage = isTopHalf ? imgTop : imgBot;
                                    Graphics targetGraphics = isTopHalf ? gEmfTop : gEmfBot;

                                    // Add the word rectangle to the "don't erase" region
                                    targetImage.AddRectangleToRegion(null, wordBounds, RasterRegionCombineMode.AndNotRegion);

                                    // Scale the font so the measured text width matches the word box.
                                    // Text that measures to nothing (null, empty, very short) would give
                                    // a zero, negative or infinite font size, which Font rejects.
                                    float measuredWidth = gEmfTop.MeasureString(word.Value, baseFont).Width - 4f;
                                    if (measuredWidth <= 0f)
                                    {
                                        continue;
                                    }

                                    float factor = wordBounds.Width / measuredWidth;
                                    using (Font font = new Font(baseFont.FontFamily, baseFont.Size * factor))
                                    {
                                        targetGraphics.DrawString(word.Value, font, Brushes.White, wordBounds.X, wordBounds.Y);
                                    }
                                }
                            }
                        } // disposing the EMF graphics objects finishes drawing the text

                        // Fill the non-words area in both images with white to erase it
                        FillCommand fill = new FillCommand(RasterColor.White);
                        fill.Run(imgTop);
                        fill.Run(imgBot);

                        WriteHalfPagePdf(imgTop, mfTop, inputFile + $"page{pageNumber}_Top.pdf");
                        WriteHalfPagePdf(imgBot, mfBot, inputFile + $"page{pageNumber}_Bot.pdf");
                    }
                }
                finally
                {
                    gTempBot.ReleaseHdc();
                }
            }
            finally
            {
                gTempTop.ReleaseHdc();
            }
        }
    }
}

/// <summary>
/// Writes a single-page PDF/A file that combines a raster image with an EMF overlay.
/// </summary>
/// <param name="image">Raster image placed on the page.</param>
/// <param name="metafile">Recorded metafile whose drawing is finished; supplies the EMF handle.</param>
/// <param name="outputFile">Path of the PDF file to create.</param>
void WriteHalfPagePdf(RasterImage image, Metafile metafile, string outputFile)
{
    DocumentWriterEmfPage page = new DocumentWriterEmfPage();
    page.Image = image;
    // NOTE(review): System.Drawing documents that the caller owns the handle returned by
    // GetHenhmetafile and should free it with DeleteEnhMetaFile; nothing here does that.
    // Confirm whether DocumentWriter takes ownership before adding a release.
    page.EmfHandle = metafile.GetHenhmetafile();

    DocumentWriter docWriter = new DocumentWriter();
    PdfDocumentOptions pdfOptions = (PdfDocumentOptions)docWriter.GetOptions(DocumentFormat.Pdf);
    pdfOptions.DocumentType = PdfDocumentType.PdfA;
    pdfOptions.ImageOverText = true;
    pdfOptions.DocumentResolution = image.XResolution;
    pdfOptions.EmptyPageResolution = image.XResolution;
    docWriter.SetOptions(DocumentFormat.Pdf, pdfOptions);

    docWriter.BeginDocument(outputFile, DocumentFormat.Pdf);
    docWriter.AddPage(page);
    docWriter.EndDocument();
}