My program needs to determine if text is rotated or not. If the recognized text is rotated I need to process it, otherwise just ignore it. Please see attached image.
The problems I am struggling with are:
1. On the 90 deg rotated text, the OCR Confidence is still reported as very high (81-100%), making it difficult to make a decision based on this alone.
2. On the 90 deg rotated text, for the 'W' character, the Confidence is 100% and WordIsCertain is TRUE, again making it difficult to make a decision based on this (see Console Output below)
3. How can I tell whether an OcrWord 'word' was recognized by the dictionary or not? An OcrWord does not contain the collection of OcrCharacters that make up the word, and those carry more intelligence; i.e., you lose the intelligence the word's OcrCharacters contain when you work with OcrWords.
4. Leading Space Confidence always seems to be 100%.
5. I am grateful for ANY thoughts/ideas!
/// <summary>
/// Walks every page of <c>_document</c> and, for each zone of recognized
/// characters, prints the zone's words and per-character details to the
/// console. Each word's text is also added to <c>_lstWords.Items</c>.
/// Characters whose font style includes Bold additionally get the Italic and
/// Underline flags set before being stored back into the zone collection.
/// </summary>
void GetAllRecognizedWords()
{
    foreach (IOcrPage ocrPage in _document.Pages)
    {
        Console.WriteLine("Page {0} ---", _document.Pages.IndexOf(ocrPage));

        IOcrPageCharacters pageCharacters = ocrPage.GetRecognizedCharacters();
        foreach (IOcrZoneCharacters zoneCharacters in pageCharacters)
        {
            Console.WriteLine("Zone {0} --", pageCharacters.IndexOf(zoneCharacters));

            // Word-level view of the zone, with bounds requested in pixels.
            // 'var' keeps whatever collection type GetWords declares.
            var recogWords = zoneCharacters.GetWords(ocrPage.DpiX, ocrPage.DpiY, LogicalUnit.Pixel);
            foreach (OcrWord word in recogWords)
            {
                Console.WriteLine("Word: {0}, at {1}, characters index from {2} to {3}", word.Value, word.Bounds, word.FirstCharacterIndex, word.LastCharacterIndex);
                this._lstWords.Items.Add(word.Value);
            }

            // Character-level view of the same zone.
            for (int i = 0; i < zoneCharacters.Count; i++)
            {
                OcrCharacter zoneCharacter = zoneCharacters[i];
                Console.WriteLine(" Code: '{0}', Confidence: {1}, WordIsCertain: {2}, Leading Sp Conf {3}, GuessCode2 {4}, GuessCode3 {5}, Bounds: {6}, Position: {7}, FontSize: {8}, FontStyle: {9}",
                    zoneCharacter.Code,
                    zoneCharacter.Confidence,
                    zoneCharacter.WordIsCertain,
                    zoneCharacter.LeadingSpacesConfidence,
                    zoneCharacter.GuessCode2,
                    zoneCharacter.GuessCode3,
                    zoneCharacter.Bounds,
                    zoneCharacter.Position,
                    zoneCharacter.FontSize,
                    zoneCharacter.FontStyle);

                // If the character is bold, also mark it italic and underlined.
                if ((zoneCharacter.FontStyle & OcrCharacterFontStyle.Bold) == OcrCharacterFontStyle.Bold)
                {
                    zoneCharacter.FontStyle |= OcrCharacterFontStyle.Italic;
                    zoneCharacter.FontStyle |= OcrCharacterFontStyle.Underline;
                }

                // Store the (possibly modified) local copy back into the collection.
                // NOTE(review): nothing here hands pageCharacters back to the page
                // (e.g. via SetRecognizedCharacters) - confirm whether the style
                // changes are meant to persist in the document.
                zoneCharacters[i] = zoneCharacter;
            }
        }
    }
}
OCR Results:
SPACE // 0 deg text
W // 90 deg text
U
a
a
Console Output
Zone 0 --
Word: SPACE, at {X=209, Y=534, Width=75, Height=18 pixels}, characters index from 0 to 4
Code: 'S', Confidence: 100, WordIsCertain: True, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: None
Code: 'P', Confidence: 100, WordIsCertain: True, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: None
Code: 'A', Confidence: 100, WordIsCertain: True, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: None
Code: 'C', Confidence: 100, WordIsCertain: True, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: None
Code: 'E', Confidence: 100, WordIsCertain: True, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: EndOfLine, EndOfParagraph, EndOfWord, EndOfZone
Zone 1 --
Word: W, at {X=10, Y=349, Width=19, Height=12 pixels}, characters index from 0 to 0
Word: U, at {X=10, Y=364, Width=19, Height=13 pixels}, characters index from 1 to 1
Word: a, at {X=11, Y=380, Width=17, Height=12 pixels}, characters index from 2 to 2
Word: a, at {X=10, Y=395, Width=18, Height=13 pixels}, characters index from 3 to 3
Code: 'W', Confidence: 100, WordIsCertain: True, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: EndOfLine, EndOfWord
Code: 'U', Confidence: 93, WordIsCertain: False, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: EndOfLine, EndOfWord
Code: 'a', Confidence: 97, WordIsCertain: False, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: EndOfLine, EndOfWord
Code: 'a', Confidence: 81, WordIsCertain: False, Leading Sp Conf 100, GuessCode2 ^, GuessCode3 ^, Position: EndOfLine, EndOfParagraph, EndOfWord, EndOfZone, EndOfPage