OCR Code: No failures, works on all documents
Code: Select all
// Runs the "op.document.OCRPages" operation on the PDF at inputPDF and writes the
// result back to the same path. `pages` selects the pages to process: an empty
// array, or the single value -1, means every page.
// doc, Op, input and options are declared outside this fragment.
Init();
PDFXEdit.IPXC_Inst pxcInst = (PDFXEdit.IPXC_Inst)viewerInstance.GetExtension("PXC");

// Resolve the operation ID before opening the document so that nothing that can
// fail sits between opening the document and entering the try block.
int nID = viewerInstance.Str2ID("op.document.OCRPages", false);

doc = pxcInst.OpenDocumentFromFile(inputPDF, clbk);
try
{
    Op = viewerInstance.CreateOp(nID);
    input = Op.Params.Root["Input"];
    input.v = doc;
    options = Op.Params.Root["Options"];

    bool processAllPages = pages.Length == 0 || (pages.Length == 1 && pages[0] == -1);
    if (processAllPages)
    {
        options["PagesRange.Type"].v = "All";
    }
    else
    {
        // Build a comma-separated page list, e.g. "1,3,7".
        options["PagesRange.Type"].v = "Exactly";
        options["PagesRange.Text"].v = string.Join(",", Array.ConvertAll(pages, p => Convert.ToString(p)));
    }

    // NOTE(review): the meaning of OutputType 0 is not visible here - confirm against the SDK docs.
    options["OutputType"].v = 0;
    options["OutputDPI"].v = 300;

    Logger.Log("Attempting to execute OCR on document: " + inputPDF, 5);
    Op.Do();
    doc.WriteToFile(inputPDF);
    Logger.Log("PDF File: " + inputPDF + " had OCR completed", 5);
}
finally
{
    // Close the document even when the operation or the save throws, so the
    // document is never left open on the failure path.
    doc.Close();
}

options.Clear();
input.Clear();
pxcInst = null;
Code: Select all
// Walks every character of one page's extracted text, reading its value, flags and
// bounding rectangle. `count` (the page index) comes from code outside this excerpt.
IPXC_PageText pageText = this.pdf.Pages[count].GetText(null);
// Flag values kept in locals; their use is in code omitted from this excerpt.
PXC_TextCharFlags lineStart = PXC_TextCharFlags.TCF_LineBegin;
PXC_TextCharFlags paraStart = PXC_TextCharFlags.TCF_ParaBegin;
//PXC_TextCharFlags wordStart = PXC_TextCharFlags.TCF_WordBegin;
// Combined mask of the whitespace/separator flags; presumably marks where a word ends - usage not shown here.
PXC_TextCharFlags wordStop = PXC_TextCharFlags.TCF_WhiteSpace | PXC_TextCharFlags.TCF_WordSeparator | PXC_TextCharFlags.TCF_SearchWordSeparator | PXC_TextCharFlags.TCF_SyntheticSpace;
// Accumulator state, initialised here and not used in the visible part of the loop.
DolphinOCRWordBlock ocrWordBlock = null;
DolphinOCRLine ocrLine = null;
string formingWord = "";
double startingX = 0;
double startingY = 0;
DolphinOCRCharacter prevChar = null;
// Visit each character on the page by index.
for (uint charLoop = 0; charLoop < pageText.CharsCount; charLoop++)
{
// The character at this index, converted to a one-character string.
string charInfo = Convert.ToString(Convert.ToChar(pageText.Char[charLoop]));
uint flags = 0;
// Fetch the flags for this single character (start index charLoop, count 1).
pageText.GetCharsFlags(charLoop, 1, out flags);
// Bounding rectangle of the character; the rest of this statement was trimmed by the author.
pageText.CharRect[charLoop]; // I actually do more stuff with this but not important for this part.
}
The very first character, position 0 in the code above, correctly returns a "B". If I load this document in a PDF viewer and select the text, I also get "B", yet my code produces the data below:
Convert.ToString(Convert.ToChar(pageText.Char[0]))
"B"
pageText.CharRect[0]
{PDFXCoreAPI.PXC_Rect}
bottom: 345.83999633789062
left: 62.989992141723633
right: 76.390002250671387
top: 352.67475080490112
At first I wondered whether I had some type of matrix transformation incorrect, but it appears as though the "Bottom" and "Left" values are completely swapped. I can process other types of documents and I get exact coordinates. Any advice?