To extract the text contained in PDF files, I use the following method:
Code: Select all
/// <summary>
/// Extracts the text of every page of a PDF file and returns it as a list of
/// trimmed, non-blank lines.
/// </summary>
/// <param name="pathToFile">Path of the PDF file to read.</param>
/// <returns>
/// The non-blank text lines of all pages in page order; an empty list when
/// <paramref name="pathToFile"/> does not exist.
/// </returns>
/// <exception cref="InvalidOperationException">
/// A PXCp library call reported a failure code.
/// </exception>
public override List<String> ExtractText(String pathToFile) {
    List<String> result = new List<String>();
    if (!File.Exists(pathToFile)) {
        return result;
    }

    int objectPtr = 0;
    int res = XCPro40_Defs.PXCp_Init(out objectPtr, XCPro40_Defs.g_RegKey, XCPro40_Defs.g_DevCode);
    ThrowIfPxcFailed(res, "Error initializing PDF library: ");

    try {
        res = XCPro40_Defs.PXCp_ReadDocumentW(objectPtr, pathToFile, 0);
        ThrowIfPxcFailed(res, "Error reading document: ");

        int pagesCount = 0;
        res = XCPro40_Defs.PXCp_GetPagesCount(objectPtr, out pagesCount);
        ThrowIfPxcFailed(res, "Error getting page count: ");

        // PXCp_ET_Finish runs once PXCp_ET_Prepare has been attempted, even if
        // the attempt (or any later step) fails.
        res = XCPro40_Defs.PXCp_ET_Prepare(objectPtr);
        try {
            ThrowIfPxcFailed(res, "Error preparing text extraction: ");

            XCPro40_Defs.PXP_TETextComposeOptions composeOptions;
            res = XCPro40_Defs.PXCp_ET_GetCurrentComposeParams(objectPtr, out composeOptions);
            ThrowIfPxcFailed(res, "Error preparing text extraction: ");

            composeOptions.ComposeMethod = XCPro40_Defs.PXP_TE_TextComposeMethod.TETCM_PreserveOrder;
            composeOptions.AddSpaces = XCPro40_Defs.PXP_TE_AddSpaces.TEAS_Single;
            composeOptions.Undecoded = XCPro40_Defs.PXP_TE_UnecodedCharacters.TEUC_KeepOriginal;

            string[] lineSeparator = new[] { "\r\n" };
            IntPtr pBuffer = IntPtr.Zero;
            // Kept outside the loop because it is passed by ref: each call
            // receives the value left by the previous one.
            int ln = 0;

            for (int pageIndex = 0; pageIndex < pagesCount; pageIndex++) {
                res = XCPro40_Defs.PXCp_ET_GetPageContentAsTextW(objectPtr, pageIndex, ref composeOptions, out pBuffer, ref ln);
                ThrowIfPxcFailed(res, "Error extracting text: ");

                // Nothing to marshal for this page; PtrToStringUni would return
                // null for a zero pointer.
                if (pBuffer == IntPtr.Zero || ln <= 0) {
                    continue;
                }

                // NOTE(review): the buffer returned in pBuffer is never released
                // here - check the SDK documentation for whether the caller owns it.
                string pageText = Marshal.PtrToStringUni(pBuffer, ln).Replace("\0", "");

                // Split the page content into lines and keep the non-blank ones.
                foreach (string line in pageText.Split(lineSeparator, StringSplitOptions.RemoveEmptyEntries)) {
                    string trimmed = line.Trim();
                    if (trimmed.Length > 0) {
                        result.Add(trimmed);
                    }
                }
            }
        }
        finally {
            XCPro40_Defs.PXCp_ET_Finish(objectPtr);
        }
    }
    finally {
        // Release the document object on every path after a successful init.
        XCPro40_Defs.PXCp_Delete(objectPtr);
    }

    return result;
}

/// <summary>
/// Throws an <see cref="InvalidOperationException"/> when <paramref name="res"/>
/// is a failure code according to <c>PXCp_Error.IS_DS_FAILED</c>.
/// </summary>
/// <param name="res">Return code of a PXCp library call.</param>
/// <param name="messagePrefix">Text placed before the library's error string.</param>
private static void ThrowIfPxcFailed(int res, string messagePrefix) {
    if (PXCp_Error.IS_DS_FAILED(res)) {
        throw new InvalidOperationException(messagePrefix + PXCp_Error.GetDSErrorString(res));
    }
}
The result looks very strange.
Example:
Code: Select all
N O P Q R S T U
SMM SMM SMM SMM
^ ^
HmjMS_NgMPM HmjMS_NgMPM HmjMS_NgMPM HmjMS_NgMPM
J^dMN J^dMO J^dMP J^dMQ
NNWMS===OMMTJMUJNP
_ _
` `
OOMM
a a
phPPORKNMM phPPORKNMM phPPORKNMM phPPORKNMM
b b
_^v R obsW^bkaborkd j^u^r QQNMMNMM=L=OQNMMNMM OMKMTKMQ a~íìã Z^^NSK^NP
bfk_^rmi^k=jlarimi^qqb
MTKOMMT
MOKOMMS
_^v Q obsW^bkaborkd s^ofqlm==PMKOQQRU sboeKL_boke^oa _É~êÄK gMPMJ^dKK
aobepqolj^kqofb_b
rkqboi^dbkJkoK ^rcqo^dpJkoK
>
NOKOMMQ
P obsWf_p pqlo^bkpl tboh=j^u^r _rohe^oaq dÉéêK _ä~íí N
MQMQOU QQNMMNMM
K
k~ãÉ
„åÇÉêìåÖ a~íìã wìëíK kçêã rêëéêKW bêëKÑKW bêëKÇKW S_äK
siPMPig
@support: Please contact me so that I can send you an example PDF file privately, since its content is confidential.
Thank you!
Best regards,
cew