PDF OCR Compressor SDK
PDF OCR Compressor SDK

PDF OCR Compressor Toolkit (SDK) Reference Manual

USD3495

Free Download PDF OCR Compressor SDK Purchase PDF OCR Compressor SDK

PDF OCR SDK Reference Manual

Download Evaluation Version
Purchase Full Version

/* Bounding box and text of a single OCRed character or word. */
typedef struct STEXTPOStag {
    int  x;          /* left position of the character or word */
    int  y;          /* top position of the character or word */
    int  width;      /* width of the character or word */
    int  height;     /* height of the character or word */
    char text[500];  /* text contents */
} STEXTPOS;

int WINAPI Image2PDFOCR_SinglePage_GetTextInfo(char *lpszPDFFile, LPBYTE *lpChars, char *lpszOptions)
Description
    This function performs OCR on a PDF file or an image file and returns the recognized text information to the calling application. It supports single-page PDF files only.

Parameters
    lpszPDFFile
        [in] Input PDF filename.
    lpChars
        [out] OCRed text contents, it is a pointer to the STEXTPOS structure.
    lpszOptions
        [in] Set the options for OCR process, this parameter does support following options,
       
-pidpi: Set the DPI resolution for render PDF page and OCR.
        -firstpg: First page to be OCRed.
        -lastpg: Last page to be OCRed.

Return Values
    If the function succeeds, the return value is the number of STEXTPOS structures returned through the lpChars parameter.

Example


LPBYTE lpChars = NULL;
int nCharCount = Image2PDFOCR_SinglePage_GetTextInfo(szInFile, &lpChars, "-pidpi 300 -firstpg 1 -lastpg 1");
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int i = 0; i < nCharCount; i++)
{
    printf(
"%d,%d,%d,%d,%s\n", lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width, lpTextPos[i].height, lpTextPos[i].text);
}


void
WINAPI Image2PDFOCR_SinglePage_FreeTextInfo(LPBYTE lpChars)
Description
    Deallocates or frees a memory block.

Parameters
    lpChars
        [in] OCRed text contents, it is a pointer to the STEXTPOS structure, it is returned by Image2PDFOCR_SinglePage_GetTextInfo function.

Return Values
    None.

Example

/* Release the text buffer that Image2PDFOCR_SinglePage_GetTextInfo returned in lpChars. */
Image2PDFOCR_SinglePage_FreeTextInfo(lpChars);


int
WINAPI Image2PDFOCR_SinglePage_CreatePDF(char *lpszInPDFFile, LPBYTE lpChars, int nCharCount, char *lpszOutPDFFile, char *lpszOptions)
Description
    Create searchable PDF file.

Parameters
    lpszInPDFFile
       
[in] Input PDF filename.
    lpChars
        [in] OCRed text contents, it is a pointer to the STEXTPOS structure.
    nCharCount
        [in] The number of STEXTPOS structures pointed to by lpChars.
    lpszOutPDFFile
        [in] Filename of the searchable PDF file to create.
    lpszOptions

        [in] Set the options for PDF creating, this parameter does support following options,
       
-pidpi: Set the DPI resolution used to render the PDF page and OCR it; the value of this option should be the same as the -pidpi option passed to the Image2PDFOCR_SinglePage_GetTextInfo function.
        -firstpg: First page to merge the text information.
        -lastpg: Last page to merge the text information.

Return Values
    If the function succeeds, the return value is zero. If the function fails, the return value is one of following values,
        -1: Can't find input PDF file.
        -2: Can't load DLL files correctly.
        -3: Something is wrong in lpChars structure.

Example

/* Example: OCR the first page of test2.pdf, print the recognised text, then
 * write a searchable copy of the PDF.
 * GetModulePath is a helper of the sample program that is not shown in this
 * manual; presumably it builds a full path for the given file name -- confirm. */
char szInFile[MAX_PATH] = {0};
char szOutFile[MAX_PATH] = {0};
GetModulePath(szInFile, "test2.pdf");
GetModulePath(szOutFile, "test2_pdf_ocred-singlepage.pdf");

LPBYTE lpChars = NULL;
int nCharCount = Image2PDFOCR_SinglePage_GetTextInfo(szInFile, &lpChars, "-pidpi 300 -firstpg 1 -lastpg 1");
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int i = 0; i < nCharCount; i++)
{
    printf("%d,%d,%d,%d,%s\n", lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width, lpTextPos[i].height, lpTextPos[i].text);
}

/* nRet is declared here so the example is self-contained (it was previously
 * used without a declaration). The -pidpi value matches the one used for OCR above. */
int nRet = Image2PDFOCR_SinglePage_CreatePDF(szInFile, lpChars, nCharCount, szOutFile,
                                             "-pidpi 300 -firstpg 1 -lastpg 1");

/* The text buffer is no longer needed once the PDF has been created. */
Image2PDFOCR_SinglePage_FreeTextInfo(lpChars);
printf("Example #2 return '%d'\n", nRet);


int WINAPI Image2PDFOCR_PDFCmd(char *lpszCmd)
Description
    Process PDF file by some special commands.

Parameters
   
lpszCmd
        [in] Command line used to process the PDF file. The following commands are supported:
            -mergepdf: merge more PDF files into one PDF file.

Return Values
    If the function succeeds, the return value is zero. If the function fails, the return value is one of following values,
        -1: Input wrong command line.
        -2: Can't load DLL files correctly.

Example

/* Build the -mergepdf command line and run it.
 * NOTE(review): C:\allfiles.txt is presumably a list of the PDF files to merge and
 * C:\out.pdf the merged result -- confirm the exact -mergepdf argument format. */
char szMergePDFCmd[1024] = "-mergepdf C:\\allfiles.txt C:\\out.pdf";
int nRet = Image2PDFOCR_PDFCmd(szMergePDFCmd);


int WINAPI Image2PDFOCR_CreateSearchablePDF(const char *lpTIFOrPDFFile, const char *lpOutputFile, const char *lpOptions)
Description
    Convert TIFF or PDF file to searchable PDF file directly.

Parameters
    lpTIFOrPDFFile
        [in] Input TIFF or PDF filename.
    lpOutputFile
        [in] Output PDF filename.
    lpOptions
        [in] Options to control the conversion.

Return Values
    If the function succeeds, the return value is zero. If the function fails, the return value is one of following values,
        1: Can't find input TIFF or PDF file.
        -2: Can't load DLL files correctly.
        2: Something is wrong during conversion.

Example

/* Example: convert test.tif straight to a searchable PDF using default options.
 * GetModulePath is a helper of the sample program that is not shown in this
 * manual; presumably it builds a full path for the given file name -- confirm. */
char szInFile[MAX_PATH] = {0};
char szOutFile[MAX_PATH] = {0};
GetModulePath(szInFile, "test.tif");
GetModulePath(szOutFile, "test_tif_ocred.pdf");

/* nRet is declared here so the example is self-contained (it was previously
 * used without a declaration). An empty option string requests no extra options. */
int nRet = Image2PDFOCR_CreateSearchablePDF(szInFile, szOutFile, "");
printf("Example #1 return '%d'\n", nRet);


HANDLE WINAPI Image2PDFOCR_GetTextHandle(
char *lpszTIFOrPDFFile, char *lpszOptions)
Description
    OCR TIFF or PDF file and return a handle.

Parameters
    lpszTIFOrPDFFile
        [in] Input TIFF or PDF filename.

    lpszOptions
        [in] Set the options for OCR process, this parameter does support following options,
       
-pidpi: Set the DPI resolution for render PDF page and OCR.
        -firstpg: First page to be OCRed.
        -lastpg: Last page to be OCRed.
        -ocrrect: OCR only the text inside a rectangle. The rectangle is given in pixels as [X, Y, Width, Height]. For example, to OCR the text inside the rectangle [74, 47, 200, 65] on the PDF page (PDF page coordinates, 72 units per inch) with the page rendered at 300 DPI, convert the rectangle to its position on the OCRed (rendered) page as follows (VB.NET sample),

            ' "Coordinate on PDF page" * "the value of -pidpi" / 72 = "Coordinate on OCRed PDF page"
            ' NOTE: this sample is VB.NET, where "/" is floating-point division (300 / 72 = 4.1666...).
            ' When porting to C/C++, write 300.0 / 72 so the scale factor is not truncated to 4.
            x = (74 * (300 / 72))
            y = (47 * (300 / 72))
            w = (200 * (300 / 72))
            h = (65 * (300 / 72))           
            ' Join the scaled values into the "X,Y,Width,Height" string expected by -ocrrect.
            Dim socrrect As String
            socrrect = x.ToString & "," & y.ToString & "," & w.ToString & "," & h.ToString
            ' The rectangle is wrapped in double quotes inside the option string.
            strOptions = "-pidpi 300 -ocrrect """ & socrrect & """ -firstpg 1 -lastpg 1"
            Dim hOCRTextSDK As Integer = Image2PDFOCR_GetTextHandle(strInFile, strOptions)

Return Values
    If the function succeeds, the return value is an open handle to the OCRed contents. If the function fails, the return value is NULL.

Example

/* OCR the input file with pages rendered at 300 DPI; the returned handle is NULL on failure. */
HANDLE hOCRTextSDK = Image2PDFOCR_GetTextHandle(szInFile, "-pidpi 300");


int WINAPI Image2PDFOCR_GetOCRedPageCount(HANDLE hImage2PDFData);
Description
    Get the number of OCRed pages from a handle.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.

Return Values
    The number of OCRed pages.


int WINAPI Image2PDFOCR_GetTextInfo(HANDLE hImage2PDFData, int nPage, LPBYTE *lpOutTextInfo);
Description
    Read text information from a handle.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.
    nPage
        [in] Specify page number to retrieve text information.
    lpOutTextInfo
        [out] OCRed text contents; receives a pointer to the STEXTPOS structures.

Return Values
   
If the function succeeds, the return value is the number of STEXTPOS structures returned through lpOutTextInfo. If the function fails, the return value is 0.

Example

LPBYTE lpChars = NULL;
int nCharCount = Image2PDFOCR_GetTextInfo(hOCRTextSDK, page, &lpChars);
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int i = 0; i < nCharCount; i++)
{
    printf(
"%d,%d,%d,%d,%s\n", lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width, lpTextPos[i].height, lpTextPos[i].text);
}


int WINAPI Image2PDFOCR_SetTextInfo(HANDLE hImage2PDFData, int nPage, LPBYTE lpInTextInfo, int nCharNum)
Description
    Modify OCRed text information.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.
    nPage
        [in] Specify the page number whose text information is to be modified.
    lpInTextInfo
        [in] Set modified text information, this is a pointer to the STEXTPOS structure.
    nCharNum
        [in] The number of STEXTPOS structures pointed to by lpInTextInfo.

Return Values
   
If the function succeeds, the return value is the number of modified STEXTPOS structures. If the function fails, the return value is 0.

Example

LPBYTE lpChars = NULL;
int nCharCount = Image2PDFOCR_GetTextInfo(hOCRTextSDK, page, &lpChars);
STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
for(int i = 0; i < nCharCount; i++)
{
    printf(
"%d,%d,%d,%d,%s\n", lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width, lpTextPos[i].height, lpTextPos[i].text);
}
if(lpTextPos)
{
    strcpy(lpTextPos->text,
"Image2PDF");
}
Image2PDFOCR_SetTextInfo(hOCRTextSDK, page, lpChars, nCharCount);


void WINAPI Image2PDFOCR_FreeTextHandle(HANDLE hImage2PDFData)
Description
    Free data handle.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.

Return Values
    None.


int WINAPI Image2PDFOCR_CreatePDF(HANDLE hImage2PDFData, char *lpszOutPDFFile, char *lpszOptions)
Description
    Create searchable PDF file.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.
    lpszOutPDFFile
        [in] output PDF filename.
    lpszOptions

        [in] Set the options for PDF creating, this parameter does support following options,
       
-pidpi: Set the DPI resolution used to render the PDF page and OCR it; the value of this option should be the same as the -pidpi option passed to the Image2PDFOCR_GetTextHandle function.
        -firstpg: First page to merge the text information.
        -lastpg: Last page to merge the text information.

Return Values
    If the function succeeds, the return value is zero. If the function fails, the return value is one of following values,
        -1: hImage2PDFData parameter is NULL.
        -2: hImage2PDFData parameter is not a valid handle.
        -3: Can't load DLL files correctly.

Example

/* Write the searchable PDF; -pidpi matches the value used when the handle was created. */
nRet = Image2PDFOCR_CreatePDF(hOCRTextSDK, szOutFile, "-pidpi 300");


int WINAPI Image2PDFOCR_CreatePDFInMemory(HANDLE hImage2PDFData, char *lpszOptions, LPBYTE *lpDataBuf, int *nDataBufLen)
Description
   
    Create searchable PDF file in memory.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.
    lpszOptions

        [in] Set the options for PDF creating, this parameter does support following options,
       
-pidpi: Set the DPI resolution used to render the PDF page and OCR it; the value of this option should be the same as the -pidpi option passed to the Image2PDFOCR_GetTextHandle function.
        -firstpg: First page to merge the text information.
        -lastpg: Last page to merge the text information.
    lpDataBuf
        [out] Set a pointer to receive searchable PDF stream in memory.
    nDataBufLen
        [out] Pointer to an int that receives the length, in bytes, of the data returned in lpDataBuf.

Return Values
    If the function succeeds, the return value is zero. If the function fails, the return value is one of following values,
        -1: hImage2PDFData parameter is NULL.
        -2: hImage2PDFData parameter is not a valid handle.
        -3: Can't load DLL files correctly.

Example

char szInFile[MAX_PATH] = {0};
char szOutFile[MAX_PATH] = {0};
GetModulePath(szInFile,
"test2.pdf");
GetModulePath(szOutFile,
"test2_pdf_ocred-in-memory.pdf");
int time1 = GetTickCount();
char *lpOptions = "-pidpi 300";
int nPageCount = Image2PDFOCR_GetPageCount(szInFile);
printf(
"'%s' file contains '%d' pages.\n", szInFile, nPageCount);
HANDLE hOCRTextSDK = Image2PDFOCR_GetTextHandle(szInFile, lpOptions);
if(hOCRTextSDK)
{
    int nPageCount = Image2PDFOCR_GetOCRedPageCount(hOCRTextSDK);
    for(int page = 0; page < nPageCount; page++)
    {
        LPBYTE lpChars = NULL;
        int nCharCount = Image2PDFOCR_GetTextInfo(hOCRTextSDK, page, &lpChars);
        STEXTPOS *lpTextPos = (STEXTPOS*)lpChars;
        for(int i = 0; i < nCharCount; i++)
        {
            printf("%d,%d,%d,%d,%s\n", lpTextPos[i].x, lpTextPos[i].y, lpTextPos[i].width, lpTextPos[i].height, lpTextPos[i].text);
        }
        if(lpTextPos)
        {
            strcpy(lpTextPos->text, "Image2PDF");
        }
        printf("Update text info for page %d\n", page+1);
        Image2PDFOCR_SetTextInfo(hOCRTextSDK, page, lpChars, nCharCount);
    }
    LPBYTE lpDataBuf = NULL;
    int nDataBufLen = 0;
    nRet = Image2PDFOCR_CreatePDFInMemory(hOCRTextSDK, lpOptions, &lpDataBuf, &nDataBufLen);
    if(lpDataBuf != NULL && nDataBufLen > 0)
    {
        FILE *file = fopen(szOutFile, "wb");
        if(file)
        {
            fwrite(lpDataBuf, 1, nDataBufLen, file);
            fclose(file);
        }
    }
    Image2PDFOCR_FreeTextHandle(hOCRTextSDK);
}
int time2 = GetTickCount();
printf(
"Example return '%d', it is spend %dms (%.2fs)...\n", nRet, time2-time1, (time2-time1)/1000.0);


int WINAPI Image2PDFOCR_GetPageCount(char *lpszPDFFile)
Description
    Read page count from PDF file.

Parameters
   
lpszPDFFile
        [in] input PDF filename.

Return Values
    If the function succeeds, the return value is count of PDF pages. If the function fails, the return value is zero.


int WINAPI Image2PDFOCR_GetWordCountOnPage(HANDLE hImage2PDFData, int nPage)
Description
    Get word count from OCRed page contents.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.
    nPage
        [in] Specify page number to get the word count.

Return Values
    If the function succeeds, the return value is count of word contents. If the function fails, the return value is zero.

Example

/* Walk every word on every OCRed page, replace the first word of each page
 * with "Image2PDF" and write each word back unchanged otherwise. */
int nPageCount = Image2PDFOCR_GetOCRedPageCount(hOCRTextSDK);
for(int page = 0; page < nPageCount; page++)
{
    int nWordCount = Image2PDFOCR_GetWordCountOnPage(hOCRTextSDK, page);
    for(int nWordIndex = 0; nWordIndex < nWordCount; nWordIndex++)
    {
        int X = 0, Y = 0, Width = 0, Height = 0;
        char szText[500] = {0};

        /* Image2PDFOCR_GetWordInfoByIndex returns 1 on success and 0 on failure.
         * Skip the word on failure so stale values are never written back. */
        if(Image2PDFOCR_GetWordInfoByIndex(hOCRTextSDK, page, nWordIndex, &X, &Y, &Width, &Height, szText) != 1)
        {
            continue;
        }

        if(nWordIndex == 0)
        {
            strcpy(szText, "Image2PDF");
        }
        Image2PDFOCR_SetWordInfoByIndex(hOCRTextSDK, page, nWordIndex, X, Y, Width, Height, szText);
    }
}


int WINAPI Image2PDFOCR_GetWordInfoByIndex(HANDLE hImage2PDFData, int nPage, int nWordIndex, int *X, int *Y, int *Width, int *Height, char *lpText)
Description
    Get Word information by given index.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.
    nPage
        [in] Specify page number to get the word information.
    nWordIndex
        [in] Specify word index to get the word information.
    X, Y, Width, Height
        [out] Receive word's X, Y, Width, Height information.
    lpText

        [out] Receives the word's text information; the lpText buffer should be at least 500 characters long (the examples in this manual use a 500-byte buffer).

Return Values
    If the function succeeds, the return value is 1. If the function fails, the return value is zero.


int WINAPI Image2PDFOCR_SetWordInfoByIndex(HANDLE hImage2PDFData, int nPage, int nWordIndex, int X, int Y, int Width, int Height, char *lpText)
Description
   
Set Word information by given index.

Parameters
   
hImage2PDFData
        [in] This parameter is returned by Image2PDFOCR_GetTextHandle function.
    nPage
        [in] Specify page number to set the word information.
    nWordIndex
        [in] Specify word index to set the word information.
    X, Y, Width, Height
        [in] Set word's X, Y, Width, Height information.
    lpText

        [in] Set word's text information.

Return Values
    If the function succeeds, the return value is 1. If the function fails, the return value is zero.

PDF OCR Compressor SDK Copyright © 2002- ImagePDF Software all rights reserved. Home | Products | Download | Purchase | Support | Resources PDF OCR Compressor SDK