Tesseract-OCR 的 Windows 开发包(tesseract-3.02.02-win32-lib-include-dirs)下载地址:
Google-Tesseract-OCR
解压后得到 include 文件夹和 lib 文件夹。在工程中配置好头文件目录和库文件目录,然后建立工程进行测试。
#include "baseapi.h"
#include "strngs.h"
#pragma comment(lib,"libtesseract302.lib")
- // Recognize the text of a single image file with Tesseract; on success the
- // recognized text is stored in text_out.
- const char* str = "test.jpg"; // image to recognize (string literals are const)
- tesseract::TessBaseAPI api;
- // Initialize the engine and select the language pack: "chi_sim" is Simplified
- // Chinese, "eng" is English; a custom-trained language pack can be used too.
- // Init() returns 0 on success, so bail out early instead of running OCR on an
- // engine that failed to load its language data.
- if (api.Init(NULL, "chi_sim", tesseract::OEM_DEFAULT) != 0)
- {
-     return 0;
- }
- // Optional: restrict recognition to a fixed set of characters.
- //api.SetVariable( "tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" );
- STRING text_out;
- if (!api.ProcessPages(str, NULL, 0, &text_out))
- {
-     return 0;
- }
上面得到的 text_out.string() 是 UTF-8 编码的字符串,直接输出会出现乱码,因此需要转码。下面是一个转成 GBK 的函数(实际转换到系统 ANSI 代码页 CP_ACP,在简体中文 Windows 上即为 GBK):
- // Converts a NUL-terminated UTF-8 string to the system ANSI code page (CP_ACP).
- // NOTE: the result is GBK only when the active ANSI code page is 936
- // (e.g. Simplified Chinese Windows). Conversion stops at the first embedded
- // NUL of strUTF8. Returns an empty string if any conversion call fails.
- std::string UTF8ToGBK(const std::string& strUTF8)
- {
-     // Step 1: UTF-8 -> UTF-16. Because the source length is passed as -1, the
-     // reported size includes the terminating NUL; 0 signals failure.
-     const int wideLen = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0);
-     if (wideLen <= 0)
-     {
-         return std::string();
-     }
-     std::wstring wide(wideLen, L'\0'); // owns the buffer, so nothing leaks on any path
-     if (MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, &wide[0], wideLen) <= 0)
-     {
-         return std::string();
-     }
-
-     // Step 2: UTF-16 -> ANSI. The source is passed as a real wide-string
-     // pointer, so this compiles in both UNICODE and MBCS builds.
-     const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
-     if (ansiLen <= 0)
-     {
-         return std::string();
-     }
-     std::string ansi(ansiLen, '\0');
-     if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0)
-     {
-         return std::string();
-     }
-     ansi.resize(ansiLen - 1); // drop the terminating NUL written by the API
-     return ansi;
- }
由于要在 MFC 工程中使用,又写了一段把 UTF-8 结果转换为 Unicode(UTF-16)并存入 CString 的代码:
- // Convert the UTF-8 OCR output (text_out) to UTF-16 and store it in an MFC
- // CString. result stays empty if the conversion fails.
- CString result;
- // UTF-8 -> Unicode (UTF-16): first query the required size in wide
- // characters; with a source length of -1 it includes the terminating NUL.
- const int len = MultiByteToWideChar(CP_UTF8, 0, text_out.string(), -1, NULL, 0);
- if (len > 0)
- {
-     // std::wstring zero-fills all len wide characters (not just len bytes)
-     // and frees the buffer automatically when it goes out of scope.
-     std::wstring wide(len, L'\0');
-     if (MultiByteToWideChar(CP_UTF8, 0, text_out.string(), -1, &wide[0], len) > 0)
-     {
-         // Recognition result
-         result = wide.c_str();
-     }
- }
|