VC 编程ANSI环境下读写Unicode文件
没有注意到文件编码的不同会产生这么多的问题,在动手以前查询了很多资料,在本博客中收藏了不少先辈的成果,在这里一并表示致敬! 关于ANSI和Unicode编码的原理在这里也不说了,主要讲下如何读写! 首先确定你的工程采用的是什么编码环境,默认是ANSI,不同的字符集读写文件的差别也比较大,我这里只在ANSI环境下做的,下一步再探索在Unicode环境下如何读写!(原先这个没搞懂,读了不少代码发现自己试验都是有误的)。 在ANSI的字符集下,CString等都是单字节版本的,所以一定要注意。而所要读取的Unicode文件却是双字节的,这里就要转换了,当然在ANSI字符集下,还是用二进制的方式打开Unicode文件,自己判断是否是换行,再转化成ANSI编码。而在写Unicode的时候,先将所有字符转化成Unicode编码再写入,而且在写文件之前一定要加上Unicode文件的标识。
下面是读 CFile mFile(UnicodefilePath,CFile::modeRead); byte head[2]; mFile.Read(head,2); if((head[0]==0xff&&head[1]==0xfe)||(head[0]==0xfe&&head[1]==0xff) ) { //AfxMessageBox(_T("File is Unicode!")); isUnicode = true; } if(isUnicode) mFile.Seek(2,CFile::begin); //0xfffe wchar_t wch; wchar_t wstr[300];
CString strvalue ; hile(mFile.Read((char *)&wch,2)>0) { if(wch==0x000D) //by line { //chang to ansi int nLen = i; char *buf = new char[2*nLen]; WideCharToMultiByte(CP_ACP, 0, wstr, nLen, buf, 2*nLen, NULL, NULL); buf[2*nLen-1] = 0; //some assertion failed,这个比较重要,小问题可以折腾人啊 strvalue = buf; mFile.Seek(2,CFile::current); //跳过行开头符号 i=0; } else { wstr[i++] = wch; } }
//下面是写 CStdioFile transFile; transFile.Open(strUnicodeSavepath,CFile::modeCreate|CFile::modeWrite|CFile::typeBinary); WORD wSignature = 0xFEFF; transFile.Write(&wSignature, 2); //Unicode的文件符号 CHAR *pszAnsi = new TCHAR[strvalue.GetLength()+1]; _tcscpy(pszAnsi, strvalue); WCHAR * szwBuffer = new WCHAR[strvalue.GetLength()+1]; MultiByteToWideChar(CP_ACP, 0, pszAnsi, -1, szwBuffer, strvalue.GetLength()+1); //write to files transFile.Write(szwBuffer, lstrlenW(szwBuffer) * sizeof(WCHAR));
当然你可以把你的工程设置成Unicode的字符集,这个时候在Unicode的工程下读取ANSI文件又是一个烦人的事情,将文件读到CString中的时候,每个单字节的ANSI被转成了双字节,需要自己来处理,后面我再探索探索再来记录。
本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/Augusdi/archive/2009/10/15/4677528.aspx
============================================================================
将CStdioFile类扩展,读取UNICODE文本文件
最近因为要读取SQL脚本文件,用CStdioFile来读取脚本文件,却在程序调试时读取不了文件。 后来看了一下文本文件格式,竟然是UNICODE格式的,原来在导出SQL脚本的时候,选项默认的是UNICODE格式。为了同时支持ANSI和UNICODE两种格式,在Codeproject站点上找到了CStdioFileEx类的代码,但在调试运行过程中发现,在生成UNICODE版本的执行文件时,运行没有错误,但在非UNICODE版本中却出现错误,原来在代码中此部分没有考虑文件读到末尾的情况,经修改,CStdioFileEx类就可以正常使用了,在读取文本文件时,自动识别ANSI和UNICODE两种格式。
// Header file:

// Unicode "byte order mark" which goes at start of file
#define nUNICODE_BOM 0xFEFF

// New line characters: carriage return followed by line feed
#define sNEWLINE _T("\r\n")

// Filler char used when no conversion from Unicode to local code page is possible
#define sDEFAULT_UNICODE_FILLER_CHAR "#"
class CStdioFileEx: public CStdioFile { public: CStdioFileEx(); CStdioFileEx( LPCTSTR lpszFileName, UINT nOpenFlags );
virtual BOOL Open( LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError = NULL ); virtual BOOL ReadString(CString& rString); virtual void WriteString( LPCTSTR lpsz ); bool IsFileUnicodeText() { return m_bIsUnicodeText; } unsigned long GetCharCount();
// Additional flag to allow Unicode text writing static const UINT modeWriteUnicode;
// static utility functions
// -------------------------------------------------------------------------------------------- // // CStdioFileEx::GetUnicodeStringFromMultiByteString() // // -------------------------------------------------------------------------------------------- // Returns: bool // Parameters: char * szMultiByteString (IN) Multi-byte input string // wchar_t* szUnicodeString (OUT) Unicode output string // short nUnicodeBufferSize (IN) Size of Unicode output buffer // UINT nCodePage (IN) Code page used to perform conversion // Default = -1 (Get local code page). // // Purpose: Gets a Unicode string from a MultiByte string. // Notes: None. // Exceptions: None. // static bool GetUnicodeStringFromMultiByteString(char * szMultiByteString,wchar_t* szUnicodeString, short nUnicodeBufferSize,UINT nCodePage=-1);
// -------------------------------------------------------------------------------------------- // // CStdioFileEx::GetMultiByteStringFromUnicodeString() // // -------------------------------------------------------------------------------------------- // Returns: BOOL // Parameters: wchar_t * szUnicodeString (IN) Unicode input string // char* szMultiByteString (OUT) Multibyte output string // short nMultiByteBufferSize (IN) Multibyte buffer size // UINT nCodePage (IN) Code page used to perform conversion // Default = -1 (Get local code page). // // Purpose: Gets a MultiByte string from a Unicode string. // Notes: . // Exceptions: None. // static BOOL GetMultiByteStringFromUnicodeString(wchar_t * szUnicodeString,char* szMultiByteString, short nMultiByteBufferSize,UINT nCodePage=-1);
// -------------------------------------------------------------------------------------------- // // CStdioFileEx::IsFileUnicode() // // -------------------------------------------------------------------------------------------- // Returns: bool // Parameters: const CString& sFilePath // // Purpose: Determines whether a file is Unicode by reading the first character and detecting // whether it's the Unicode byte marker. // Notes: None. // Exceptions: None. // static bool IsFileUnicode(const CString& sFilePath);
protected: UINT ProcessFlags(const CString& sFilePath, UINT& nOpenFlags);
bool m_bIsUnicodeText; UINT m_nFlags; };
实现文件如下: /*static*/ const UINT CStdioFileEx::modeWriteUnicode = 0x20000; // Add this flag to write in Unicode
// Default constructor: creates an object with no file attached.
// Both members are given defined values because WriteString() and
// ProcessFlags() read m_nFlags, which the original left uninitialized here.
CStdioFileEx::CStdioFileEx()
    : CStdioFile(),
      m_bIsUnicodeText(false),
      m_nFlags(0)
{
}
// Constructs the object and opens lpszFileName in one step.
// The open flags are run through ProcessFlags() before they reach the base
// class. m_bIsUnicodeText and m_nFlags are deliberately absent from the
// initializer list: ProcessFlags() assigns them while the base-class argument
// is evaluated, and a member initializer would run afterwards and overwrite
// those values.
CStdioFileEx::CStdioFileEx(LPCTSTR lpszFileName, UINT nOpenFlags)
    : CStdioFile(lpszFileName, ProcessFlags(lpszFileName, nOpenFlags))
{
    // nothing else to do
}
// Opens lpszFileName after the Unicode-related flag processing.
//   lpszFileName  path of the file to open
//   nOpenFlags    open flags, optionally including modeWriteUnicode
//   pError        optional exception object forwarded to CStdioFile::Open
// Returns the result of CStdioFile::Open.
BOOL CStdioFileEx::Open(LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError /*=NULL*/)
{
    // ProcessFlags() adjusts nOpenFlags in place and returns the adjusted
    // value; that value is what the base class is opened with.
    const UINT nEffectiveFlags = ProcessFlags(lpszFileName, nOpenFlags);

    return CStdioFile::Open(lpszFileName, nEffectiveFlags, pError);
}
// Reads one line of text into rString, without its line terminator.
//   rString  receives the line
// Returns TRUE when a line was read, FALSE at end of file or when no stream
// is open.
//
// In a non-Unicode build a Unicode file is read with fgetws() and converted
// to the local code page; every other combination is delegated to
// CStdioFile::ReadString.
BOOL CStdioFileEx::ReadString(CString& rString)
{
    const int nMAX_LINE_CHARS = 4096;
    BOOL bReadData = FALSE;

    // Nothing can be read without an open stream. The original fell through
    // to Read() and fgetws() with a null stream in this case.
    ASSERT(m_pStream != NULL);
    if (m_pStream == NULL)
    {
        return FALSE;
    }

    // If at position 0 of a Unicode file, discard the byte-order mark
    if (m_bIsUnicodeText && GetPosition() == 0)
    {
        wchar_t cDummy;
        Read(&cDummy, sizeof(wchar_t));
    }

#ifdef _UNICODE
    // Compiled for Unicode: the standard implementation is used for both
    // ANSI and Unicode files
    bReadData = CStdioFile::ReadString(rString);
#else
    if (!m_bIsUnicodeText)
    {
        // Read ANSI in ANSI
        bReadData = CStdioFile::ReadString(rString);
    }
    else
    {
        // Fixed-size local buffers: nothing to free on any path. The original
        // leaked both heap buffers at end of file and released them with
        // scalar delete. The multibyte buffer is twice as large because a
        // double-byte code page can need two bytes per wide character.
        wchar_t szUnicodeLine[nMAX_LINE_CHARS];
        char    szMultiByteLine[2 * nMAX_LINE_CHARS];

        // Read as Unicode, convert to ANSI
        if (fgetws(szUnicodeLine, nMAX_LINE_CHARS, m_pStream) != NULL)
        {
            bReadData = TRUE;

            if (GetMultiByteStringFromUnicodeString(szUnicodeLine, szMultiByteLine,
                                                    2 * nMAX_LINE_CHARS))
            {
                rString = szMultiByteLine;
            }
            else
            {
                // Do not hand back whatever the caller's string held before
                rString.Empty();
            }
        }
    }
#endif

    // Remove the end-of-line characters: "\r\n", or a single '\r' or '\n'
    if (bReadData)
    {
        int nLen = rString.GetLength();

        if (nLen > 0 && rString[nLen - 1] == _T('\n'))
        {
            --nLen;
        }
        if (nLen > 0 && rString[nLen - 1] == _T('\r'))
        {
            --nLen;
        }

        if (nLen != rString.GetLength())
        {
            rString = rString.Left(nLen);
        }
    }

    return bReadData;
}
// -------------------------------------------------------------------------------------------- // // CStdioFileEx::WriteString() // // -------------------------------------------------------------------------------------------- // Returns: void // Parameters: LPCTSTR lpsz // // Purpose: Writes string to file either in Unicode or multibyte, depending on whether the caller specified the // CStdioFileEx::modeWriteUnicode flag. Override of base class function. // Notes: If writing in Unicode we need to: // a) Write the Byte-order-mark at the beginning of the file // b) Write all strings in byte-mode // - If we were compiled in Unicode, we need to convert Unicode to multibyte if // we want to write in multibyte // - If we were compiled in multi-byte, we need to convert multibyte to Unicode if // we want to write in Unicode. // Exceptions: None. // void CStdioFileEx::WriteString(LPCTSTR lpsz) { // If writing Unicode and at the start of the file, need to write byte mark if (m_nFlags & CStdioFileEx::modeWriteUnicode) { // If at position 0, write byte-order mark before writing anything else if (!m_pStream || GetPosition() == 0) { wchar_t cBOM = (wchar_t)nUNICODE_BOM; CFile::Write(&cBOM, sizeof(wchar_t)); } }
// If compiled in Unicode... #ifdef _UNICODE
// If writing Unicode, no conversion needed if (m_nFlags & CStdioFileEx::modeWriteUnicode) { // Write in byte mode CFile::Write(lpsz, lstrlen(lpsz) * sizeof(wchar_t)); } // Else if we don't want to write Unicode, need to convert else { int nChars = lstrlen(lpsz) + 1; // Why plus 1? Because yes int nBufferSize = nChars * sizeof(char); wchar_t* pszUnicodeString = new wchar_t[nChars]; char * pszMultiByteString= new char[nChars];
// Copy string to Unicode buffer lstrcpy(pszUnicodeString, lpsz);
// Get multibyte string if (GetMultiByteStringFromUnicodeString(pszUnicodeString, pszMultiByteString, nBufferSize, GetACP())) { // Do standard write CFile::Write((const void*)pszMultiByteString, lstrlen(lpsz)); }
if (pszUnicodeString && pszMultiByteString) { delete [] pszUnicodeString; delete [] pszMultiByteString; } } // Else if *not* compiled in Unicode #else // If writing Unicode, need to convert if (m_nFlags & CStdioFileEx::modeWriteUnicode) { int nChars = lstrlen(lpsz) + 1; // Why plus 1? Because yes int nBufferSize = nChars * sizeof(wchar_t); wchar_t* pszUnicodeString = new wchar_t[nChars]; char * pszMultiByteString= new char[nChars];
// Copy string to multibyte buffer lstrcpy(pszMultiByteString, lpsz);
if (GetUnicodeStringFromMultiByteString(pszMultiByteString, pszUnicodeString, nBufferSize, GetACP())) { // Write in byte mode CFile::Write(pszUnicodeString, lstrlen(lpsz) * sizeof(wchar_t)); } else { ASSERT(false); }
if (pszUnicodeString && pszMultiByteString) { delete [] pszUnicodeString; delete [] pszMultiByteString; } } // Else if we don't want to write Unicode, no conversion needed else { // Do standard stuff CStdioFile::WriteString(lpsz); }
#endif }
// Examines the requested open flags, detects Unicode input and records the
// resulting flags in the object.
//   sFilePath   path of the file about to be opened
//   nOpenFlags  (IN/OUT) requested open flags; typeText is replaced by
//               typeBinary when the file turns out to be Unicode
// Returns the (possibly adjusted) flags, which are also stored in m_nFlags.
UINT CStdioFileEx::ProcessFlags(const CString& sFilePath, UINT& nOpenFlags)
{
    m_bIsUnicodeText = false;

    // If we have writeUnicode we must have write or writeRead as well
#ifdef _DEBUG
    if (nOpenFlags & CStdioFileEx::modeWriteUnicode)
    {
        ASSERT(nOpenFlags & CFile::modeWrite || nOpenFlags & CFile::modeReadWrite);
    }
#endif

    // The decision must be based on the flags of *this* open request. The
    // original tested m_nFlags here, which still holds the flags of a previous
    // open, or an indeterminate value on first use.
    const bool bTextMode = (nOpenFlags & CFile::typeText) != 0;
    const bool bCreating = (nOpenFlags & CFile::modeCreate) != 0;
    const bool bWriting  = (nOpenFlags & CFile::modeWrite) != 0;

    // If reading in text mode and not creating...
    if (bTextMode && !bCreating && !bWriting)
    {
        m_bIsUnicodeText = IsFileUnicode(sFilePath);

        // If it's Unicode, switch to binary mode
        if (m_bIsUnicodeText)
        {
            nOpenFlags &= ~static_cast<UINT>(CFile::typeText);
            nOpenFlags |= CFile::typeBinary;
        }
    }

    m_nFlags = nOpenFlags;

    return nOpenFlags;
}
// -------------------------------------------------------------------------------------------- // // CStdioFileEx::IsFileUnicode() // // -------------------------------------------------------------------------------------------- // Returns: bool // Parameters: const CString& sFilePath // // Purpose: Determines whether a file is Unicode by reading the first character and detecting // whether it's the Unicode byte marker. // Notes: None. // Exceptions: None. // /*static*/ bool CStdioFileEx::IsFileUnicode(const CString& sFilePath) { CFile file; bool bIsUnicode = false; wchar_t cFirstChar; CFileException exFile;
// Open file in binary mode and read first character if (file.Open(sFilePath, CFile::typeBinary | CFile::modeRead, &exFile)) { // If byte is Unicode byte-order marker, let's say it's Unicode if (file.Read(&cFirstChar, sizeof(wchar_t)) > 0 && cFirstChar == (wchar_t)nUNICODE_BOM) { bIsUnicode = true; }
file.Close(); } else { // Handle error here if you like }
return bIsUnicode; }
unsigned long CStdioFileEx::GetCharCount() { int nCharSize; unsigned long nByteCount, nCharCount = 0;
if (m_pStream) { // Get size of chars in file nCharSize = m_bIsUnicodeText ? sizeof(wchar_t): sizeof(char);
// If Unicode, remove byte order mark from count nByteCount = (unsigned long)GetLength(); if (m_bIsUnicodeText) { nByteCount = nByteCount - sizeof(wchar_t); }
// Calc chars nCharCount = (nByteCount / nCharSize); }
return nCharCount; }
// -------------------------------------------------------------------------------------------- // // CStdioFileEx::GetUnicodeStringFromMultiByteString() // // -------------------------------------------------------------------------------------------- // Returns: bool // Parameters: char * szMultiByteString (IN) Multi-byte input string // wchar_t* szUnicodeString (OUT) Unicode outputstring // short nUnicodeBufferSize (IN) Size of Unicode output buffer // UINT nCodePage (IN) Code page used to perform conversion // Default = -1 (Get local code page). // // Purpose: Gets a Unicode string from a MultiByte string. // Notes: None. // Exceptions: None. // bool CStdioFileEx::GetUnicodeStringFromMultiByteString(char * szMultiByteString, wchar_t* szUnicodeString, short nUnicodeBufferSize, UINT nCodePage) { bool bOK = true; int nReturn = 0; CString sErrorMsg; if (szUnicodeString && szMultiByteString) { // If no code page specified, take default for system if (nCodePage == -1) { nCodePage = GetACP(); }
try { nReturn = MultiByteToWideChar(nCodePage,MB_PRECOMPOSED,szMultiByteString,-1,szUnicodeString,nUnicodeBufferSize);
if (nReturn == 0) { bOK = false; } } catch(...) { bOK = false; } } else { bOK = false; }
ASSERT(bOK); return bOK; }
// -------------------------------------------------------------------------------------------- // // CStdioFileEx::GetMultiByteStringFromUnicodeString() // // -------------------------------------------------------------------------------------------- // Returns: BOOL // Parameters: wchar_t * szUnicodeString (IN) Unicode input string // char* szMultiByteString (OUT) Multibyte output string // short nMultiByteBufferSize (IN) Multibyte buffer size // UINT nCodePage (IN) Code page used to perform conversion // Default = -1 (Get local code page). // // Purpose: Gets a MultiByte string from a Unicode string // Notes: None. // Exceptions: None. // BOOL CStdioFileEx::GetMultiByteStringFromUnicodeString(wchar_t * szUnicodeString, char* szMultiByteString, short nMultiByteBufferSize, UINT nCodePage) { BOOL bUsedDefChar = FALSE; BOOL bGotIt = FALSE;
if (szUnicodeString && szMultiByteString) { // If no code page specified, take default for system if (nCodePage == -1) { nCodePage = GetACP(); }
try { bGotIt = WideCharToMultiByte(nCodePage, WC_COMPOSITECHECK | WC_SEPCHARS, szUnicodeString,-1, szMultiByteString, nMultiByteBufferSize, sDEFAULT_UNICODE_FILLER_CHAR, &bUsedDefChar); } catch(...) { TRACE(_T("Controlled exception in WideCharToMultiByte!/n")); } }
return bGotIt; }
本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/Augusdi/archive/2009/10/15/4677520.aspx
原文链接:http://blog.csdn.net/sunboy_2050/article/details/5019900
|