程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> C語言 >> C++ >> 關於C++ >> 支持UNICODE/UTF8/ANSI之間的轉換的類

支持UNICODE/UTF8/ANSI之間的轉換的類

編輯:關於C++

ZUtf8_16.h文件:

//---------------------------------------------------------------------------
#ifndef ZUtf8_16H
#define ZUtf8_16H
//---------------------------------------------------------------------------
/*
支持UNICODE,UNICODE BE,UTF8,ASCII之間的轉換的類。
日期:2007-06-15
版本:1.0
作者:小笨象
網站:http://www.9ele.com
郵箱:[email protected] //不要發垃圾郵件給我~~
說明:你可以隨意使用本文件,不過如果你修改了其中的BUG,
或者修改得更好了,請你也通知我一下,
讓我也能享受一下開源的好處,謝謝。
*/
// Text encodings recognised by ZUtf8_16.
// Enumerators are numbered 0..4 in declaration order.
enum EncodingType
{
    uni8Bit,      // 0: default, plain 8-bit (ASCII/ANSI) text
    uni16BE,      // 1: UTF-16 big endian
    uni16LE,      // 2: UTF-16 little endian, the Windows default "UNICODE"
    uniUTF8,      // 3: UTF-8 with the leading identification header (BOM)
    uniUTF8NOBOM  // 4: UTF-8 file without the identification header
};
// Detects the encoding of a text held in a TMemoryStream and converts between
// ANSI text and the encodings listed in EncodingType.
class ZUtf8_16
{
public:
    __fastcall ZUtf8_16();
    __fastcall ~ZUtf8_16();

    // Returns the encoding recorded by the most recent detection
    // (uni8Bit until a detection has run).
    EncodingType __fastcall GetEncodingType(void) { return m_unicodeMode; }

    // Detects the encoding of pStream's content and, for every encoding other
    // than uni8Bit, stores the text converted to ANSI in DestText.
    // For uni8Bit DestText is left untouched; the caller handles that case.
    bool __fastcall LoadFromStream(TMemoryStream *pStream, AnsiString &DestText);

    // Writes the ANSI content of pStream to FileNameA in the encoding given by
    // unicodeMode. Returns false when an exception was caught while saving.
    bool __fastcall StreamSaveToFile(TMemoryStream *pStream,
                                     AnsiString FileNameA, EncodingType unicodeMode);

private:
    // UTF-8 heuristic used for buffers that carry no byte order mark.
    int isUTF8_16(const char *s, unsigned int len, unsigned *cchUnused);

    // Inspects the leading bytes (and, failing that, the whole buffer) and
    // records the detected encoding in m_unicodeMode.
    EncodingType __fastcall DetermineEncoding(unsigned char *data, size_t iLen);

    EncodingType m_unicodeMode; // encoding found by the last detection
};
#endif

ZUtf8_16.cpp文件:

//---------------------------------------------------------------------------
#include <vcl.h>
#pragma hdrstop
#include <stdio.h>
#include "ZUtf8_16.h"
#pragma package(smart_init)
// A fresh converter reports uni8Bit until a stream has been examined.
__fastcall ZUtf8_16::ZUtf8_16()
    : m_unicodeMode(uni8Bit)
{
}
//---------------------------------------------------------------------------
// The class owns no resources, so there is nothing to release.
__fastcall ZUtf8_16::~ZUtf8_16() {}
//---------------------------------------------------------------------------
// Heuristically decides whether a buffer holds UTF-8 text limited to code
// points that fit in 16 bits (1- to 3-byte sequences).
//
//   s         - bytes to examine (need not be NUL-terminated)
//   len       - number of bytes in s
//   cchUnused - optional; receives the number of bytes that were not consumed
//               (non-zero when scanning stopped early, e.g. on a multi-byte
//               sequence cut off at the end of the buffer)
//
// Returns 1 when at least one multi-byte sequence was seen and nothing
// malformed was found. Returns 0 for pure 7-bit ASCII, for buffers containing
// a NUL byte, a stray continuation byte, an overlong encoding, or a sequence
// longer than three bytes.
int ZUtf8_16::isUTF8_16(const char *s, unsigned int len, unsigned *cchUnused)
{
    int rv = 1;          // stays 1 until something that is not valid UTF-8 is met
    int ASCII7only = 1;  // cleared by the first byte outside 0x01..0x7F
    const unsigned char *sx = reinterpret_cast<const unsigned char *>(s);
    const unsigned char *endx = sx + len;

    while(sx < endx)
    {
        const unsigned char lead = *sx;
        // Compare remaining byte counts instead of computing endx-1 / endx-2,
        // which would point before the buffer when len is very small.
        const unsigned int remaining = static_cast<unsigned int>(endx - sx);

        if(lead != 0 && lead < 0x80)
        {
            // 0nnnnnnn: plain ASCII character.
            sx++;
            continue;
        }

        ASCII7only = 0;

        if(lead == 0 || lead < 0xC0 || lead >= 0xF0)
        {
            // NUL is treated as "not UTF-8" for detection purposes,
            // 10nnnnnn cannot start a character, and
            // sequences of more than 16 bits are not allowed here.
            rv = 0;
            break;
        }

        if(lead < 0xE0)
        {
            // 110xxxvv 10nnnnnn: 11-bit character.
            if(remaining < 2) break;  // cut off at the end of the buffer
            // 0xC0 and 0xC1 could only encode U+0000..U+007F (overlong forms).
            if(lead < 0xC2 || (sx[1] & 0xC0) != 0x80) { rv = 0; break; }
            sx += 2;
        }
        else
        {
            // 1110qqqq 10xxxxvv 10nnnnnn: 16-bit character.
            if(remaining < 3) break;  // cut off at the end of the buffer
            if((sx[1] & 0xC0) != 0x80 || (sx[2] & 0xC0) != 0x80) { rv = 0; break; }
            // Lead byte 0xE0 is valid only when the second byte is 0xA0..0xBF
            // (U+0800..U+0FFF); a smaller second byte is an overlong form.
            if(lead == 0xE0 && sx[1] < 0xA0) { rv = 0; break; }
            sx += 3;
        }
    }

    if(cchUnused) *cchUnused = static_cast<unsigned>(endx - sx);
    return (ASCII7only ? 0 : rv);
}
//---------------------------------------------------------------------------
// Determines the encoding of a buffer and records it in m_unicodeMode.
//
//   data - start of the buffer
//   iLen - number of valid bytes in data
//
// A byte order mark decides the result when present (FE FF, FF FE, EF BB BF);
// otherwise the buffer is run through isUTF8_16(). Anything else, including
// an empty or NULL buffer, is reported as uni8Bit.
EncodingType __fastcall ZUtf8_16::DetermineEncoding(unsigned char *data, size_t iLen)
{
    m_unicodeMode = uni8Bit; // default: plain 8-bit text

    if(data == NULL || iLen == 0)
        return m_unicodeMode;

    // Each signature test checks the length first so that short buffers are
    // never read past their end.
    if(iLen >= 2 && data[0] == 0xFE && data[1] == 0xFF)
    {
        m_unicodeMode = uni16BE;       // UTF-16 big endian
    }
    else if(iLen >= 2 && data[0] == 0xFF && data[1] == 0xFE)
    {
        m_unicodeMode = uni16LE;       // UTF-16 little endian
    }
    else if(iLen >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
    {
        m_unicodeMode = uniUTF8;       // UTF-8 with BOM
    }
    else if(isUTF8_16(reinterpret_cast<const char *>(data),
                      static_cast<unsigned int>(iLen), NULL) == 1)
    {
        m_unicodeMode = uniUTF8NOBOM;  // looks like UTF-8 without BOM
    }
    return m_unicodeMode;
}
//---------------------------------------------------------------------------
// Detects the encoding of the stream content and converts it to ANSI text.
//
//   pSourceStream - stream holding the raw file content
//   DestText      - receives the converted text for every encoding except
//                   uni8Bit
//
// Always returns true. An empty stream returns immediately without touching
// DestText or the recorded encoding. For uni8Bit nothing is converted, so that
// large plain files open quickly; the caller must check GetEncodingType() and
// read the stream itself in that case. The stream position is left at 0 for
// uni8Bit. Exceptions raised while reading or converting propagate to the
// caller after the temporary buffer has been released.
bool __fastcall ZUtf8_16::LoadFromStream(TMemoryStream *pSourceStream, AnsiString &DestText)
{
    pSourceStream->Position = 0;
    if(pSourceStream->Size == 0) return true;

    // Determine the character encoding first.
    m_unicodeMode = DetermineEncoding((unsigned char *)pSourceStream->Memory,
                                      pSourceStream->Size);
    pSourceStream->Position = 0;

    if(m_unicodeMode == uni8Bit)
        return true;

    const bool bUtf16 = (m_unicodeMode == uni16BE || m_unicodeMode == uni16LE);

    // Number of signature bytes to skip in front of the text.
    int iBomLen = 0;
    if(bUtf16)
        iBomLen = 2;
    else if(m_unicodeMode == uniUTF8)
        iBomLen = 3;

    int iLength = pSourceStream->Size - iBomLen;
    if(iLength < 0) iLength = 0;

    // Two trailing zero bytes terminate the buffer both as a narrow and as a
    // wide string.
    char *pBuffer = new char[iLength + 2];
    try
    {
        memset(pBuffer, 0x00, iLength + 2);
        pSourceStream->Position = iBomLen;
        pSourceStream->Read(pBuffer, iLength);

        if(bUtf16)
        {
            if(m_unicodeMode == uni16BE)
            {
                // Swapping each byte pair turns big endian into little endian.
                for(int i = 0; i + 1 < iLength; i += 2)
                {
                    const char temp = pBuffer[i];
                    pBuffer[i] = pBuffer[i + 1];
                    pBuffer[i + 1] = temp;
                    // Keep the UI responsive on large files without pumping
                    // the message queue for every single character.
                    if((i & 0xFFFF) == 0)
                        Application->ProcessMessages();
                }
            }
            // Both byte orders are converted by explicit length, so an odd
            // trailing byte is ignored and no terminator is required.
            DestText = WideCharLenToString((wchar_t *)pBuffer, iLength / 2);
        }
        else
        {
            // UTF-8, with or without the signature.
            AnsiString Utf8Str = Utf8ToAnsi(pBuffer);
            if(Utf8Str == "")
            {
                // Conversion produced nothing: fall back to the raw bytes.
                // The NUL-terminated copy is used because the stream's own
                // memory block is not guaranteed to be terminated.
                DestText = AnsiString(pBuffer);
            }
            else
            {
                DestText = Utf8Str;
            }
        }
    }
    catch(...)
    {
        delete []pBuffer;
        throw;
    }
    delete []pBuffer;
    return true;
}
//---------------------------------------------------------------------------
// Creates (or truncates) szFileName and writes cbBom bytes of pBom followed by
// cbData bytes of pData. Returns false when the file cannot be opened or a
// write or the final close fails.
static bool ZUtf8_16_WriteFile(const char *szFileName,
                               const unsigned char *pBom, size_t cbBom,
                               const void *pData, size_t cbData)
{
    FILE *f = fopen(szFileName, "wb");
    if(f == NULL)
        return false;

    bool bOk = true;
    if(cbBom > 0)
        bOk = (fwrite(pBom, 1, cbBom, f) == cbBom);
    if(bOk && cbData > 0)
        bOk = (fwrite(pData, 1, cbData, f) == cbData);
    if(fclose(f) != 0)
        bOk = false;
    return bOk;
}

// Saves the ANSI content of a stream to a file in the requested encoding.
//
//   pStream     - stream holding ANSI (system code page) text
//   FileNameA   - destination file name
//   unicodeMode - target encoding; uni8Bit writes the stream unchanged
//
// Returns true when the file was written completely. Returns false when the
// file cannot be created or written, when the code page conversion fails,
// when unicodeMode is not a known encoding, or when an exception is caught.
// The whole stream content is converted, independent of any NUL bytes in it.
bool __fastcall ZUtf8_16::StreamSaveToFile(TMemoryStream *pStream,
    AnsiString FileNameA, EncodingType unicodeMode)
{
    static const unsigned char bomUtf16BE[] = { 0xFE, 0xFF };
    static const unsigned char bomUtf16LE[] = { 0xFF, 0xFE };
    static const unsigned char bomUtf8[]    = { 0xEF, 0xBB, 0xBF };

    // Declared outside the try block so they are released on every path.
    char  *pSource = NULL;
    WCHAR *pWide   = NULL;
    bool bSaved = false;

    try
    {
        pStream->Position = 0;
        const int iLength = pStream->Size;

        switch(unicodeMode)
        {
        case uni8Bit:
            // Nothing to convert: save the stream as it is.
            pStream->SaveToFile(FileNameA);
            bSaved = true;
            break;

        case uni16BE:
        case uni16LE:
        {
            int nWide = 0; // number of UTF-16 code units produced
            if(iLength > 0)
            {
                pSource = new char[iLength];
                pStream->Read(pSource, iLength);

                // First call only measures. Both calls pass the explicit byte
                // count so that the measured size matches the converted size
                // and no terminator has to fit into the buffer.
                nWide = MultiByteToWideChar(CP_ACP, 0, pSource, iLength, NULL, 0);
                if(nWide <= 0)
                    break; // conversion failed, bSaved stays false
                pWide = new WCHAR[nWide];
                if(MultiByteToWideChar(CP_ACP, 0, pSource, iLength, pWide, nWide) != nWide)
                    break;
            }

            const int cbWide = nWide * sizeof(WCHAR);
            if(unicodeMode == uni16BE)
            {
                // Swapping each byte pair turns little endian into big endian.
                unsigned char *pBytes = reinterpret_cast<unsigned char *>(pWide);
                for(int i = 0; i < cbWide; i += 2)
                {
                    const unsigned char temp = pBytes[i];
                    pBytes[i] = pBytes[i + 1];
                    pBytes[i + 1] = temp;
                    // Keep the UI responsive on large files without pumping
                    // the message queue for every single character.
                    if((i & 0xFFFF) == 0)
                        Application->ProcessMessages();
                }
            }

            bSaved = ZUtf8_16_WriteFile(FileNameA.c_str(),
                                        (unicodeMode == uni16BE) ? bomUtf16BE : bomUtf16LE,
                                        2, pWide, cbWide);
            break;
        }

        case uniUTF8:
        case uniUTF8NOBOM:
        {
            // NUL-terminated copy of the stream for the ANSI -> UTF-8 conversion.
            pSource = new char[iLength + 1];
            memset(pSource, 0x00, iLength + 1);
            pStream->Read(pSource, iLength);

            AnsiString Utf8Str = AnsiToUtf8(pSource);

            // Only uniUTF8 gets the three-byte signature in front of the text.
            bSaved = ZUtf8_16_WriteFile(FileNameA.c_str(), bomUtf8,
                                        (unicodeMode == uniUTF8) ? sizeof(bomUtf8) : 0,
                                        Utf8Str.c_str(), Utf8Str.Length());
            break;
        }

        default:
            // Unknown encoding: nothing is written.
            break;
        } // end of switch
    }
    catch(...)
    {
        bSaved = false;
    }

    delete []pWide;
    delete []pSource;
    return bSaved;
}
//---------------------------------------------------------------------------
// 試用舉例:
#include "ZUtf8_16.h"
// Usage example: loads a text file of any supported encoding into a string list.
//
//   strFileName - file to load
//   pList       - receives the file content as ANSI text
//
// Returns true when the file was loaded. When the file does not exist, cannot
// be read (for example because another program holds it open exclusively) or
// cannot be converted, an error box is shown and false is returned.
bool __fastcall LoadFile(AnsiString strFileName, TStrings *pList)
{
    AnsiString ErrMsg;
    // %s needs a C string, so pass c_str() rather than the AnsiString object.
    ErrMsg.sprintf("裝入 %s 文檔時出錯,\n\n該文檔不存在"
                   "或者被其它程序以獨占方式打開!", strFileName.c_str());

    if(!FileExists(strFileName))
    {
        MessageBox(0, ErrMsg.c_str(), "錯誤", MB_OK|MB_ICONERROR);
        return false;
    }

    bool bReturn = false;
    TMemoryStream *ReadStream = new TMemoryStream();
    try
    {
        AnsiString ReturnTxt;
        ZUtf8_16 zutf8_16;

        ReadStream->LoadFromFile(strFileName);
        bReturn = zutf8_16.LoadFromStream(ReadStream, ReturnTxt);
        if(bReturn)
        {
            // Plain 8-bit text is not converted by LoadFromStream, so the list
            // reads it straight from the stream.
            if(zutf8_16.GetEncodingType() == uni8Bit)
                pList->LoadFromStream(ReadStream);
            else
                pList->Text = ReturnTxt;
        }
    }
    catch(...)
    {
        // Covers a file that exists but cannot be opened or read; it is
        // reported with the same message as every other load failure.
        bReturn = false;
    }
    delete ReadStream;
    ReadStream = NULL;

    if(!bReturn)
        MessageBox(0, ErrMsg.c_str(), "錯誤", MB_OK|MB_ICONERROR);
    return bReturn;
}
  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved