場景:
1. 分析數據時, 獲取到的數據是字符串, 但它有可能不是正確、完整的 utf8 字符串, 打印出來或輸出到文件時就會顯示為亂碼.
這時候就需要過濾掉非法字節, 使 utf8 字符串能正確顯示, 比如把每個非法字節替換為 #
代碼:
1. 這個函數的特性是逐個字節進行判斷, 適用於任意長度、任意構造的 (可能無效的) utf8 字符串.
// Returns the length in bytes (1-4) of the well-formed, displayable UTF-8
// sequence that starts at p, or 0 if no such sequence starts there.
//
// `remaining` is the number of readable bytes starting at p and must be >= 1.
// The accepted ranges follow the Unicode "well-formed UTF-8 byte sequences"
// table: overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code
// points above U+10FFFF are rejected. For single bytes only TAB, LF, CR and
// printable ASCII (0x20..0x7E) are accepted.
static int FilterUtf8SequenceLength(const unsigned char * p, int remaining)
{
    const unsigned char lead = p[0];

    // Single-byte (ASCII) range: keep only displayable characters.
    if (lead <= 0x7F) {
        const bool displayable = lead == 0x09 || lead == 0x0A || lead == 0x0D ||
                                 (0x20 <= lead && lead <= 0x7E);
        return displayable ? 1 : 0;
    }

    // Number of continuation bytes required, and the allowed range for the
    // FIRST continuation byte (the later ones are always 0x80..0xBF).
    int continuation_count = 0;
    unsigned char first_low = 0x80;
    unsigned char first_high = 0xBF;

    if (0xC2 <= lead && lead <= 0xDF) {
        // 0xC0 / 0xC1 would be overlong encodings of ASCII, hence start at 0xC2.
        continuation_count = 1;
    } else if (0xE0 <= lead && lead <= 0xEF) {
        continuation_count = 2;
        if (lead == 0xE0) {
            first_low = 0xA0;   // exclude overlong 3-byte forms
        } else if (lead == 0xED) {
            first_high = 0x9F;  // exclude surrogates
        }
    } else if (0xF0 <= lead && lead <= 0xF4) {
        continuation_count = 3;
        if (lead == 0xF0) {
            first_low = 0x90;   // exclude overlong 4-byte forms
        } else if (lead == 0xF4) {
            first_high = 0x8F;  // exclude code points above U+10FFFF
        }
    } else {
        // Stray continuation byte (0x80..0xBF) or a byte that can never
        // appear in UTF-8 (0xC0, 0xC1, 0xF5..0xFF).
        return 0;
    }

    // Sequence is cut off by the end of the buffer.
    if (remaining <= continuation_count) {
        return 0;
    }

    if (p[1] < first_low || p[1] > first_high) {
        return 0;
    }
    for (int i = 2; i <= continuation_count; ++i) {
        if ((p[i] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return continuation_count + 1;
}

// Sanitizes a byte buffer in place so that it becomes valid, displayable
// UTF-8: every byte that is not part of a well-formed sequence is overwritten
// with '#'. The buffer length never changes (one '#' per offending byte).
//
// Bytes replaced:
//   - ASCII control characters other than TAB (0x09), LF (0x0A), CR (0x0D),
//     including NUL and DEL (0x7F);
//   - stray continuation bytes;
//   - every byte of an incomplete / truncated multi-byte sequence;
//   - overlong encodings, encoded surrogates and code points > U+10FFFF.
//
// @param string  buffer to sanitize (modified in place); may be null.
// @param length  number of bytes in the buffer.
// @return false if `string` is null or `length` is negative (buffer is not
//         touched), true otherwise.
bool IREUtil::FilterUtf8(unsigned char * string,int length)
{
    if (!string || length < 0) {
        return false;
    }

    unsigned char * cursor = string;
    unsigned char * const end = string + length;

    while (cursor < end) {
        const int sequence_length =
            FilterUtf8SequenceLength(cursor, static_cast<int>(end - cursor));
        if (sequence_length > 0) {
            // Well-formed sequence: keep it untouched and skip over it.
            cursor += sequence_length;
        } else {
            // Offending byte: mask it and re-examine the next byte on its own,
            // so a valid sequence starting right after it is still preserved.
            *cursor = '#';
            ++cursor;
        }
    }
    return true;
}
2. 判斷字符串是否為合法 utf8 的函數 (原文地址沒有留下):
// Checks whether a byte buffer consists solely of well-formed, displayable
// UTF-8 sequences.
//
// Accepted single bytes are TAB (0x09), LF (0x0A), CR (0x0D) and printable
// ASCII (0x20..0x7E); all other ASCII control characters make the buffer
// invalid. Multi-byte sequences must be shortest-form, must not encode
// surrogates (U+D800..U+DFFF) and must not exceed U+10FFFF.
//
// @param string  buffer to check; may be null.
// @param length  number of bytes in the buffer.
// @return true if every byte belongs to an accepted sequence (an empty buffer
//         is valid); false if `string` is null, `length` is negative, or any
//         invalid / truncated sequence is found.
bool IREUtil::is_utf8(const unsigned char * string,int length)
{
    if (!string || length < 0) {
        return false;
    }

    const unsigned char * bytes = string;
    const unsigned char * const end = bytes + length;

    while (bytes < end) {
        // Number of bytes that may legally be read starting at `bytes`.
        // Every multi-byte branch checks it first so that short-circuit
        // evaluation prevents reads past the end of the buffer.
        const int remaining = static_cast<int>(end - bytes);
        const unsigned char b0 = bytes[0];

        // ASCII (change to `b0 <= 0x7F` to allow all ASCII control characters)
        if (b0 == 0x09 || b0 == 0x0A || b0 == 0x0D ||
            (0x20 <= b0 && b0 <= 0x7E)) {
            bytes += 1;
            continue;
        }

        // non-overlong 2-byte
        if (remaining >= 2 &&
            (0xC2 <= b0 && b0 <= 0xDF) &&
            (0x80 <= bytes[1] && bytes[1] <= 0xBF)) {
            bytes += 2;
            continue;
        }

        // 3-byte
        if (remaining >= 3 &&
            (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
            (
                // excluding overlongs
                (b0 == 0xE0 &&
                 (0xA0 <= bytes[1] && bytes[1] <= 0xBF)) ||
                // straight 3-byte
                (((0xE1 <= b0 && b0 <= 0xEC) || b0 == 0xEE || b0 == 0xEF) &&
                 (0x80 <= bytes[1] && bytes[1] <= 0xBF)) ||
                // excluding surrogates
                (b0 == 0xED &&
                 (0x80 <= bytes[1] && bytes[1] <= 0x9F))
            )) {
            bytes += 3;
            continue;
        }

        // 4-byte
        if (remaining >= 4 &&
            (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
            (0x80 <= bytes[3] && bytes[3] <= 0xBF) &&
            (
                // planes 1-3
                (b0 == 0xF0 &&
                 (0x90 <= bytes[1] && bytes[1] <= 0xBF)) ||
                // planes 4-15
                ((0xF1 <= b0 && b0 <= 0xF3) &&
                 (0x80 <= bytes[1] && bytes[1] <= 0xBF)) ||
                // plane 16
                (b0 == 0xF4 &&
                 (0x80 <= bytes[1] && bytes[1] <= 0x8F))
            )) {
            bytes += 4;
            continue;
        }

        // Invalid lead byte, bad continuation byte, or sequence truncated by
        // the end of the buffer.
        return false;
    }
    return true;
}
歡迎指正!