2008年(884)
分类: C/C++
2008-08-06 10:04:37
UCS-4 范围(16进制)            UTF-8 序列(二进制)
0000 0000 <-> 0000 007F      0xxxxxxx
0000 0080 <-> 0000 07FF      110xxxxx 10xxxxxx
0000 0800 <-> 0000 FFFF      1110xxxx 10xxxxxx 10xxxxxx
0001 0000 <-> 001F FFFF      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000 <-> 03FF FFFF      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000 <-> 7FFF FFFF      1111110x 10xxxxxx ... 10xxxxxx
// One UCS-2 code unit (16 bits), i.e. a code point in the range U+0000..U+FFFF.
typedef unsigned short WCHAR;

// Encodes a single UCS-2 code unit as UTF-8.
//
// ucs2   - the 16-bit value to encode.
// buffer - output buffer; it must have room for at least 4 bytes, because the
//          first 4 bytes are cleared before encoding so that the 1..3 encoded
//          bytes are always followed by a zero byte.
//
// Returns the number of bytes written (1, 2 or 3). A 16-bit input never needs
// more than 3 bytes in UTF-8.
//
// Surrogate code units (0xD800..0xDFFF) are encoded like any other value; no
// pairing is attempted because only one code unit is visible per call.
int UnicodeToUTF8(WCHAR ucs2, unsigned char *buffer)
{
    memset(buffer, 0, 4);

    if (ucs2 <= 0x007f)     // 0xxxxxxx
    {
        buffer[0] = static_cast<unsigned char>(ucs2);
        return 1;
    }

    if (ucs2 <= 0x07ff)     // 110xxxxx 10xxxxxx
    {
        buffer[0] = static_cast<unsigned char>(0xc0 | (ucs2 >> 6));
        buffer[1] = static_cast<unsigned char>(0x80 | (ucs2 & 0x3f));
        return 2;
    }

    // 0x0800..0xffff: 1110xxxx 10xxxxxx 10xxxxxx
    buffer[0] = static_cast<unsigned char>(0xe0 | (ucs2 >> 12));
    buffer[1] = static_cast<unsigned char>(0x80 | ((ucs2 >> 6) & 0x3f));
    buffer[2] = static_cast<unsigned char>(0x80 | (ucs2 & 0x3f));
    return 3;
}
// Decodes the UTF-8 sequence that starts at buffer[0] into one UCS-2 code unit.
//
// buffer - points at the first byte of a UTF-8 sequence; for a 2- or 3-byte
//          sequence the following 1 or 2 bytes are read as well.
//
// Returns:
//   - the decoded value for a well-formed 1-, 2- or 3-byte sequence;
//   - 0xfeff when buffer[0] is a continuation byte (10xxxxxx) or when an
//     expected continuation byte is missing;
//   - 0 when buffer[0] starts a sequence longer than 3 bytes, which cannot be
//     represented in 16 bits.
//
// Trail bytes are checked in order and the check short-circuits, so a sequence
// cut short by a zero terminator is rejected without reading past the terminator.
WCHAR UTF8ToUnicode(unsigned char *buffer)
{
    const WCHAR kInvalid = 0xfeff;      // sentinel: not a decodable lead byte
    const unsigned char lead = buffer[0];

    if (lead < 0x80)                    // 0xxxxxxx
        return lead;

    if (lead < 0xc0)                    // 10xxxxxx: not the first byte of a character
        return kInvalid;

    if (lead < 0xe0)                    // 110xxxxx 10xxxxxx
    {
        if ((buffer[1] & 0xc0) != 0x80)
            return kInvalid;
        return static_cast<WCHAR>(((lead & 0x1f) << 6) | (buffer[1] & 0x3f));
    }

    if (lead < 0xf0)                    // 1110xxxx 10xxxxxx 10xxxxxx
    {
        if ((buffer[1] & 0xc0) != 0x80 || (buffer[2] & 0xc0) != 0x80)
            return kInvalid;
        return static_cast<WCHAR>(((lead & 0x0f) << 12)
                                  | ((buffer[1] & 0x3f) << 6)
                                  | (buffer[2] & 0x3f));
    }

    return 0;                           // more than 3 bytes: out of UCS-2 range
}
// Raw 8-bit value used by the UTF-7 tables and converters.
typedef unsigned char byte;

// The 64 characters of the base64 alphabet; a character's index in this array
// is its 6-bit value ('A' = 0 ... '+' = 62, '/' = 63).
byte base64Chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// 8 further characters that, like the base64 characters, pass through mail
// gateways unchanged and may therefore be written directly.
byte safeChars[] = "'(),-.:?";

// The 4 characters that all count as white space.
byte spaceChars[] = " \t\n\r";
// mask value defined for indentify the type of a byte #define BASE64 0x01 #define SAFE 0x02 #define SPACE 0x04 byte byteType[256]; // hash table used for find the type of a byte bool firstTime = true; // the first time to use the lib, wait for init the table // 注:为了解码base64编码部分的字符,需要一个哈希表,对一个base64字符都可以直接得到0-64之间的一个数: byte base64Value[128]; 这两个哈希表在使用前要初始化: void initUTF7Tables() { byte *s; if(!firstTime) return; // not necessary, but should do it to be robust memset(byteType, 0, 256); memset(base64Value, 0, 128); for(s=base64Chars; *s!=''\0''; s ) { byteType[*s] |= BASE64; base64Value[*s] = s - base64Chars; // the offset, it is a 6bits value,0-64 } for(s=safeChars; *s!=''\0''; s ) byteType[*s] |= SAFE; for(s=spaceChars; *s!=''\0''; s ) byteType[*s] |= SPACE; firstTime = false; }UTF-7编码转换时候,是与当前字符是与状态有关的,也就是说:
// Converter state for the current character.
#define IN_ASCII   0    // characters are being written/read directly
#define IN_BASE64  1    // inside a base64-coded run
#define AFTER_PLUS 2    // decoder only: the previous byte was the '+' that opens a run

// Encoding with rule 2 relies on base64, which needs the global helper
// variables declared next.
int state;                  // state in which the converter is working
int nbits;                  // number of bits currently held in bitBuffer
unsigned long bitBuffer;    // bit accumulator used for base64 coding

// Converting one Unicode character into a UTF-7 sequence: the function returns
// the number of bytes written to the buffer and modifies the three globals
// state, nbits and bitBuffer. A simple helper is implemented first; it converts
// one Unicode character, writes the result into the supplied buffer and returns
// the number of bytes written. Before the first character of a Unicode array is
// encoded, the three globals state, nbits and bitBuffer must be initialised:
// Before encoding the first character of a string the caller resets the
// converter (these are statements for the caller, not file-scope code):
//
//     state = IN_ASCII; nbits = 0; bitBuffer = 0;

// Encodes one UCS-2 code unit as UTF-7 and appends the result to buffer.
//
// ucs2   - the code unit to encode.
// buffer - output buffer; a single call writes at most 4 bytes.
//
// Returns the number of bytes written. Reads and updates the globals state,
// nbits and bitBuffer, and looks bytes up in byteType[], so initUTF7Tables()
// must have been called first.
//
// A 7-bit value found in the BASE64, SAFE or SPACE sets is written directly
// (with '+' written as "+-"); everything else is base64-coded in a run opened
// by '+'. Bits still pending when the input ends are only flushed by a later
// call that writes a direct character.
int UnicodeToUTF7(WCHAR ucs2, byte *buffer)
{
    byte *out = buffer;
    int index;

    // 7-bit value that belongs to one of the directly representable sets?
    if (((ucs2 & 0xff80) == 0) && (byteType[(byte)ucs2] & (BASE64 | SAFE | SPACE)))
    {
        const byte temp = (byte)ucs2;

        if (state == IN_BASE64)     // leave the base64 run here
        {
            if (nbits > 0)          // flush leftover bits, zero-padded to 6
            {
                index = (bitBuffer << (6 - nbits)) & 0x3f;
                *out++ = base64Chars[index];
            }
            // An explicit '-' is required when the next character would
            // otherwise be read as part of the run (or is itself '-').
            if ((byteType[temp] & BASE64) || (temp == '-'))
                *out++ = '-';
            state = IN_ASCII;
        }

        *out++ = temp;
        if (temp == '+')            // a literal '+' is escaped as "+-"
            *out++ = '-';
    }
    else
    {
        if (state == IN_ASCII)      // open a base64 run
        {
            *out++ = '+';
            state = IN_BASE64;
            nbits = 0;
            bitBuffer = 0;
        }

        bitBuffer <<= 16;
        bitBuffer |= ucs2;
        nbits += 16;

        while (nbits >= 6)          // emit the high 6 bits while available
        {
            nbits -= 6;
            index = (bitBuffer >> nbits) & 0x3f;
            *out++ = base64Chars[index];
        }
    }

    return static_cast<int>(out - buffer);
}
state = IN_ASCII; nbits = 0; bitBuffer = 0; #define RET0 0xfeff WCHAR UTF7ToUnicode(byte c) { if(state == IN_ASCII) { if (c == '' '') { state = AFTER_PLUS; return RET0; } else return (WCHAR)c; } if (state == AFTER_PLUS) { if (c == ''-'') { return (WCHAR)'' ''; } else { state = IN_BASE64; nbits = 0; bitBuffer = 0; // it is not necessary // don''t return yet, continue to the IN_BASE64 mode } } // state == Base64 if (byteType[c] & BASE64) { bitBuffer <<= 6; bitBuffer |= base64Value[c]; nbits = 6; if (nbits >= 16) { nbits -= 16; return (WCHAR)((bitBuffer >> nbits) & 0x0000ffff); } return RET0; } // encount a byte which is not in base64 character set, switch out of base64 coding state = IN_ASCII; if (c != ''-'') { return (WCHAR)c; } return RET0; }说明:对于一个UTF-7序列,可以通过连续输入字节并调用上面的函数,判断返回值,得到一个Unicode字符数组。
// Converts a null-terminated wide string to the system ANSI code page (CP_ACP).
// NOTE(review): the name says GB2312, but the result is GB2312/GBK only when
// that is the active ANSI code page -- confirm this is intended.
//
// dest - in/out: *dest must be NULL or a buffer obtained from new char[]; it is
//        released and replaced by a newly allocated, null-terminated buffer
//        that the caller frees with delete[].
// src  - null-terminated wide string to convert.
//
// Returns the value reported by WideCharToMultiByte (bytes written, terminator
// included), or 0 on failure. On failure *dest is set to an empty string;
// when dest or src is NULL nothing is modified.
int StringEncode::UnicodeToGB2312(char **dest, const WCHAR *src)
{
    if (dest == NULL || src == NULL)
        return 0;

    // Sizing call: with cchWideChar == -1 the result includes the terminator.
    const int size = ::WideCharToMultiByte(CP_ACP, 0, src, -1, NULL, 0, NULL, NULL);

    char *buffer = new char[size > 0 ? size : 1];
    int ret = 0;
    if (size > 0)
    {
        // Pass the real capacity of the buffer, not one more than was allocated.
        ret = ::WideCharToMultiByte(CP_ACP, 0, src, -1, buffer, size, NULL, NULL);
    }
    if (ret == 0)
        buffer[0] = '\0';   // keep *dest a valid string even when conversion fails

    delete[] *dest;         // array form matches new char[]; a no-op for NULL
    *dest = buffer;
    return ret;
}
// Converts a null-terminated string in the system ANSI code page (CP_ACP) to a
// wide string.
// NOTE(review): the name says Gb2312, but the input is interpreted as
// GB2312/GBK only when that is the active ANSI code page -- confirm.
//
// dest - in/out: *dest must be NULL or a buffer obtained from new WCHAR[]; it
//        is released and replaced by a newly allocated, null-terminated buffer
//        that the caller frees with delete[].
// src  - null-terminated byte string to convert.
//
// Returns the number of wide characters written, terminator excluded (0 for
// empty input or on failure). When dest or src is NULL nothing is modified.
int StringEncode::Gb2312ToUnicode(WCHAR **dest, const char *src)
{
    if (dest == NULL || src == NULL)
        return 0;

    const int length = static_cast<int>(strlen(src));

    // Sized from the input byte count (the conversion is expected to yield no
    // more wide characters than there are input bytes), plus the terminator.
    WCHAR *buffer = new WCHAR[length + 1];

    int ret = 0;
    if (length > 0)
    {
        ret = ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, src, length, buffer, length);
    }
    buffer[ret] = 0;

    delete[] *dest;         // array form matches new WCHAR[]; a no-op for NULL
    *dest = buffer;
    return ret;
}
// NOTE(review): the line below is damaged and is kept exactly as found.
//   * Everything between "for (int i=0; i" and "= ''0'')" is missing, so the
//     loop body of CTestUrlDlg::UrlToString and the header of the helper that
//     follows it cannot be recovered from this text; restore them from the
//     original article before using this code.
//   * Single quotes appear doubled ('') and some '+' characters appear to have
//     been dropped: "c - ''a'' 10" is presumably "c - 'a' + 10" and
//     "str = buffer" is presumably "str += buffer" -- confirm.
//
// What is visible:
//   * UrlToString(url): takes the length of url, lower-cases it and starts a
//     loop over i; the rest of the body is lost.
//   * a helper (name lost): yields a value d computed from c for '0'-'9',
//     'a'-'f' and 'A'-'F', and 0 for any other character.
//   * static UnicodeToGB2312(unicode, buffer): calls WideCharToMultiByte with
//     CP_ACP, writing into buffer with a stated capacity of 3 bytes.
//     NOTE(review): cchWideChar is -1 although &unicode points at a single
//     WCHAR with no terminator after it; this looks like it should be 1 -- confirm.
//   * Uft8ToGB(url): loops while t < n, decoding with UTF8ToUnicode(p, t),
//     converting with UnicodeToGB2312, terminating buffer at index 2 and
//     assigning it to str.
//     NOTE(review): t is not changed anywhere in the visible loop, so unless the
//     two-argument UTF8ToUnicode (not shown here) advances it by reference the
//     loop never ends -- confirm.
CString CTestUrlDlg::UrlToString(CString url) { CString str = ""; int n = url.GetLength(); url.MakeLower(); BYTE a, b1, b2; for (int i=0; i= ''0'') && (c <= ''9'')) d = c - ''0''; else if ((c >= ''a'') && (c <= ''f'')) { d = c - ''a'' 10; } else if ((c >= ''A'') && (c <= ''F'')) { d = c - ''A'' 10; } else d = 0; return d; } static void UnicodeToGB2312(const WCHAR unicode, char* buffer) { // int size = ::WideCharToMultiByte(CP_ACP, 0, unicode, -1, NULL, 0, NULL, NULL); int ret = ::WideCharToMultiByte(CP_ACP, NULL, &unicode, -1, buffer, 3, NULL, NULL); } CString CTestUrlDlg::Uft8ToGB(CString url) { CString str = ""; char buffer[3]; WCHAR unicode; unsigned char * p = (unsigned char *)(LPCTSTR)url; int n = url.GetLength(); int t = 0; while (t < n) { unicode = UTF8ToUnicode(p, t); UnicodeToGB2312(unicode, buffer); buffer[2] = 0; str = buffer; } return str; }
// Usage example: undo the URL escaping first, then convert the resulting
// UTF-8 bytes to the ANSI code page.
// NOTE(review): the literal looks like UTF-8 bytes displayed in a GBK code
// page; the article probably showed a percent-escaped URL here -- confirm.
CString str("/MFC鑻辨枃鎵嬪唽.chm");
CString ret(UrlToString(str));
ret = Uft8ToGB(ret);    // expected result: MFC英文手册.chm