分类: 系统运维
2008-04-19 15:23:19
UTF-8 C++ 程序编码示例:
下面是四个C++函数,他们分别实现2字节和4字节UNICODE和UTF-8之间的转换。
#define MASKBITS 0x3F
#define MASKBYTE 0x80
#define MASK2BYTES 0xC0
#define MASK3BYTES 0xE0
#define MASK4BYTES 0xF0
#define MASK5BYTES 0xF8
#define MASK6BYTES 0xFC
typedef unsigned short Unicode2Bytes;
typedef unsigned int Unicode4Bytes;
void UTF8Encode2BytesUnicode(std::vector< Unicode2Bytes > input,
std::vector< byte >& output)
{
for(int i=0; i < input.size(); i++)
{
// 0xxxxxxx
if(input < 0x80)
{
output.push_back((byte)input);
}
// 110xxxxx 10xxxxxx
else if(input < 0x800)
{
output.push_back((byte)(MASK2BYTES | input >> 6));
output.push_back((byte)(MASKBYTE | input & MASKBITS));
}
// 1110xxxx 10xxxxxx 10xxxxxx
else if(input < 0x10000)
{
output.push_back((byte)(MASK3BYTES | input >> 12));
output.push_back((byte)(MASKBYTE | input >> 6 & MASKBITS));
output.push_back((byte)(MASKBYTE | input & MASKBITS));
}
}
}
void UTF8Decode2BytesUnicode(std::vector< byte > input,
std::vector< Unicode2Bytes >& output)
{
for(int i=0; i < input.size();)
{
Unicode2Bytes ch;
// 1110xxxx 10xxxxxx 10xxxxxx
if((input & MASK3BYTES) == MASK3BYTES)
{
ch = ((input & 0x0F) << 12) | (
(input[i+1] & MASKBITS) << 6)
| (input[i+2] & MASKBITS);
i += 3;
}
// 110xxxxx 10xxxxxx
else if((input & MASK2BYTES) == MASK2BYTES)
{
ch = ((input & 0x1F) << 6) | (input[i+1] & MASKBITS);
i += 2;
}
// 0xxxxxxx
else if(input < MASKBYTE)
{
ch = input;
i += 1;
}
output.push_back(ch);
}
}
void UTF8Encode4BytesUnicode(std::vector< Unicode4Bytes > input,
std::vector< byte >& output)
{
for(int i=0; i < input.size(); i++)
{
// 0xxxxxxx
if(input < 0x80)
{
output.push_back((byte)input);
}
// 110xxxxx 10xxxxxx
else if(input < 0x800)
{
output.push_back((byte)(MASK2BYTES | input > 6));
output.push_back((byte)(MASKBYTE | input & MASKBITS));
}
// 1110xxxx 10xxxxxx 10xxxxxx
else if(input < 0x10000)
{
output.push_back((byte)(MASK3BYTES | input >> 12));
output.push_back((byte)(MASKBYTE | input >> 6 & MASKBITS));
output.push_back((byte)(MASKBYTE | input & MASKBITS));
}
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
else if(input < 0x200000)
{
output.push_back((byte)(MASK4BYTES | input >> 18));
output.push_back((byte)(MASKBYTE | input >> 12 & MASKBITS));
output.push_back((byte)(MASKBYTE | input >> 6 & MASKBITS));
output.push_back((byte)(MASKBYTE | input & MASKBITS));
}
// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
else if(input < 0x4000000)
{
output.push_back((byte)(MASK5BYTES | input >> 24));
output.push_back((byte)(MASKBYTE | input >> 18 & MASKBITS));
output.push_back((byte)(MASKBYTE | input >> 12 & MASKBITS));
output.push_back((byte)(MASKBYTE | input >> 6 & MASKBITS));
output.push_back((byte)(MASKBYTE | input & MASKBITS));
}
// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
else if(input < 0x8000000)
{
output.push_back((byte)(MASK6BYTES | input >> 30));
output.push_back((byte)(MASKBYTE | input >> 18 & MASKBITS));
output.push_back((byte)(MASKBYTE | input >> 12 & MASKBITS));
output.push_back((byte)(MASKBYTE | input >> 6 & MASKBITS));
output.push_back((byte)(MASKBYTE | input & MASKBITS));
}
}
}
void UTF8Decode4BytesUnicode(std::vector< byte > input,
std::vector< Unicode4Bytes >& output)
{
for(int i=0; i < input.size();)
{
Unicode4Bytes ch;
// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
if((input & MASK6BYTES) == MASK6BYTES)
{
ch = ((input & 0x01) << 30) | ((input[i+1] & MASKBITS) << 24)
| ((input[i+2] & MASKBITS) << 18) | ((input[i+3]
& MASKBITS) << 12)
| ((input[i+4] & MASKBITS) << 6) | (input[i+5] & MASKBITS);
i += 6;
}
// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
else if((input & MASK5BYTES) == MASK5BYTES)
{
ch = ((input & 0x03) << 24) | ((input[i+1]
& MASKBITS) << 18)
| ((input[i+2] & MASKBITS) << 12) | ((input[i+3]
& MASKBITS) << 6)
| (input[i+4] & MASKBITS);
i += 5;
}
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
else if((input & MASK4BYTES) == MASK4BYTES)
{
ch = ((input & 0x07) << 18) | ((input[i+1]
& MASKBITS) << 12)
| ((input[i+2] & MASKBITS) << 6) | (input[i+3] & MASKBITS);
i += 4;
}
// 1110xxxx 10xxxxxx 10xxxxxx
else if((input & MASK3BYTES) == MASK3BYTES)
{
ch = ((input & 0x0F) << 12) | ((input[i+1] & MASKBITS) << 6)
| (input[i+2] & MASKBITS);
i += 3;
}
// 110xxxxx 10xxxxxx
else if((input & MASK2BYTES) == MASK2BYTES)
{
ch = ((input & 0x1F) << 6) | (input[i+1] & MASKBITS);
i += 2;
}
// 0xxxxxxx
else if(input < MASKBYTE)
{
ch = input;
i += 1;
}
output.push_back(ch);
}
}