分类: C/C++
2008-03-25 21:30:49
UTF8是一种储存和传送的格式。
UTF8是以8bits即1 Byte为编码的最基本单位;类似地,也有以16bits和32bits为基本单位的编码形式,分别称为UTF16和UTF32,但目前使用不多,而UTF8则被广泛应用在文件储存和网络传输中。
编码原理
先看这个模板:
UCS-4 range (hex.) UTF-8 octet sequence (binary)
0000 0000-0000 007F 0xxxxxxx
0000 0080-0000 07FF 110xxxxx 10xxxxxx
0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
编码步骤:
1) 首先确定需要多少个8bits(octets)
2) 按照上述模板填充每个octets的高位bits
3) 把字符的bits填充至x中,字符顺序:低位→高位,UTF8顺序:最后一个octet的最末位x→第一个octet最高位x
4) 解码的原理一样。
实例:(留意每个bit的颜色,粗体字为模板内容)
UCS-4 UTF-8
HEX BIN Bytes BIN HEX Bytes
0000 000A 00001010 4 00001010 0A 1
0000 0099 10011001 4 11000010 10011001 C2 99 2
0000 8D99 10001101 10011001 4 11101000 10110110 10011001 E8 B6 99 3
以UTF8格式储存的文件档首标识为EF BB BF。
效率
从上述编码原理中可以得出以下结论:
1.每个英文字母、数字所占用空间为1 Byte。
2.汉字占3 Bytes。
而直接使用无论是ANSI还是Unicode/UCS2来编码,中文都只占用2字节。所以UTF8对英文来说是个非常好的方案,但对中文来说并不是一个好的选择。
因此:大文件分块读取时,很有可能把一个UTF8编码的字符(character)从中间分开,需要把块末尾不构成一个完整字符的bytes放到下一个文件block中处理。我们仅需要判断当前字节是不是一个UTF8字符的第一个字节,判断方式如下:
(1) C < 0x80 || (C & 0xC0) == 0xC0    (等价于 (C & 0xC0) != 0x80,即当前字节不是 10xxxxxx 形式的后续字节)
utf8_to_utf32(即utf8_to_ucs4):解码函数
//
//  utf8_to_utf32
//
//  Decodes a single code point from the start of the specified UTF-8
//  buffer into a UTF-32 value.
//
//  Ill-formed input is converted to UNI_REPLACEMENT_CHAR. This covers a
//  stray continuation byte, an invalid lead byte, a truncated sequence,
//  a bad trail byte, an overlong (non-shortest) form, the obsolete 5- and
//  6-byte forms, an encoded UTF-16 surrogate (U+D800..U+DFFF) and any
//  value above U+10FFFF.
//
//  utf8str  - [in]  buffer containing UTF-8 text
//  utf8len  - [in]  number of code-units (bytes) available in buffer
//  pch32    - [out] single UTF-32 value
//
//  Returns the number of bytes consumed from utf8str. Returns 0, without
//  reading utf8str or writing *pch32, when utf8str or pch32 is null or
//  utf8len is 0.
//
size_t utf8_to_utf32(UTF8 *utf8str, size_t utf8len, UTF32 *pch32)
{
    // Indexed by the number of trailing bytes: the smallest value that
    // genuinely needs a sequence of that length. Anything smaller is an
    // overlong encoding. The 5/6-byte entries are set so that every
    // decoded value compares below them and is therefore rejected.
    static const UTF32 nonshortest[] =
    {
        0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff
    };

    UTF8   ch;
    UTF32  val32    = 0;
    size_t trailing = 0;
    size_t len      = 1;
    size_t i;

    // validate parameters BEFORE touching the buffer
    if(!utf8str || utf8len == 0 || !pch32)
        return 0;

    ch = *utf8str++;

    // look for plain ASCII first as this is most likely
    if(ch < 0x80)
    {
        *pch32 = (UTF32)ch;
        return 1;
    }
    // LEAD-byte of 2-byte seq: 110xxxxx 10xxxxxx
    else if((ch & 0xE0) == 0xC0)
    {
        trailing = 1;
        val32    = ch & 0x1F;
    }
    // LEAD-byte of 3-byte seq: 1110xxxx 10xxxxxx 10xxxxxx
    else if((ch & 0xF0) == 0xE0)
    {
        trailing = 2;
        val32    = ch & 0x0F;
    }
    // LEAD-byte of 4-byte seq: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    else if((ch & 0xF8) == 0xF0)
    {
        trailing = 3;
        val32    = ch & 0x07;
    }
    // ILLEGAL 5-byte seq: 111110xx + 4 trail bytes.
    // Still consumed as a unit; rejected by the final validity check.
    else if((ch & 0xFC) == 0xF8)
    {
        trailing = 4;
        val32    = ch & 0x03;
    }
    // ILLEGAL 6-byte seq: 1111110x + 5 trail bytes.
    // Still consumed as a unit; rejected by the final validity check.
    else if((ch & 0xFE) == 0xFC)
    {
        trailing = 5;
        val32    = ch & 0x01;
    }
    // Lone continuation byte (10xxxxxx) or any other illegal lead byte
    // (0xFE, 0xFF): consume just this byte.
    else
    {
        *pch32 = UNI_REPLACEMENT_CHAR;
        return 1;
    }

    // process trailing bytes, never reading past utf8len
    for(i = 0; i < trailing && len < utf8len; i++)
    {
        ch = *utf8str;

        // Anything but a valid trail-byte (10xxxxxx) is an error. The
        // offending byte is NOT consumed, so the caller can resynchronise
        // on it.
        if((ch & 0xC0) != 0x80)
        {
            *pch32 = UNI_REPLACEMENT_CHAR;
            return len;
        }

        val32 = (val32 << 6) | (UTF32)(ch & 0x3F);
        utf8str++;
        len++;
    }

    // Accept only a complete, shortest-form sequence that encodes a
    // Unicode scalar value: not a surrogate, not above U+10FFFF.
    if(i != trailing ||
       val32 < nonshortest[trailing] ||
       val32 > 0x10FFFF ||
       (val32 >= 0xD800 && val32 <= 0xDFFF))
    {
        *pch32 = UNI_REPLACEMENT_CHAR;
    }
    else
    {
        *pch32 = val32;
    }

    return len;
}
utf32_to_utf8(即ucs4_to_utf8):编码函数
//
//  utf32_to_utf8
//
//  Encodes one UTF-32 value as a UTF-8 byte sequence.
//
//  Values below 0x80 are stored as a single byte. Otherwise, a value above
//  UNI_MAX_LEGAL_UTF32 or inside UNI_SUR_HIGH_START..UNI_SUR_LOW_END is
//  replaced by UNI_REPLACEMENT_CHAR before being encoded as a 2-, 3- or
//  4-byte sequence.
//
//  utf8str  - [out] buffer to receive UTF-8 text
//  utf8len  - [in]  size of utf8 buffer in bytes
//  ch32     - [in]  single utf-32 value
//
//  Returns number of bytes stored in utf8str. Returns 0 (nothing written)
//  when utf8str is null, utf8len is 0, or the buffer is too small for the
//  sequence.
//
size_t utf32_to_utf8(UTF8 *utf8str, size_t utf8len, UTF32 ch32)
{
    // lead-byte marker bits, indexed by total sequence length
    static const UTF8 lead_mark[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

    size_t   needed;
    size_t   k;
    unsigned shift;

    // reject unusable output buffers
    if(!utf8str || !utf8len)
        return 0;

    // single-byte (ASCII) fast path
    if(ch32 < 0x80)
    {
        utf8str[0] = (UTF8)ch32;
        return 1;
    }

    // out-of-range values and surrogates cannot be encoded: substitute
    if(ch32 > UNI_MAX_LEGAL_UTF32 ||
       (ch32 >= UNI_SUR_HIGH_START && ch32 <= UNI_SUR_LOW_END))
    {
        ch32 = UNI_REPLACEMENT_CHAR;
    }

    // how many bytes does this value take?
    if(ch32 < 0x800)
        needed = 2;
    else if(ch32 < 0x10000)
        needed = 3;
    else if(ch32 <= UNI_MAX_LEGAL_UTF32)
        needed = 4;
    else
        return 0;   // 5/6 byte sequences are never produced

    // not enough room: store nothing
    if(utf8len < needed)
        return 0;

    // lead byte carries the topmost payload bits plus the length marker
    shift = 6u * (unsigned)(needed - 1);
    utf8str[0] = (UTF8)((ch32 >> shift) | lead_mark[needed]);

    // each continuation byte is 10xxxxxx with the next six payload bits
    for(k = 1; k < needed; k++)
    {
        shift -= 6u;
        utf8str[k] = (UTF8)(((ch32 >> shift) & 0x3f) | 0x80);
    }

    return needed;
}