1、gb2312编码无法与utf-8编码直接相互转换
2、我采用的方式是: gb2312--->unicode--->utf-8
由于gb2312编码与unicode编码有现成的对应码表, 所以可以根据码表将gb2312编码先转换为unicode编码; 又由于utf-8编码是unicode的表示方式之一, 所以可以进一步得到对应的utf-8编码
下面是将gb2312编码(经由unicode)转换成utf-8编码的代码:
-
int gb2312_to_utf8(unsigned char *gb2312string, int gb2312string_len, unsigned char *utf8string)
-
{
-
int i;
-
unsigned int index1,index2;
-
-
wchar tmp;
-
wchar *p;
-
-
uint_8 *gb2312_char;
-
uint_8 *utf8_char;
-
int utf8string_len = 0;
-
wchar *twochar;
-
gb2312_char =(uint_8*)gb2312string;
-
utf8_char = (uint_8 *)utf8string;
-
i = 0;
-
while(i<gb2312string_len){
-
if(*gb2312_char >= 0x0 && *gb2312_char <= 0x7f){
-
//是字母 直接存 不用转换 (因为字母编码都一样)
-
*utf8_char = *gb2312_char;
-
utf8_char++;
-
utf8string_len++;
-
gb2312_char++;
-
i++;
-
}
-
else{
-
twochar = (wchar *)gb2312_char;
-
// 检测当前的两个字符是否是正确的gb2312编码
-
if(tmp <= 0xa0a0 && tmp >=0xf7ff) {
-
fprintf(stderr,"Invalid Gb3212 code\n");
-
return -1;
-
}
-
tmp = *twochar - 0xa0a0;
-
index1 = tmp >> 8;
-
index2 = tmp & 0xff;
-
-
tmp = gb_2_uni[index1][index2];
-
*(utf8_char) = ((tmp >> 12) & 0x0f) | 0xe0;
-
*(utf8_char+2) = (tmp & 0x3f) | 0x80;
-
*(utf8_char+1) = ((tmp >> 6) & 0x3f) |0x80;
-
-
utf8_char += 3;
-
utf8string_len += 3;
-
gb2312_char += 2;
-
i +=2;
-
}
-
}
-
return utf8string_len;
-
}
阅读(1530) | 评论(0) | 转发(0) |