Chinaunix首页 | 论坛 | 博客
  • 博客访问: 2611100
  • 博文数量: 877
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 5921
  • 用 户 组: 普通用户
  • 注册时间: 2013-12-05 12:25
个人简介

技术的乐趣在于分享,欢迎多多交流,多多沟通。

文章分类

全部博文(877)

文章存档

2021年(2)

2016年(20)

2015年(471)

2014年(358)

2013年(26)

分类: 嵌入式

2014-08-27 13:33:48

常用字库编码的转换(Unicode,Utf8,Big5,Gb18030)
http://gzfboke.blog.sohu.com/150509491.html

常用字库编码的转换(Unicode,Utf8,Big5,Gb18030)

vc++ 2009-02-06 15:45:58 阅读123 评论0 字号:大中小

/*
 * Encode a zero-terminated string of UNC code units as UTF-8.
 *
 * src:   source string; conversion stops at the first 0 unit.
 * putf8: output buffer. No bounds checking is done here, so the caller must
 *        provide room for up to 3 bytes per source unit plus the terminator.
 *
 * Returns the number of bytes written, not counting the terminating 0 byte.
 *
 * Every unit is encoded on its own in at most three bytes, i.e. values up to
 * 0xFFFF; surrogate pairs are not combined.
 * NOTE(review): assumes UNC is a 16-bit type -- confirm against its typedef.
 */
int UnicodeToUtf8(LPUNC src, BYTE* putf8)
{
    int len = 0;

    while (*src)
    {
        if (*src < 0x80) /* one byte: 0xxxxxxx */
        {
            putf8[len++] = (BYTE)*src;
        }
        else if (*src < 0x800) /* two bytes: 110xxxxx 10xxxxxx */
        {
            /* The lead byte carries bits 10..6, the trail byte bits 5..0. */
            putf8[len++] = (BYTE)(0xC0 | (*src >> 6));
            putf8[len++] = (BYTE)(0x80 | (*src & 0x3F));
        }
        else /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        {
            putf8[len++] = (BYTE)(0xE0 | (*src >> 12));
            putf8[len++] = (BYTE)(0x80 | ((*src >> 6) & 0x3F));
            putf8[len++] = (BYTE)(0x80 | (*src & 0x3F));
        }
        src++;
    }
    putf8[len] = 0;
    return len;
}

/*
 * Decode one UTF-8 sequence (1 to 3 bytes) starting at src.
 *
 * src:      points at the first byte of the sequence.
 * punicode: receives the decoded value on success; it is left untouched
 *           when the function fails.
 *
 * Returns the number of source bytes consumed (1, 2 or 3), or 0 when the
 * bytes at src do not form a sequence this function accepts (bad lead byte,
 * missing continuation byte, or a sequence longer than three bytes).
 */
int Utf8ToUnicode(BYTE* src, LPUNC punicode)
{
    const BYTE lead = src[0];

    /* Single byte: 0xxxxxxx */
    if ((lead & 0x80) == 0) {
        *punicode = (UNC)lead;
        return 1;
    }

    /* Two bytes: 110xxxxx 10xxxxxx */
    if ((lead & 0xE0) == 0xC0 && (src[1] & 0xC0) == 0x80) {
        *punicode = (UNC)((((UNC)lead & 0x001F) << 6) |
                           ((UNC)src[1] & 0x003F));
        return 2;
    }

    /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
       src[2] is only inspected once src[1] is known to be a continuation
       byte, so the read never runs past a terminating 0. */
    if ((lead & 0xF0) == 0xE0 &&
        (src[1] & 0xC0) == 0x80 &&
        (src[2] & 0xC0) == 0x80) {
        *punicode = (UNC)((((UNC)lead & 0x000F) << 12) |
                          (((UNC)src[1] & 0x003F) << 6) |
                           ((UNC)src[2] & 0x003F));
        return 3;
    }

    /* Anything else is reported as an error. */
    return 0;
}

/*
 * Map one Big5 code to its Unicode value.
 *
 * big5: a single-byte value (< 0x80) or a double-byte Big5 code.
 *
 * Returns the Unicode value, or 0x0000 when big5_unicode_tbl is NULL, the
 * code lies outside 0xA140..0xF9FE, or the code is not found in the table.
 * Values below 0x80 are returned unchanged.
 *
 * big5_unicode_tbl is read as consecutive (Big5, Unicode) pairs and searched
 * by bisection over pair indices 0..13502.
 * NOTE(review): this requires the table to be sorted ascending by Big5 code
 * and to hold 13503 pairs -- confirm against the table definition.
 */
UNC Big5ToUnicode(WORD big5)
{
    if (NULL == big5_unicode_tbl) {
        return 0x0000;
    }

    int low = 0;
    int high = 13502;
    int mid;

    if (big5 < 0x80)
    {
        return big5;
    }

    if (big5 >= 0xA140 && big5 <= 0xF9FE)
    {
        while (low <= high)
        {
            mid = (low + high) / 2;
            WORD cur = *(big5_unicode_tbl + mid * 2);
            if (cur > big5)
            {
                high = mid - 1;
            }
            else if (cur < big5)
            {
                low = mid + 1;
            }
            else
            {
                /* The Unicode value sits right after the matching Big5 code. */
                return *(big5_unicode_tbl + mid * 2 + 1);
            }
        }
    }
    return 0x0000;
}

/*
 * Map one Unicode value to its Big5 code.
 *
 * unicode: the value to convert.
 *
 * Returns the Big5 code, or 0x0 when unicode_big5_tbl is NULL or the value
 * is not found in the table. Values below 0x80 are returned unchanged.
 *
 * unicode_big5_tbl is read as consecutive (Big5, Unicode) pairs and searched
 * by bisection on the Unicode member over pair indices 0..13502.
 * NOTE(review): this requires the table to be sorted ascending by Unicode
 * value and to hold 13503 pairs -- confirm against the table definition.
 */
WORD UnicodeToBig5(UNC unicode)
{
    if (NULL == unicode_big5_tbl) {
        return 0x0000;
    }

    int low = 0;
    int high = 13502;
    int mid;

    if (unicode < 0x80)
    {
        return unicode;
    }

    while (low <= high)
    {
        mid = (low + high) / 2;
        UNC cur = *(unicode_big5_tbl + mid * 2 + 1);
        if (cur > unicode)
        {
            high = mid - 1;
        }
        else if (cur < unicode)
        {
            low = mid + 1;
        }
        else
        {
            /* The Big5 code sits right before the matching Unicode value. */
            return *(unicode_big5_tbl + mid * 2);
        }
    }
    return 0x0;
}

/*
 * Map one single- or double-byte GB18030 code to its Unicode value.
 *
 * gb18030: a single-byte value or a double-byte code (lead byte in the
 *          high 8 bits). Four-byte GB18030 sequences cannot be expressed
 *          in this parameter and are not handled.
 *
 * Returns the Unicode value, or 0x0000 when gb18030_unicode_tbl is NULL,
 * the code lies outside 0x8140..0xFE4F, or the code is not in the table.
 * Values below 0x80 are returned unchanged; 0x80 maps to U+20AC (euro sign).
 *
 * gb18030_unicode_tbl is read as consecutive (GB18030, Unicode) pairs and
 * searched by bisection over pair indices 0..21790.
 * NOTE(review): this requires the table to be sorted ascending by GB18030
 * code -- confirm against the table definition.
 */
UNC Gb18030ToUnicode(WORD gb18030)
{
    int first = 0;
    int last = 21790;

    if (gb18030_unicode_tbl == NULL) {
        return 0x0000;
    }
    if (gb18030 < 0x80) {
        return gb18030;
    }
    if (gb18030 == 0x80) {
        return 0x20AC;
    }
    if (gb18030 < 0x8140 || gb18030 > 0xFE4F) {
        return 0x0000;
    }

    while (first <= last) {
        int probe = (first + last) / 2;
        WORD key = *(gb18030_unicode_tbl + probe * 2);

        if (key > gb18030) {
            last = probe - 1;
        } else if (key < gb18030) {
            first = probe + 1;
        } else {
            /* Match: the Unicode value follows the GB18030 code. */
            return *(gb18030_unicode_tbl + probe * 2 + 1);
        }
    }
    return 0x0000;
}

/*
 * Map one Unicode value to its single- or double-byte GB18030 code.
 *
 * unicode: the value to convert.
 *
 * Returns the GB18030 code, or 0x0 when unicode_gb18030_tbl is NULL or the
 * value is not in the table. Values below 0x80 are returned unchanged;
 * U+20AC (euro sign) maps to 0x80.
 *
 * unicode_gb18030_tbl is read as consecutive (GB18030, Unicode) pairs and
 * searched by bisection on the Unicode member over pair indices 0..21790.
 * NOTE(review): this requires the table to be sorted ascending by Unicode
 * value -- confirm against the table definition.
 */
WORD UnicodeToGb18030(UNC unicode)
{
    int first = 0;
    int last = 21790;

    if (unicode_gb18030_tbl == NULL) {
        return 0x0000;
    }
    if (unicode < 0x80) {
        return unicode;
    }
    if (unicode == 0x20AC) {
        return 0x80;
    }

    while (first <= last) {
        int probe = (first + last) / 2;
        UNC key = *(unicode_gb18030_tbl + probe * 2 + 1);

        if (key > unicode) {
            last = probe - 1;
        } else if (key < unicode) {
            first = probe + 1;
        } else {
            /* Match: the GB18030 code precedes the Unicode value. */
            return *(unicode_gb18030_tbl + probe * 2);
        }
    }
    return 0x0;
}


// Convert a zero-terminated UTF-8 string to UNC characters.
// Parameters: src: source string; dest: destination buffer;
//             dest_len: capacity of dest, in characters.
// Returns: the number of characters written to dest. The terminating 0 is
//          included in the count when it was written; when dest fills up
//          first, the result is dest_len and dest is NOT zero-terminated.
// Bytes that Utf8ToUnicode rejects are replaced by UNKNOWN_CHAR, one output
// character per rejected byte, and a diagnostic dump is printed afterwards.
int utf8_to_unicode(BYTE* src, LPUNC dest, int dest_len)
{
    char* old_src = (char *)src; // kept only for the diagnostic dump below
    int err = 0;
    int terminated = 0;
    int i = 0;
    while(i < dest_len){
        int len = Utf8ToUnicode(src, &dest[i]);
        if(0 == len){
            // Invalid sequence: emit a placeholder and skip a single byte.
            err = 1;
            len = 1;
            dest[i] = UNKNOWN_CHAR;
        }
        src += len;
        if(0x0000 == dest[i]){
            terminated = 1;
            break; // end-of-string terminator
        }
        i ++;
    }
    if(err){
        printf("utf8 string err! ");T();
        DUMP_BUFFER("---------------",old_src,20);
        DUMP_BUFFER("===============",(char *)dest,20);
    }
    // Count the terminator only if it was actually stored; otherwise the
    // loop ended because dest is full and exactly i characters were written.
    return terminated ? i + 1 : i;
}

// Convert a zero-terminated UTF-8 string to UNC characters and report how
// many invalid bytes were encountered.
// Parameters: src: source string; dest: destination buffer;
//             dest_len: capacity of dest, in characters;
//             perr_count: optional (may be NULL); receives the number of
//             bytes that were replaced by UNKNOWN_CHAR.
// Returns: the number of characters written to dest. The terminating 0 is
//          included in the count when it was written; when dest fills up
//          first, the result is dest_len and dest is NOT zero-terminated.
int utf8_to_unicode2(BYTE* src, LPUNC dest, int dest_len, int* perr_count)
{
    // perr_count is optional, so it must be checked before every access.
    if(NULL != perr_count){
        *perr_count = 0;
    }
    char* old_src = (char *)src; // kept only for the diagnostic dump below
    int err = 0;
    int terminated = 0;
    int i = 0;
    while(i < dest_len){
        int len = Utf8ToUnicode(src, &dest[i]);
        if(0 == len){
            // Invalid sequence: emit a placeholder and skip a single byte.
            err = 1;
            len = 1;
            dest[i] = UNKNOWN_CHAR;
            if(NULL != perr_count){
                (*perr_count) ++;
            }
        }
        src += len;
        if(0x0000 == dest[i]){
            terminated = 1;
            break; // end-of-string terminator
        }
        i ++;
    }
    if(err){
        printf("utf8 string err! ");T();
        DUMP_BUFFER("---------------",old_src,20);
        DUMP_BUFFER("===============",(char *)dest,20);
    }
    // Count the terminator only if it was actually stored; otherwise the
    // loop ended because dest is full and exactly i characters were written.
    return terminated ? i + 1 : i;
}

// Convert a zero-terminated GB18030 string (single- and double-byte
// characters only) to UNC characters.
// Parameters: src: source string; dest: destination buffer;
//             dest_len: capacity of dest, in characters.
// Returns: the number of characters written to dest. The terminating 0 is
//          included in the count when it was written; when dest fills up
//          first, the result is dest_len and dest is NOT zero-terminated.
// NOTE: Gb18030ToUnicode returns 0x0000 for codes it cannot map, so an
//       unmapped double-byte character is stored as 0 and ends the
//       conversion exactly like the real terminator does.
int gb18030_to_unicode(BYTE* src, LPUNC dest, int dest_len)
{
    int terminated = 0;
    int i = 0;
    while(i < dest_len){
        if(*src < 0x80){
            // Single byte: copied through unchanged.
            dest[i] = (UNC)(*src);
            src += 1;
        }else{
            // Double byte: the lead byte becomes the high half of the code.
            WORD ch = MAKEWORD(*(src+1), *src);
            dest[i] = Gb18030ToUnicode(ch);
            src += 2;
        }
        if(0x0000 == dest[i]){
            terminated = 1;
            break; // end-of-string terminator
        }
        i ++;
    }
    // Count the terminator only if it was actually stored; otherwise the
    // loop ended because dest is full and exactly i characters were written.
    return terminated ? i + 1 : i;
}

阅读(939) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~