Chinaunix首页 | 论坛 | 博客
  • 博客访问: 30204440
  • 博文数量: 2065
  • 博客积分: 10377
  • 博客等级: 上将
  • 技术积分: 21525
  • 用 户 组: 普通用户
  • 注册时间: 2008-11-04 17:50
文章分类

全部博文(2065)

文章存档

2012年(2)

2011年(19)

2010年(1160)

2009年(969)

2008年(153)

分类: C/C++

2010-07-24 07:40:16

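/*
    Interface:
        utf*_encode converts a UTF-8 / UTF-16 string of length `is` (bytes for
        UTF-8, wchar_t elements for UTF-16) into the utf-c format.
        `os` is the size of the output buffer. If the converted result does not
        fit into the buffer, -1 is returned; otherwise the length of the
        converted result is returned.

        utf*_decode converts a utf-c string into UTF-8 or UTF-16.
        `is`, `os` and the return value have the same meaning as above.
 */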

 
/* Encode `is` bytes of UTF-8 at `utf8` into utf-c bytes at `utfc` (capacity `os` bytes).
   Returns the number of bytes written, or -1 if the output does not fit. */
int utf8_encode(const char *utf8,size_t is,unsigned char *utfc,size_t os);
/* Decode `is` utf-c bytes at `utfc` into UTF-8 bytes at `utf8` (capacity `os` bytes).
   Returns the number of bytes written, or -1 if the output does not fit. */
int utf8_decode(const unsigned char *utfc,size_t is,char *utf8,size_t os);
/* Encode `is` wchar_t units at `utf16` into utf-c bytes at `utfc` (capacity `os` bytes).
   Returns the number of bytes written, or -1 if the output does not fit. */
int utf16_encode(const wchar_t *utf16,size_t is,unsigned char *utfc,size_t os);
/* Decode `is` utf-c bytes at `utfc` into wchar_t units at `utf16`.
   Returns the number of wchar_t units written, or -1 if the output does not fit. */
int utf16_decode(const unsigned char *utfc,size_t is,wchar_t *utf16,size_t os);



#include <stddef.h>
#include <wchar.h>
 
/*
 * Encode UTF-8 text into the compact "utf-c" byte format.
 *
 *   utf8  input bytes (not required to be NUL-terminated)
 *   is    number of input bytes
 *   utfc  output buffer
 *   os    capacity of the output buffer in bytes
 *
 * Output layout, per decoded character:
 *   - a byte below 128 is copied unchanged;
 *   - otherwise the value is split into a high and a low byte. If the high
 *     byte is in 0x20..0x9f and is not 0x3f, two bytes are written:
 *     (high + 0x60), low. In every other case three bytes are written:
 *     the escape byte 0x9f, high, low.
 *
 * Only the low 16 bits of a decoded value are kept, so sequences that decode
 * to something above 0xFFFF are not preserved.
 *
 * Returns the number of bytes written, or -1 when the output buffer is too
 * small (bytes written before the failure was detected stay in the buffer).
 */
int
utf8_encode(const char *utf8,size_t is,unsigned char *utfc,size_t os)
{
    const unsigned char *input=(const unsigned char *)utf8;
    unsigned char *output=utfc;

    /* Skip continuation bytes (10xxxxxx) that precede the first lead byte.
       The length is tested before the byte is read so that an empty or
       all-continuation input never causes a read past the buffer. */
    while (is!=0 && (*input & 0xc0) == 0x80) {
        ++input;
        --is;
    }

    if (is==0) {
        return 0;
    }

    do {
        if (os==0) {
            return -1;
        }
        if (*input < 128) {
            *output++ = *input++;
            --is;
            --os;
        }
        else {
            unsigned char b=*input++;
            /* Payload mask for the lead byte: 0x1f when bit 5 is clear
               (110xxxxx), 0x0f otherwise. Unsigned arithmetic keeps the
               shifts below well defined even for over-long malformed runs. */
            unsigned c=b & (0x0fu | (~(unsigned)(b>>1) & 0x1fu));
            --is;
            /* Fold in 6 payload bits from every following continuation byte. */
            while (is!=0 && (*input & 0xc0)==0x80) {
                c=c<<6 | (*input & 0x3fu);
                ++input;
                --is;
            }

            unsigned char hi = (unsigned char)(c >> 8);
            unsigned char low = (unsigned char)(c&0xff);
            if (hi>=0x20 && hi<=0x9f && hi!=0x3f) {
                /* Two-byte form. hi==0x3f is excluded because 0x3f+0x60
                   would collide with the 0x9f escape byte. */
                if (os<2) {
                    return -1;
                }
                *output++ = (unsigned char)(hi + 0x60);
                *output++ = low;
                os-=2;
            }
            else {
                /* Three-byte escaped form. */
                if (os<3) {
                    return -1;
                }
                *output++ = 0x9f;
                *output++ = hi;
                *output++ = low;
                os-=3;
            }
        }
    } while (is!=0);

    return (int)(output-utfc);
}
 
/*
 * Decode utf-c bytes back into UTF-8.
 *
 *   utfc  input bytes in utf-c format
 *   is    number of input bytes
 *   utf8  output buffer
 *   os    capacity of the output buffer in bytes
 *
 * A byte below 128 is copied through. A byte equal to 0x9f starts a
 * three-byte sequence (escape, high, low); any other byte starts a two-byte
 * sequence ((high + 0x60), low). A sequence cut short by the end of the
 * input decodes as 0xffff and ends the input.
 *
 * Returns the number of bytes written, or -1 when the output buffer is too
 * small.
 */
int
utf8_decode(const unsigned char *utfc,size_t is,char *utf8,size_t os)
{
    const unsigned char *src = utfc;
    unsigned char *dst = (unsigned char *)utf8;
    size_t remaining = is;
    size_t room = os;

    while (remaining != 0) {
        int code;
        size_t seq_len;

        if (room == 0) {
            return -1;
        }

        /* Plain ASCII byte: copy and move on. */
        if (src[0] < 128) {
            *dst++ = *src++;
            --remaining;
            --room;
            continue;
        }

        /* 0x9f introduces the escaped three-byte form; everything else is
           the two-byte form. */
        seq_len = (src[0] == 0x9f) ? 3 : 2;

        if (remaining < seq_len) {
            /* Truncated sequence: substitute 0xffff and stop after it. */
            code = 0xffff;
            remaining = 0;
        }
        else {
            if (seq_len == 3) {
                code = src[1] << 8 | src[2];
            }
            else {
                code = (src[0] - 0x60) << 8 | src[1];
            }
            src += seq_len;
            remaining -= seq_len;
        }

        if (code > 0x7ff) {
            /* Three-byte UTF-8 sequence. */
            if (room < 3) {
                return -1;
            }
            dst[0] = (unsigned char)((code >> 12) | 0xe0);
            dst[1] = (unsigned char)(((code >> 6) & 0x3f) | 0x80);
            dst[2] = (unsigned char)((code & 0x3f) | 0x80);
            dst += 3;
            room -= 3;
        }
        else {
            /* Two-byte UTF-8 sequence. */
            if (room < 2) {
                return -1;
            }
            dst[0] = (unsigned char)((code >> 6) | 0xc0);
            dst[1] = (unsigned char)((code & 0x3f) | 0x80);
            dst += 2;
            room -= 2;
        }
    }

    return (int)(dst - (unsigned char *)utf8);
}
 
/*
 * Encode an array of wchar_t units into utf-c bytes.
 *
 *   utf16  input units
 *   is     number of input units
 *   utfc   output buffer
 *   os     capacity of the output buffer in bytes
 *
 * Units below 128 become a single byte. Any other unit is split into a high
 * and a low byte (only the low 16 bits of the unit are used) and written
 * either as two bytes ((high + 0x60), low) or, when the high byte is outside
 * 0x20..0x9f or equals 0x3f, as three bytes (0x9f, high, low).
 *
 * Returns the number of bytes written, or -1 when the output buffer is too
 * small.
 */
int
utf16_encode(const wchar_t *utf16,size_t is,unsigned char *utfc,size_t os)
{
    const wchar_t *src = utf16;
    unsigned char *dst = utfc;
    size_t left = is;
    size_t room = os;

    while (left != 0) {
        unsigned unit;
        int code;
        unsigned char high;
        unsigned char low;

        if (room == 0) {
            return -1;
        }

        unit = (unsigned)*src++;
        --left;

        /* ASCII range goes through untouched. */
        if (unit < 128) {
            *dst++ = (unsigned char)unit;
            --room;
            continue;
        }

        code = unit;
        high = (unsigned char)(code >> 8);
        low = (unsigned char)(code & 0xff);

        if (high < 0x20 || high > 0x9f || high == 0x3f) {
            /* High byte cannot be biased into a lead byte: escaped form. */
            if (room < 3) {
                return -1;
            }
            dst[0] = 0x9f;
            dst[1] = high;
            dst[2] = low;
            dst += 3;
            room -= 3;
        }
        else {
            /* Compact two-byte form. */
            if (room < 2) {
                return -1;
            }
            dst[0] = (unsigned char)(high + 0x60);
            dst[1] = low;
            dst += 2;
            room -= 2;
        }
    }

    return (int)(dst - utfc);
}
 
/*
 * Decode utf-c bytes into an array of wchar_t units.
 *
 *   utfc   input bytes in utf-c format
 *   is     number of input bytes
 *   utf16  output buffer
 *   os     capacity of the output buffer, counted in wchar_t elements
 *
 * A byte below 128 yields one unit with that value. A byte equal to 0x9f
 * starts a three-byte sequence (escape, high, low); any other byte starts a
 * two-byte sequence ((high + 0x60), low). A sequence cut short by the end of
 * the input decodes as 0xffff and ends the input.
 *
 * Returns the number of wchar_t units written, or -1 when the output buffer
 * is too small.
 */
int
utf16_decode(const unsigned char *utfc,size_t is,wchar_t *utf16,size_t os)
{
    const unsigned char *input=utfc;
    wchar_t *output=utf16;

    if (is==0) {
        return 0;
    }

    do {
        /* Every iteration emits exactly one unit, so one free slot is
           required regardless of which branch is taken. */
        if (os==0) {
            return -1;
        }
        if (*input < 128) {
            *output++ = (wchar_t)*input++;
            --is;
            --os;
        }
        else {
            int c;
            if (*input==0x9f) {
                if (is<3) {
                    /* Truncated escaped sequence. */
                    c=0xffff;
                    is=0;
                }
                else {
                    c=input[1]<<8 | input[2];
                    is-=3;
                    input+=3;
                }
            }
            else {
                if (is<2) {
                    /* Truncated two-byte sequence. */
                    c=0xffff;
                    is=0;
                }
                else {
                    /* Undo the 0x60 bias applied to the high byte. */
                    c=(input[0] - 0x60)<<8 | input[1];
                    is-=2;
                    input+=2;
                }
            }
            *output++ =(wchar_t)c;
            /* Account for the unit just written; without this the capacity
               check above never triggers for non-ASCII data and the buffer
               can be overrun. */
            --os;
        }
    } while (is!=0);

    return (int)(output-utf16);
}


阅读(1477) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~