GB2312与UTF-8编码的转换 [3] - LittleBearjpf

LittleBearjpf.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

L__J

博客访问： 377659
博文数量： 53
博客积分： 1411
博客等级：上尉
技术积分： 701
用户组：普通用户
注册时间： 2008-11-04 14:40

文章分类

全部博文（53）

Linux（0）
VOIP（6）
生活点滴（0）
未分配的博文（47）

文章存档

2011年（6）

2010年（20）

2009年（18）

2008年（9）

我的朋友

相关博文

GB2312与UTF-8编码的转换 [3]

分类： C/C++

2009-02-23 11:38:47

字符转换函数：

encoding.c

/************************************************************************
*
* File name:                encoding.c
*
* Description:                Convert character encoding between GB2312
*                             and Unicode(Utf-8).
*
* Author:                    JiangPeifu
* Creation date:            2009-02-20
*
************************************************************************/

#include <stdio.h>
#include "encoding.h"

/*
 * Debug tracing switch.
 *
 * Set ENCODING_DEBUG to 1 to forward every debug(...) call to printf.
 * With 0 the call expands to a no-op expression, so its arguments are
 * not evaluated at all (no stray comma expressions, no side effects).
 */
#define ENCODING_DEBUG 0

#if ENCODING_DEBUG
#define debug(...) printf(__VA_ARGS__)
#else
#define debug(...) ((void)0)
#endif /* ENCODING_DEBUG */

/* Single-character table lookups; each returns a pointer into a table. */
const unsigned char *GBCodeToUnicode(unsigned char *gbCode);
const unsigned char *UnicodeToGBCode(unsigned char *unicode);

/* Single-character helpers, private to this file. */
static void UnicodeToUtf8(char *utf8, char *unicode);
static void Utf8ToUnicode(char *unicode, char *utf8);

/*
 * GB2312 string -> UTF-8 string.
 *   utf8Str : output UTF-8 chars
 *   gbStr   : input GB2312 chars
 *   nBytes  : size of the input GB2312 chars, in bytes
 */
void GB2312StrToUtf8(char *utf8Str, char *gbStr, int nBytes);

/*
 * UTF-8 string -> GB2312 string.
 *   gbStr   : output GB2312 chars
 *   utf8Str : input UTF-8 chars
 *   nBytes  : size of the input UTF-8 chars, in bytes
 */
void Utf8StrToGB2312(char *gbStr, char *utf8Str, int nBytes);

/************************************************************************
* Function: GBCodeToUnicode
*     Look up the Unicode entry for one two-byte GB2312 character.
*
*     gbCode - points at the lead byte; the trail byte is read from
*              gbCode[1].
*
*     Returns a pointer to the matching two-byte entry in gb2uTable, or
*     0 when the trail byte is outside 0xa1-0xfe or the lead byte is
*     outside both 0xa1-0xa9 and 0xb0-0xf7.
************************************************************************/
const unsigned char *GBCodeToUnicode(unsigned char *gbCode)
{
    const unsigned char trail = gbCode[1];
    unsigned char lead;
    unsigned int row;

    if ((trail < 0xa1) || (trail > 0xfe))
    {
        debug("ERROR: GB2312 convert to unicode!!!\n");
        return 0;
    }

    lead = gbCode[0];

    if ((lead >= 0xa1) && (lead <= 0xa9))
    {
        row = lead - 0xa1;
    }
    else if ((lead >= 0xb0) && (lead <= 0xf7))
    {
        /* These rows are stored right after the nine rows 0xa1-0xa9. */
        row = lead - 0xb0 + 9;
    }
    else
    {
        debug("ERROR: GB2312 convert to unicode!!!\n");
        return 0;
    }

    /* 94 cells per row (0xa1-0xfe), two table bytes per cell. */
    return &gb2uTable[(row * 94 + (trail - 0xa1)) * 2];
}

/************************************************************************
* Function: UnicodeToGBCode
*     Look up the GB2312 entry for one 16-bit Unicode value.
*
*     unicode - two bytes, high byte first.
*
*     Returns a pointer to the two-byte entry in u2gbTable at index
*     (value * 2). The value is not range-checked here.
************************************************************************/
const unsigned char *UnicodeToGBCode(unsigned char *unicode)
{
    const unsigned int high = unicode[0];
    const unsigned int low = unicode[1];
    const unsigned int value = (high << 8) + low;

    return &u2gbTable[value * 2];
}

/************************************************************************
* Function: UnicodeToUtf8
*     Encode one 16-bit Unicode value as a three-byte UTF-8 sequence.
*
*     utf8    - output, exactly three bytes are always written.
*     unicode - input, two bytes with the high byte first; may be 0.
*
*     When unicode is 0 the replacement character U+FFFD (EF BF BD) is
*     written, so the caller never reads an unwritten buffer.
*
*     NOTE(review): the three-byte form is used for every value. If the
*     input can be below U+0800 the result is an overlong sequence;
*     confirm against the contents of the lookup table.
************************************************************************/
static void UnicodeToUtf8(char* utf8, char *unicode)
{
    unsigned char high;
    unsigned char low;

    if (unicode == 0)
    {
        debug("ERROR: Unicode convert to utf8, unicode=0\n");
        utf8[0] = (char)0xEF;
        utf8[1] = (char)0xBF;
        utf8[2] = (char)0xBD;
        return;
    }

    high = (unsigned char)unicode[0];
    low = (unsigned char)unicode[1];

    /* 1110xxxx 10xxxxxx 10xxxxxx: 4 + 6 + 6 payload bits. */
    utf8[0] = (char)(0xE0 | (high >> 4));
    utf8[1] = (char)(0x80 | ((high & 0x0F) << 2) | (low >> 6));
    utf8[2] = (char)(0x80 | (low & 0x3F));
}

/************************************************************************
*     Function: GB2312StrToUtf8
*         Convert a GB2312 byte string to a NUL-terminated UTF-8 string.
*
*         Bytes below 0x80 are copied unchanged. Any other byte starts a
*         two-byte GB2312 character, which is looked up with
*         GBCodeToUnicode and written as three UTF-8 bytes. A character
*         with no table entry, or a lone lead byte at the end of the
*         input, is written as a single '?'.
*
*         The output capacity is not checked: utf8Str must hold up to
*         three bytes for every two input bytes, plus the terminator.
*         Nothing is written when utf8Str is 0; a 0 gbStr is treated as
*         empty input.
***********************************************************************/
void GB2312StrToUtf8(
        char *utf8Str,        /* Output Utf-8 chars */
        char* gbStr,        /* Input GB2312 chars */
        int nBytes            /* size of input GB2312 chars */
        )
{
    int i = 0;
    int j = 0;

    if (utf8Str == 0)
    {
        return;
    }
    if (gbStr == 0)
    {
        nBytes = 0;
    }

    while (i < nBytes)
    {
        /* Test the byte as unsigned: plain char may be signed or not. */
        const unsigned char lead = (unsigned char)gbStr[i];

        if (lead < 0x80)
        {
            utf8Str[j++] = gbStr[i++];
            debug("GB2312Str[%d]=%c\n", i-1, gbStr[i-1]);
            debug(" utf8Str[%d]=%c\n", j-1, utf8Str[j-1]);
        }
        else if (i + 1 >= nBytes)
        {
            /* Lead byte with no trail byte: do not read past the input. */
            debug("ERROR: GB2312 convert to utf8, truncated character\n");
            utf8Str[j++] = '?';
            i += 1;
        }
        else
        {
            const unsigned char *mapped =
                GBCodeToUnicode((unsigned char *)(gbStr + i));

            if (mapped == 0)
            {
                /* No table entry for this pair. */
                utf8Str[j++] = '?';
                debug(" utf8Str[%d]=%c\n", j-1, utf8Str[j-1]);
            }
            else
            {
                char buf[3];

                debug("unicode [0]=%x, [1]=%x\n", mapped[0], mapped[1]);
                /* The helper only reads its input; the cast is needed
                 * because its parameter is a plain char pointer. */
                UnicodeToUtf8(buf, (char *)mapped);

                utf8Str[j++] = buf[0];
                debug(" utf8Str[%d]=%x\n", j-1, (unsigned char)utf8Str[j-1]);
                utf8Str[j++] = buf[1];
                debug(" utf8Str[%d]=%x\n", j-1, (unsigned char)utf8Str[j-1]);
                utf8Str[j++] = buf[2];
                debug(" utf8Str[%d]=%x\n", j-1, (unsigned char)utf8Str[j-1]);
            }

            i += 2;
        }
    }
    utf8Str[j] = '\0';
}

/************************************************************************
* Function: Utf8ToUnicode
*     Decode one UTF-8 character into a 16-bit Unicode value.
*
*     unicode - output, two bytes with the high byte first; both bytes
*               are written on every path.
*     utf8    - input; one byte is read for a single-byte character and
*               three bytes for a 1110xxxx lead byte.
*
*     Any other lead byte is reported through debug() and decoded as
*     '?' (U+003F). Continuation bytes are not validated here.
************************************************************************/
static void Utf8ToUnicode(char* unicode, char *utf8)
{
    if (0 == (*utf8 & 0x80))
    {
        /*
         * single-byte char
         */
        unicode[0] = 0;
        unicode[1] = *utf8;
    }
    else if ((*utf8 & 0xf0) == 0xe0)
    {
        /*
         * 3-byte char (chinese char)
         */
        unicode[0] = (char)(((utf8[0] & 0x0f) << 4) | ((utf8[1] & 0x3c) >> 2));
        unicode[1] = (char)(((utf8[1] & 0x03) << 6) | (utf8[2] & 0x3f));
    }
    else
    {
        debug("ERROR: utf-8 to unicode, nBytes !=3\n");
        unicode[0] = 0;
        unicode[1] = '?';
    }
}

/************************************************************************
*     Function: Utf8StrToGB2312
*         Convert a UTF-8 byte string to a NUL-terminated GB2312 string.
*
*         Bytes below 0x80 are copied unchanged. A well-formed three-byte
*         sequence is decoded and looked up with UnicodeToGBCode, giving
*         two output bytes. Anything else (other lead bytes, a stray
*         continuation byte, a sequence cut off by nBytes) is written as
*         one '?' and skipped up to the next non-continuation byte.
*
*         The output capacity is not checked: gbStr must hold nBytes + 1
*         bytes. Nothing is written when gbStr is 0; a 0 utf8Str is
*         treated as empty input.
***********************************************************************/
void Utf8StrToGB2312(
        char *gbStr,        /* Output GB2312 chars */
        char* utf8Str,        /* Input Utf-8 chars */
        int nBytes            /* Size of input Utf-8 chars */
        )
{
    int i = 0;
    int j = 0;

    if (gbStr == 0)
    {
        return;
    }
    if (utf8Str == 0)
    {
        nBytes = 0;
    }

    while (i < nBytes)
    {
        const unsigned char lead = (unsigned char)utf8Str[i];

        if (0 == (lead & 0x80))
        {
            gbStr[j++] = utf8Str[i++];
            debug(" utf8Str[%d]=%c\n", i-1, utf8Str[i-1]);
            debug("GB2312Str[%d]=%c\n", j-1, gbStr[j-1]);
        }
        else if (((lead & 0xf0) == 0xe0)
                 && (i + 2 < nBytes)
                 && (((unsigned char)utf8Str[i + 1] & 0xc0) == 0x80)
                 && (((unsigned char)utf8Str[i + 2] & 0xc0) == 0x80))
        {
            char buf[2];
            const unsigned char *pbuffer;

            Utf8ToUnicode(buf, utf8Str + i);
            debug(" utf8Str[%d]=%x\n" ,i, (unsigned char)utf8Str[i]);
            debug(" utf8Str[%d]=%x\n" ,i+1, (unsigned char)utf8Str[i+1]);
            debug(" utf8Str[%d]=%x\n" ,i+2, (unsigned char)utf8Str[i+2]);
            debug("unicode [0]=%x, [1]=%x\n",
                  (unsigned char)buf[0], (unsigned char)buf[1]);

            /* The lookup takes unsigned bytes; buf is plain char. */
            pbuffer = UnicodeToGBCode((unsigned char *)buf);
            gbStr[j++] = (char)pbuffer[0];
            debug("GB2312[%d]=%x\n", j-1, (unsigned char)gbStr[j-1]);
            gbStr[j++] = (char)pbuffer[1];
            debug("GB2312[%d]=%x\n", j-1, (unsigned char)gbStr[j-1]);

            i += 3;
        }
        else
        {
            debug("ERROR: utf-8 to GB2312, unsupported sequence at %d\n", i);
            gbStr[j++] = '?';

            /* Resynchronise: drop this byte and its continuation bytes. */
            i++;
            while ((i < nBytes)
                   && (((unsigned char)utf8Str[i] & 0xc0) == 0x80))
            {
                i++;
            }
        }
    }
    gbStr[j] = 0;
}

至此，一切OK！！！

阅读(4658) | 评论(1) | 转发(0) |

上一篇：GB2312与UTF-8编码的转换 [1]

下一篇：Mysql 命令小结

给主人留下些什么吧！~~

chinaunix网友2010-12-21 22:27:53

encoding.h在哪

回复 | 举报

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6