Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1634677
  • 博文数量: 197
  • 博客积分: 10046
  • 博客等级: 上将
  • 技术积分: 1983
  • 用 户 组: 普通用户
  • 注册时间: 2006-08-07 12:36
个人简介

在外企做服务器开发, 目前是项目经理, 管理两个server开发的项目。不做嵌入式好久了。

文章分类
文章存档

2011年(2)

2010年(6)

2009年(18)

2008年(30)

2007年(100)

2006年(41)

分类: LINUX

2007-05-10 14:56:30

看代码之前,先了解一下 Unicode 的基础知识,可以参考我之前写的一篇 blog 文章:
 
 
用mbsrtowcs() 一次性全转化成unicode
代码:
 

#include <stdio.h>
#include <stdlib.h>

#include <locale.h>

#include <string.h>

#include <stdio.h>
#include <locale.h>
#include <wchar.h>
/*
 * Demo: convert a whole multibyte string to wide characters in a single
 * mbsrtowcs() call and print the result.
 *
 * The conversion interprets the bytes of `src` according to the locale
 * selected with setlocale(), so the program stops early if that locale
 * cannot be selected or if the conversion itself fails.
 *
 * Returns 0 on success, 1 on any failure.
 */
int main()
{
    const char *src = "中华人民共和国a";
    const wchar_t *ss = L"中国人";
    const char *cursor;         /* mbsrtowcs() updates this, never `src` itself */
    const char *locp;
    wchar_t dst[100] = {0};
    size_t len;
    size_t ret;
    mbstate_t state;

    len = strlen(src);
    printf("mbs len is %zu\n", len);

    locp = setlocale(LC_ALL, "zh_CN.GB2312");
    if (locp == NULL) {
        /* setlocale() returns NULL when the request cannot be honoured. */
        fprintf(stderr, "setlocale(zh_CN.GB2312) failed\n");
        return 1;
    }
    printf("You have set Locale to %s\n", locp);

    /* With a NULL destination only the number of wide characters is computed. */
    memset(&state, '\0', sizeof(state));
    cursor = src;
    ret = mbsrtowcs(NULL, &cursor, 0, &state);
    if (ret == (size_t)-1) {
        perror("mbsrtowcs");
        return 1;
    }
    printf("ins_len = %zu\n", ret);

    /*
     * Real conversion. The limit is the capacity of dst in wide characters
     * (one slot is kept for the terminator), not the byte length of src.
     * A separate cursor is used because mbsrtowcs() stores NULL through its
     * second argument once the whole string has been converted.
     */
    memset(&state, '\0', sizeof(state));
    cursor = src;
    ret = mbsrtowcs(dst, &cursor, sizeof(dst) / sizeof(dst[0]) - 1, &state);
    if (ret == (size_t)-1) {
        perror("mbsrtowcs");
        return 1;
    }
    dst[ret] = L'\0';
    printf("ret=%zu\n", ret);

    printf("开始打印unicode \n");
    printf("dst[0] = %#x\n", (unsigned int)dst[0]);
    printf("dst[1] = %#x\n", (unsigned int)dst[1]);

    printf("%zu\n", wcslen(dst));
    /*
     * Over ssh, try printing the converted string; on a real console the
     * output may just be garbage.
     */
    printf("%ls\n", dst);
    printf("----------------------\n");
    printf("src=%s\n", src);
    /*
     * stdout became byte-oriented with the first printf(), so wide output
     * functions cannot be used on it afterwards; print wide strings with
     * printf() and %ls instead.
     */
    printf("%ls\n", dst);
    printf("%ls\n", ss);
    return 0;
}

-------------

在 PC 机上用 locale 支持国际化(i18n),用 mbrtowc() 一个字符一个字符地转换。当需要临时处理每个 Unicode 字符时(比如做 endian 转换),适合用这种方式。

 

#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <locale.h>
#include <wchar.h>
#include <errno.h>


/*
 * Demo: convert a multibyte string to wide characters one character at a
 * time with mbrtowc(). Converting character by character leaves room to
 * post-process each wide character (for example an endian swap) before it
 * is stored.
 *
 * Returns 0 on success, -1 on any failure.
 */
int main()
{
    const char *src = "中华人民共和国a";
    const char *locp;
    wchar_t *dest;              /* output buffer, one wchar_t per input byte at most */
    wchar_t wc;
    size_t len;
    size_t cnt;
    size_t i;
    size_t o;
    mbstate_t mbstate;

    len = strlen(src);
    printf("mbs len is %zu\n", len);
    printf("sizeof (wchar_t)=%zu\n", sizeof(wchar_t));

    /*
     * Every wide character consumes at least one input byte, so len + 1
     * elements always suffice. The element size must be sizeof(wchar_t);
     * assuming 2 bytes overflows the buffer where wchar_t is wider.
     */
    dest = malloc((len + 1) * sizeof *dest);
    if (dest == NULL) {
        perror("malloc");
        return -1;
    }

    locp = setlocale(LC_ALL, "zh_CN.GB2312");
    if (locp == NULL) {
        fprintf(stderr, "setlocale(zh_CN.GB2312) failed\n");
        free(dest);
        return -1;
    }
    printf("You have set Locale to %s\n", locp);

    /* Character-by-character conversion with mbrtowc(). */
    memset(&mbstate, '\0', sizeof(mbstate));
    for (i = 0, o = 0; i < len; i += cnt, o++) {
        cnt = mbrtowc(&wc, src + i, len - i, &mbstate);
        if (cnt == (size_t)-1 || cnt == (size_t)-2) {
            /* (size_t)-1: invalid sequence; (size_t)-2: truncated sequence. */
            if (cnt == (size_t)-2)
                errno = EILSEQ;
            perror("mbrtowc");
            free(dest);
            return -1;
        }
        if (cnt == 0)
            break;              /* embedded terminator: nothing more to convert */
        printf("cnt=%zu, o=%zu\n", cnt, o);

        dest[o] = wc;
    }

    /* The conversion must end in the initial shift state. */
    if (!mbsinit(&mbstate)) {
        printf("Eeek. mbstate not in initial state!\n");
        free(dest);
        errno = EILSEQ;
        return -1;
    }
    dest[o] = L'\0';            /* the terminator has to be written explicitly */

    printf("dest length %zu\n", wcslen(dest));
    /*
     * Over ssh, try printing the converted string; on a real console the
     * output may just be garbage.
     */
    printf("%ls\n", dest);
    printf("----------------------\n");
    /*
     * stdout became byte-oriented with the first printf(), so wide output
     * functions cannot be used on it afterwards; use printf() with %ls.
     */
    printf("%ls\n", dest);

    free(dest);
    return 0;
}

 

 

 

 

---

比如 ntfs-3g 里的例子:转成 Unicode 的同时还要做 endian 转换。

/**
 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
 * @ins:    input multibyte string buffer
 * @outs:    on return contains the (allocated) output Unicode string
 * @outs_len:    length of output buffer in Unicode characters
 *
 * Convert the input multibyte string @ins, from the current locale into the
 * corresponding little endian, 2-byte (i.e. 16-bit) Unicode string.
 *
 * If *@outs is NULL, the function allocates the string and the caller is
 * responsible for calling free(*@outs); when finished with it.
 *
 * On success the function returns the number of Unicode characters written to
 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
 * character. If the output string buffer was allocated, *@outs is set to it.
 *
 * On error, -1 is returned, and errno is set to the error code. The following
 * error codes can be expected:
 *    EINVAL        Invalid arguments (e.g. @ins or @outs is NULL).
 *    EILSEQ        The input string cannot be represented as a Unicode
 *            string according to the current locale.
 *    ENAMETOOLONG    Destination buffer is too small for input string.
 *    ENOMEM        Not enough memory to allocate destination buffer.
 *
 * Note: this copy of the function also writes debugging traces (the computed
 * length, each byte count and each converted character) to stdout.
 */

int ntfs_mbstoucs(const char *ins, ntfschar **outs, int outs_len)
{
    ntfschar *ucs;
    const char *s;
    wchar_t wc;
    int i, o, cnt, ins_len, ucs_len, ins_size;
#ifdef HAVE_MBSINIT
    mbstate_t mbstate;
#endif

    if (!ins || !outs) {
        errno = EINVAL;
        return -1;
    }
    ucs = *outs;
    ucs_len = outs_len;
    /* A caller-supplied buffer needs room for at least the terminator. */
    if (ucs && !ucs_len) {
        errno = ENAMETOOLONG;
        return -1;
    }
    /* Determine the size of the multi-byte string in bytes. */
    ins_size = strlen(ins);
    /* Determine the length of the multi-byte string. */
    s = ins;
#if defined(HAVE_MBSINIT)
    memset(&mbstate, 0, sizeof(mbstate));
    ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
#ifdef __CYGWIN32__
    if (!ins_len && *ins) {
        /* Older Cygwin had broken mbsrtowcs() implementation. */
        ins_len = strlen(ins);
    }
#endif
#elif !defined(DJGPP)
    ins_len = mbstowcs(NULL, s, 0);
#else
    /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
    ins_len = strlen(ins);
#endif
    if (ins_len == -1)
        return ins_len;
    /* A length-only pass must neither move s nor leave a shift state. */
#ifdef HAVE_MBSINIT
    if ((s != ins) || !mbsinit(&mbstate)) {
#else
    if (s != ins) {
#endif
        errno = EILSEQ;
        return -1;
    }

    /* Add the NULL terminator. */
    ins_len++;
    printf("ins_len=%d\n",ins_len);
    if (!ucs) {
        ucs_len = ins_len;
        ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
        if (!ucs)
            return -1;
    }
    /* Reset the conversion state before the character-by-character pass. */
#ifdef HAVE_MBSINIT
    memset(&mbstate, 0, sizeof(mbstate));
#else
    mbtowc(NULL, NULL, 0);
#endif
    for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
        /* Reallocate memory if necessary or abort. */
        if (o >= ucs_len) {
            ntfschar *tc;
            if (ucs == *outs) {
                errno = ENAMETOOLONG;
                return -1;
            }
            /*
             * We will never get here but hey, it's only a bit of
             * extra code...
             */

            ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
            tc = (ntfschar*)realloc(ucs, ucs_len);
            if (!tc)
                goto err_out;
            ucs = tc;
            ucs_len /= sizeof(ntfschar);
        }
        /* Convert the multibyte character to a wide character. */
#ifdef HAVE_MBSINIT
        cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
#else
        cnt = mbtowc(&wc, ins + i, ins_size - i);
#endif
        printf("cnt=%d\n",cnt);
        if (!cnt)
            break;
        if (cnt == -1)
            goto err_out;
        if (cnt < -1) {
            ntfs_log_trace("Eeek. cnt = %i\n", cnt);
            errno = EINVAL;
            goto err_out;
        }
        /*
         * Debug trace. wc is a single wide character, not a
         * NUL-terminated wide string, so it is printed with %lc, and
         * only once cnt has shown that the conversion succeeded.
         */
        printf("wc=%lc\n",(wint_t)wc);
        /* Make sure we are not overflowing the NTFS Unicode set. */
        if ((unsigned long)wc >= (unsigned long)(1 <<
                (8 * sizeof(ntfschar)))) {
            errno = EILSEQ;
            goto err_out;
        }
        /* Convert the CPU wide character to a LE Unicode character. */
        ucs[o] = cpu_to_le16(wc);
    }
#ifdef HAVE_MBSINIT
    /* Make sure we are back in the initial state. */
    if (!mbsinit(&mbstate)) {
        ntfs_log_trace("Eeek. mbstate not in initial state!\n");
        errno = EILSEQ;
        goto err_out;
    }
#endif
    /* Now write the NULL character. */
    ucs[o] = cpu_to_le16(L'\0');
    if (*outs != ucs)
        *outs = ucs;
    return o;
err_out:
    /* Only free a buffer allocated here; keep errno across free(). */
    if (ucs != *outs) {
        int eo = errno;
        free(ucs);
        errno = eo;
    }
    return -1;
}

 

--

还可以用 libiconv 来实现,适合 embedded system,同样支持国际化和 Unicode。

 


/*
 * I use the libiconv library instead of the locale mbrtowc() functions.
 *                        --- bob 20061117
 *
 * Earlier testing found that only file names of up to 63 characters worked;
 * longer ones failed with "Argument list too long" (E2BIG), and
 * (63 + 1) * 4 = 256. This was not a libiconv limit: the previous code passed
 * outs_len, an element count, to libiconv() as the output size in BYTES. With
 * a 4-byte wchar_t that leaves room for only outs_len / 4 wide characters
 * (64 for a 256-element buffer, which is presumably what that test used).
 * The byte budget is now derived from the element count.
 */

/**
 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
 * @ins:    input multibyte string, encoded in USER_CHARSET
 * @outs:    address of the output buffer pointer
 * @outs_len:    capacity of *@outs in ntfschar elements (ignored if *@outs is NULL)
 *
 * The input is converted with libiconv from USER_CHARSET to UNICODE_CHARSET
 * into a temporary wchar_t buffer, and each wide character is then stored in
 * the output through cpu_to_le16().
 * NOTE(review): this assumes UNICODE_CHARSET produces host-order wchar_t
 * values; confirm against the definition of that macro.
 *
 * If *@outs is NULL a buffer of strlen(@ins) + 1 elements is allocated and,
 * on success only, stored in *@outs; the caller must free() it. On failure an
 * internally allocated buffer is released and *@outs is left untouched.
 *
 * Debugging traces are written to stdout.
 *
 * Return: the number of characters written, not counting the terminator, or
 * -1 with errno set:
 *    EINVAL        @ins or @outs is NULL, or the conversion is unavailable
 *            or the input ends in an incomplete sequence.
 *    EILSEQ        The input is invalid or does not fit in an ntfschar.
 *    ENAMETOOLONG    The destination buffer is too small.
 *    ENOMEM        Not enough memory (as set by malloc()).
 */
int ntfs_mbstoucs(const char *ins, ntfschar **outs, int outs_len)
{
    ntfschar *ucs;          /* destination: the caller's buffer or our own */
    wchar_t *wbuf = NULL;   /* scratch buffer filled by libiconv() */
    char *in_ptr;
    char *out_ptr;
    size_t in_left;         /* input bytes still to convert */
    size_t out_left;        /* free bytes left in wbuf */
    size_t cap;             /* capacity of ucs and wbuf, in elements */
    size_t nconv;
    size_t written;
    size_t i;
    int saved_errno;
    iconv_t cd;

    /* Validate the arguments before touching them. */
    if (!ins || !outs) {
        errno = EINVAL;
        return -1;
    }
    ucs = *outs;
    /* A caller-supplied buffer needs room for at least the terminator. */
    if (ucs && outs_len <= 0) {
        errno = ENAMETOOLONG;
        return -1;
    }

    /*
     * Every output character consumes at least one input byte, so the byte
     * length of the input is an upper bound for the character count
     * (exact for ASCII, generous for multi-byte text).
     */
    in_left = strlen(ins);
    printf("insize=%zu\n", in_left);
    printf("ins_len = %zu\n", in_left);

    if (ucs) {
        cap = (size_t)outs_len;
    } else {
        cap = in_left + 1;  /* + 1 for the terminating L'\0' */
        ucs = malloc(cap * sizeof(ntfschar));
        if (!ucs)
            return -1;
    }

    printf("sizeof(wchar_t) = %zu\n", sizeof(wchar_t));
    wbuf = malloc(cap * sizeof(wchar_t));
    if (!wbuf) {
        saved_errno = errno;
        printf("Can't malloc wchar \n");
        goto err_free;
    }

    cd = libiconv_open(UNICODE_CHARSET, USER_CHARSET);
    if (cd == (iconv_t)-1) {
        /* Something went wrong. */
        saved_errno = errno;
        if (saved_errno == EINVAL)
            printf ("conversion from '%s' to '%s' not available",USER_CHARSET,UNICODE_CHARSET);
        else
            perror ("iconv_open");

        /* Leave a caller-supplied buffer holding an empty string. */
        if (ucs == *outs)
            ucs[0] = cpu_to_le16(L'\0');
        goto err_free;
    }

    /*
     * libiconv() advances both pointers and decrements both byte counts,
     * which must be size_t objects. The output budget is expressed in bytes
     * and keeps one element free for the terminator.
     */
    in_ptr = (char *)ins;
    out_ptr = (char *)wbuf;
    out_left = (cap - 1) * sizeof(wchar_t);
    printf("before iconv_char address =%p\n", (void *)out_ptr);
    nconv = libiconv(cd, (char **)&in_ptr, &in_left, &out_ptr, &out_left);
    if (nconv == (size_t)-1) {
        saved_errno = errno;
        if (saved_errno == EINVAL)
            perror("mbstoucs: iconv error");
        else
            perror("mbstoucs: iconv error not EINVAL ");
        /* Out of output space: report it as the locale-based variant does. */
        if (saved_errno == E2BIG)
            saved_errno = ENAMETOOLONG;
        goto err_close;
    }

    /* Terminate the scratch string before anything reads it as a string. */
    written = (size_t)(out_ptr - (char *)wbuf) / sizeof(wchar_t);
    wbuf[written] = L'\0';
    printf("after iconv() , iconv_char address =%p\n", (void *)out_ptr);
    printf("tt_char =%ls\n", wbuf);
    printf("chinese:%ls\n", wbuf);

    for (i = 0; i < written; i++) {
        /* Reject values that would be truncated when stored in an ntfschar. */
        if ((unsigned long)wbuf[i] >= (1UL << (8 * sizeof(ntfschar)))) {
            saved_errno = EILSEQ;
            goto err_close;
        }
        /* Assign with endian conversion. */
        ucs[i] = cpu_to_le16(wbuf[i]);
    }
    ucs[written] = cpu_to_le16(L'\0');

    if (libiconv_close (cd) != 0)
        perror ("iconv_close");

    free(wbuf);
    *outs = ucs;
    return (int)written;    /* does not include the terminator */

err_close:
    libiconv_close(cd);
err_free:
    free(wbuf);
    /* Only release an output buffer that was allocated here. */
    if (ucs != *outs)
        free(ucs);
    errno = saved_errno;
    return -1;
}

 

阅读(4898) | 评论(2) | 转发(0) |
给主人留下些什么吧!~~