pc机上使用locale支持国际化的例子1 ：用mbsrtowcs(）一次性全转化成unicode-bob

linux kernel 自由博客 kernelchina.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

bob_zhang2004

博客访问： 1648937
博文数量： 197
博客积分： 10046
博客等级：上将
技术积分： 1983
用户组：普通用户
注册时间： 2006-08-07 12:36

个人简介

在外企做服务器开发，目前是项目经理，管理两个server开发的项目。不做嵌入式好久了。

文章分类

全部博文（197）

项目管理（0）
linux and Unix（1）
Linux kernel/ARM（1）
文件系统（5）
u-boot 移植（8）
数据结构（10）
新人培训（16）

开心故事（1）
linux kernel（95）

ia64 开发（1）

linux kernel 研（16）

linux kernel读码（23）

linux driver（5）

kernel新手培训（15）
应用开发（32）

一些研究（9）

进程间通信（6）

Apache（2）

MySQL（3）
未分配的博文（29）

文章存档

2011年（2）

2010年（6）

2009年（18）

2008年（30）

2007年（100）

2006年（41）

我的朋友

相关博文

pc机上使用locale支持国际化的例子1 ：用mbsrtowcs(）一次性全转化成unicode

分类： LINUX

2007-05-10 14:56:30

看代码之前，最好先了解一下unicode的基础知识，可以参考我之前的一篇blog文章：

http://blog.chinaunix.net/u/22617/showart.php?id=214792

用mbsrtowcs()一次性全转化成unicode

代码：

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

/*
 * Example 1: convert a whole multibyte string to wide characters with a
 * single mbsrtowcs() call.
 *
 * The string literals below are interpreted according to the locale selected
 * with setlocale(), so this source file has to be saved in the same encoding
 * as that locale (GB2312 here).
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the locale is unavailable or the
 * input is not a valid multibyte sequence in that locale.
 */
int main(void)
{
    const char *src = "中华人民共和国a";
    const wchar_t *ss = L"中国人";
    wchar_t dst[100];
    const size_t dst_cap = sizeof dst / sizeof dst[0];
    const char *cursor;     /* mbsrtowcs() advances this, never src itself */
    const char *locp;
    mbstate_t state;
    size_t len;
    size_t ret;

    len = strlen(src);
    printf("mbs len is %zu\n", len);

    locp = setlocale(LC_ALL, "zh_CN.GB2312");
    if (locp == NULL) {
        fprintf(stderr, "setlocale(LC_ALL, \"zh_CN.GB2312\") failed\n");
        return EXIT_FAILURE;
    }
    printf("You have set Locale to %s\n", locp);

    /*
     * With a NULL destination mbsrtowcs() only counts: it returns the number
     * of wide characters in the string (e.g. the number of Chinese
     * characters), not the number of bytes.
     */
    memset(&state, 0, sizeof state);
    cursor = src;
    ret = mbsrtowcs(NULL, &cursor, 0, &state);
    if (ret == (size_t)-1) {
        perror("mbsrtowcs");
        return EXIT_FAILURE;
    }
    printf("ins_len = %zu\n", ret);
    if (ret >= dst_cap) {
        /* No room left for the terminating L'\0'. */
        fprintf(stderr, "input too long for dst\n");
        return EXIT_FAILURE;
    }

    /* The real conversion; start again from a clean shift state. */
    memset(&state, 0, sizeof state);
    cursor = src;
    ret = mbsrtowcs(dst, &cursor, dst_cap, &state);
    if (ret == (size_t)-1) {
        perror("mbsrtowcs");
        return EXIT_FAILURE;
    }
    printf("ret=%zu\n", ret);

    printf("开始打印unicode \n");
    printf("dst[0] = %#lx\n", (unsigned long)dst[0]);
    printf("dst[1] = %#lx\n", (unsigned long)dst[1]);
    printf("%zu\n", wcslen(dst));

    /*
     * Over ssh, try printing the converted string; on a real (console)
     * terminal the result will most likely be garbage.
     */
    printf("%ls\n", dst);
    printf("----------------------\n");
    printf("src=%s\n", src);

    /*
     * stdout became byte-oriented with the first printf(), so wprintf() can
     * no longer be used on it; print the wide strings with "%ls" instead.
     */
    printf("%ls\n", dst);
    printf("%ls\n", ss);
    return EXIT_SUCCESS;
}

－－－－－－－－－－－－－

pc机上用locale支持国际化（i18n）的例子2：用mbrtowc()一个个字符地转换。当你需要临时处理每个unicode字符的时候（比如做endian转换），适合用这种方式。

#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

/*
 * Example 2: convert a multibyte string to wide characters one character at
 * a time with mbrtowc().  Converting character by character is useful when
 * every wide character has to be post-processed individually, for example
 * for an endian conversion.
 *
 * The string literal below is interpreted according to the locale selected
 * with setlocale(), so this source file has to be saved in the same encoding
 * as that locale (GB2312 here).
 *
 * Returns 0 on success and -1 on any failure.
 */
int main(void)
{
    const char *src = "中华人民共和国a";
    const char *locp;
    wchar_t *dest;
    wchar_t wc;
    mbstate_t mbstate;
    size_t len;
    size_t i = 0;   /* byte offset into src */
    size_t o = 0;   /* wide-character index into dest */

    memset(&mbstate, 0, sizeof mbstate);
    len = strlen(src);
    printf("mbs len is %zu\n", len);
    printf("sizeof (wchar_t)=%zu\n", sizeof(wchar_t));

    /*
     * A multibyte string of len bytes holds at most len characters; add one
     * slot for the terminator.  Size the buffer in wchar_t units: wchar_t is
     * not guaranteed to be 2 bytes wide.
     */
    dest = malloc((len + 1) * sizeof *dest);
    if (dest == NULL) {
        perror("malloc");
        return -1;
    }

    locp = setlocale(LC_ALL, "zh_CN.GB2312");
    if (locp == NULL) {
        fprintf(stderr, "setlocale(LC_ALL, \"zh_CN.GB2312\") failed\n");
        free(dest);
        return -1;
    }
    printf("You have set Locale to %s\n", locp);

    /* Convert with mbrtowc(), one character per iteration. */
    while (i < len) {
        size_t cnt = mbrtowc(&wc, src + i, len - i, &mbstate);

        if (cnt == (size_t)-1 || cnt == (size_t)-2) {
            /* Invalid or truncated multibyte sequence. */
            fprintf(stderr, "mbrtowc failed at byte offset %zu\n", i);
            free(dest);
            errno = EILSEQ;
            return -1;
        }
        if (cnt == 0)
            break;      /* embedded NUL: nothing more to convert */

        printf("cnt=%zu, o=%zu\n", cnt, o);
        dest[o++] = wc;
        i += cnt;
    }

    /* The conversion must end in the initial shift state. */
    if (!mbsinit(&mbstate)) {
        printf("Eeek. mbstate not in initial state!\n");
        free(dest);
        errno = EILSEQ;
        return -1;
    }
    dest[o] = L'\0';    /* the terminator has to be written explicitly */

    printf("dest length %zu\n", wcslen(dest));

    /*
     * Over ssh, try printing the converted string; on a real (console)
     * terminal the result will most likely be garbage.
     */
    printf("%ls\n", dest);
    printf("----------------------\n");

    /*
     * stdout became byte-oriented with the first printf(), so wprintf() can
     * no longer be used on it; print the wide string with "%ls" instead.
     */
    printf("%ls\n", dest);

    free(dest);
    return 0;
}

－－－

比如ntfs-3g 上面的例子转成unicode ，同时要endian转换

/**
 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
 * @ins:	input multibyte string buffer
 * @outs:	on return contains the (allocated) output Unicode string
 * @outs_len:	length of output buffer in Unicode characters
 *
 * Convert the input multibyte string @ins, from the current locale into the
 * corresponding little endian, 2-byte (16-bit) Unicode string.
 *
 * If *@outs is NULL, the function allocates the string and the caller is
 * responsible for calling free(*@outs); when finished with it.
 *
 * On success the function returns the number of Unicode characters written to
 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
 * character.  If the output string buffer was allocated, *@outs is set to it.
 *
 * On error, -1 is returned, and errno is set to the error code.  The following
 * error codes can be expected:
 *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
 *	EILSEQ		The input string cannot be represented as a Unicode
 *			string according to the current locale.
 *	ENAMETOOLONG	Destination buffer is too small for input string
 *			(including its terminating NULL character).
 *	ENOMEM		Not enough memory to allocate destination buffer.
 *
 * The printf() calls are debugging output added for this example.
 */
int ntfs_mbstoucs(const char *ins, ntfschar **outs, int outs_len)
{
	ntfschar *ucs;
	const char *s;
	wchar_t wc;
	int i, o, cnt, ins_len, ucs_len, ins_size;
#ifdef HAVE_MBSINIT
	mbstate_t mbstate;
#endif

	if (!ins || !outs) {
		errno = EINVAL;
		return -1;
	}
	ucs = *outs;
	ucs_len = outs_len;
	if (ucs && !ucs_len) {
		errno = ENAMETOOLONG;
		return -1;
	}
	/* Determine the size of the multi-byte string in bytes. */
	ins_size = strlen(ins);
	/* Determine the length of the multi-byte string. */
	s = ins;
#if defined(HAVE_MBSINIT)
	memset(&mbstate, 0, sizeof(mbstate));
	ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
#ifdef __CYGWIN32__
	if (!ins_len && *ins) {
		/* Older Cygwin had broken mbsrtowcs() implementation. */
		ins_len = strlen(ins);
	}
#endif
#elif !defined(DJGPP)
	ins_len = mbstowcs(NULL, s, 0);
#else
	/* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
	ins_len = strlen(ins);
#endif
	if (ins_len == -1)
		return ins_len;
#ifdef HAVE_MBSINIT
	if ((s != ins) || !mbsinit(&mbstate)) {
#else
	if (s != ins) {
#endif
		errno = EILSEQ;
		return -1;
	}
	/* Add the NULL terminator. */
	ins_len++;
	printf("ins_len=%d\n", ins_len);
	if (!ucs) {
		ucs_len = ins_len;
		ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
		if (!ucs)
			return -1;
	}
#ifdef HAVE_MBSINIT
	memset(&mbstate, 0, sizeof(mbstate));
#else
	mbtowc(NULL, NULL, 0);
#endif
	for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
		/* Reallocate memory if necessary or abort. */
		if (o >= ucs_len) {
			ntfschar *tc;

			if (ucs == *outs) {
				errno = ENAMETOOLONG;
				return -1;
			}
			/*
			 * We will never get here but hey, it's only a bit of
			 * extra code...
			 */
			ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
			tc = (ntfschar*)realloc(ucs, ucs_len);
			if (!tc)
				goto err_out;
			ucs = tc;
			ucs_len /= sizeof(ntfschar);
		}
		/* Convert the multibyte character to a wide character. */
#ifdef HAVE_MBSINIT
		cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
#else
		cnt = mbtowc(&wc, ins + i, ins_size - i);
#endif
		printf("cnt=%d\n", cnt);
		if (!cnt)
			break;
		if (cnt == -1)
			goto err_out;
		if (cnt < -1) {
			ntfs_log_trace("Eeek. cnt = %i\n", cnt);
			errno = EINVAL;
			goto err_out;
		}
		/*
		 * Print the character only once the conversion is known to
		 * have succeeded, and as a single wide character: &wc is not
		 * a NUL-terminated wide string, so "%ls" must not be used.
		 */
		printf("wc=%lc\n", (wint_t)wc);
		/* Make sure we are not overflowing the NTFS Unicode set. */
		if ((unsigned long)wc >= (unsigned long)(1 <<
				(8 * sizeof(ntfschar)))) {
			errno = EILSEQ;
			goto err_out;
		}
		/* Convert the CPU wide character to a LE Unicode character. */
		ucs[o] = cpu_to_le16(wc);
	}
#ifdef HAVE_MBSINIT
	/* Make sure we are back in the initial state. */
	if (!mbsinit(&mbstate)) {
		ntfs_log_trace("Eeek. mbstate not in initial state!\n");
		errno = EILSEQ;
		goto err_out;
	}
#endif
	/*
	 * The loop only guarantees room for the characters it stored; make
	 * sure the terminator fits as well before writing it.
	 */
	if (o >= ucs_len) {
		ntfschar *tc;

		if (ucs == *outs) {
			errno = ENAMETOOLONG;
			return -1;
		}
		tc = (ntfschar*)realloc(ucs, (o + 1) * sizeof(ntfschar));
		if (!tc)
			goto err_out;
		ucs = tc;
	}
	/* Now write the NULL character. */
	ucs[o] = cpu_to_le16(L'\0');
	if (*outs != ucs)
		*outs = ucs;
	return o;
err_out:
	if (ucs != *outs) {
		int eo = errno;
		free(ucs);
		errno = eo;
	}
	return -1;
}

－－

还可以用libiconv来实现，适合embedded system，同样支持国际化和unicode。

/*
 * ntfs_mbstoucs - convert a multibyte string to a little endian 16-bit
 * Unicode string, using libiconv instead of the locale's mbrtowc() family.
 * --- bob 20061117
 *
 * @ins:	input string, encoded in USER_CHARSET
 * @outs:	in: caller buffer or NULL; out: the converted string
 * @outs_len:	capacity of *@outs in Unicode characters (ignored when *@outs
 *		is NULL)
 *
 * If *@outs is NULL the output string is allocated with malloc() and stored
 * in *@outs on success; the caller then has to free() it.  On failure *@outs
 * is left unchanged, except that a caller-supplied buffer is set to the empty
 * string when the conversion descriptor cannot be opened.
 *
 * Returns the number of Unicode characters written, not counting the
 * terminating NULL character, or -1 with errno set:
 *	EINVAL		@ins or @outs is NULL, or the input ends with an
 *			incomplete multibyte sequence.
 *	EILSEQ		The input cannot be converted, or a character does not
 *			fit into an ntfschar.
 *	ENAMETOOLONG	The destination buffer is too small.
 *
 * NOTE(review): the intermediate buffer is read back as wchar_t, so this
 * assumes UNICODE_CHARSET yields native-endian code units of
 * sizeof(wchar_t) bytes -- confirm against its definition.
 *
 * The original author observed that names longer than 63 characters failed
 * with "Argument list too long" (E2BIG) and suspected a libiconv limit.  The
 * arithmetic he quotes, (63 + 1) * 4 = 256, matches @outs_len having been
 * handed to iconv as a byte count instead of a character count (presumably
 * with @outs_len == 256); the byte budget is now derived from the character
 * capacity.
 *
 * The printf() calls are debugging output kept from the original example.
 */
int ntfs_mbstoucs(const char *ins, ntfschar **outs, int outs_len)
{
	ntfschar *unicode_ntfschar;	/* final little endian output */
	wchar_t *unicode_wchar = NULL;	/* intermediate iconv output */
	const char *in_ptr;
	char *out_ptr;
	size_t insize, in_left, out_left, nconv;
	int ucs_len;			/* capacity of both buffers, in chars */
	int ins_len;
	int written_unicode;
	int cd_open = 0;
	int saved_errno;
	int i;
	iconv_t cd;

	/* Validate the arguments before touching them. */
	if (!ins || !outs) {
		errno = EINVAL;
		return -1;
	}
	/* A caller buffer without capacity cannot even hold the terminator. */
	if (*outs && outs_len <= 0) {
		errno = ENAMETOOLONG;
		return -1;
	}

	insize = strlen(ins);
	printf("insize=%zu\n", insize);
	/*
	 * The number of characters is not known without consulting the locale
	 * (mbsrtowcs(NULL, ...) would give it).  Guessing it from the byte
	 * count only works for one particular encoding, so use the byte count
	 * itself: it is always an upper bound on the number of characters.
	 */
	ins_len = (int)insize;
	printf("ins_len = %d\n", ins_len);
	printf("ins_len = %d\n", ins_len);
	ins_len++;	/* room for the terminating L'\0' */

	unicode_ntfschar = *outs;
	ucs_len = outs_len;
	if (!unicode_ntfschar) {
		/* No caller buffer: size our own from the input length. */
		ucs_len = ins_len;
		unicode_ntfschar = malloc((size_t)ucs_len * sizeof(ntfschar));
		if (unicode_ntfschar == NULL)
			return -1;
	}

	printf("sizeof(wchar_t) = %zu\n", sizeof(wchar_t));
	unicode_wchar = malloc((size_t)ucs_len * sizeof(wchar_t));
	if (unicode_wchar == NULL) {
		printf("Can't malloc wchar \n");
		goto err_out;
	}

	/* USER_CHARSET (e.g. Chinese) to Unicode. */
	cd = libiconv_open(UNICODE_CHARSET, USER_CHARSET);
	if (cd == (iconv_t) -1) {
		saved_errno = errno;
		/* Something went wrong. */
		if (saved_errno == EINVAL)
			printf ("conversion from '%s' to '%s' not available",USER_CHARSET,UNICODE_CHARSET);
		else
			perror ("iconv_open");
		/* Terminate the caller's output string. */
		if (*outs)
			*(*outs) = cpu_to_le16(L'\0');
		errno = saved_errno;
		goto err_out;
	}
	cd_open = 1;

	in_ptr = ins;
	in_left = insize;
	out_ptr = (char *)unicode_wchar;
	/* Byte budget for iconv; one wchar_t is held back for the terminator. */
	out_left = (size_t)(ucs_len - 1) * sizeof(wchar_t);

	printf("before iconv_char address =%p\n", (void *)out_ptr);
	nconv = libiconv(cd, (char **)&in_ptr, &in_left, &out_ptr, &out_left);
	if (nconv == (size_t) -1) {
		saved_errno = errno;
		if (saved_errno == EINVAL)
			perror("mbstoucs: iconv error");
		else
			perror("mbstoucs: iconv error not EINVAL ");
		/* iconv reports a full output buffer as E2BIG. */
		errno = (saved_errno == E2BIG) ? ENAMETOOLONG : saved_errno;
		goto err_out;
	}
	printf("after iconv() , iconv_char address =%p\n", (void *)out_ptr);

	/* Terminate the intermediate string before it is printed as "%ls". */
	written_unicode = (int)((wchar_t *)out_ptr - unicode_wchar);
	unicode_wchar[written_unicode] = L'\0';
	printf("tt_char =%ls\n", unicode_wchar);
	printf("chinese:%ls\n", unicode_wchar);

	for (i = 0; i < written_unicode; i++) {
		/* Make sure we are not overflowing the NTFS Unicode set. */
		if ((unsigned long)unicode_wchar[i] >=
				(1UL << (8 * sizeof(ntfschar)))) {
			errno = EILSEQ;
			goto err_out;
		}
		/* Assign with endian conversion. */
		unicode_ntfschar[i] = cpu_to_le16(unicode_wchar[i]);
	}
	unicode_ntfschar[i] = cpu_to_le16(L'\0');

	if (libiconv_close (cd) != 0)
		perror ("iconv_close");
	free(unicode_wchar);

	*outs = unicode_ntfschar;
	return written_unicode;		/* does not include the L'\0' */

err_out:
	saved_errno = errno;
	if (cd_open)
		libiconv_close(cd);
	free(unicode_wchar);
	/* Only release the output buffer if it was allocated here. */
	if (unicode_ntfschar != *outs)
		free(unicode_ntfschar);
	errno = saved_errno;
	return -1;
}

阅读(5026) | 评论(2) | 转发(0) |

上一篇：【原创】关于对齐的一篇好文章，以及对他的一个结论的商榷和纠正

下一篇：[转载] Linux MTD 源代码分析太好了！自己先学习下先

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6