字符串编码转换函数 unicode转换成utf8-winterzhang0324-ChinaUnix博客

Winter's BLOG

首页　| 　博文目录　| 　关于我

winterzhang0324

博客访问： 117609
博文数量： 22
博客积分： 2000
博客等级：大尉
技术积分： 290
用户组：普通用户
注册时间： 2007-04-08 20:21

文章分类

全部博文（22）

文章存档

2011年（9）

2009年（1）

2008年（12）

我的朋友

相关博文

字符串编码转换函数 unicode转换成utf8

分类： C/C++

2008-07-31 13:44:48

/* Input code units are forced to 16 bits by this typedef, so the source is
   treated as UTF-16.
   NOTE(review): redefining wchar_t clashes with <stddef.h>/<wchar.h> on
   platforms where the standard type is not unsigned short, and wchar_t is a
   keyword in C++ -- confirm this translation unit is C and never includes
   those headers. */
typedef unsigned short wchar_t;

/* Read one code point from src starting at *pos and advance *pos past it.

   When wchar_t is narrower than 32 bits the input is UTF-16: a high
   surrogate immediately followed by a low surrogate is combined into a
   single code point >= 0x10000; an unpaired surrogate is returned as-is.
   When wchar_t is 32 bits wide, values above 0x10ffff are mapped to
   0xFFFD (REPLACEMENT CHARACTER) and surrogates are passed through.

   The caller must guarantee *pos < srclen. */
static unsigned utf8fromwc_next(const wchar_t* src, unsigned srclen, unsigned* pos) {
  unsigned ucs = src[(*pos)++];
  if (sizeof(wchar_t) < 4) {
    // *pos already indexes the word after the high surrogate.
    if (ucs >= 0xd800 && ucs <= 0xdbff && *pos < srclen &&
        src[*pos] >= 0xdc00 && src[*pos] <= 0xdfff) {
      unsigned low = src[(*pos)++];
      ucs = 0x10000U + ((ucs & 0x3ff) << 10) + (low & 0x3ff);
    }
  } else if (ucs > 0x10ffff) {
    ucs = 0xfffd;
  }
  return ucs;
}

/* Number of UTF-8 bytes needed to encode code point ucs (1 to 4). */
static unsigned utf8fromwc_len(unsigned ucs) {
  if (ucs < 0x80U) return 1;
  if (ucs < 0x800U) return 2;
  if (ucs < 0x10000U) return 3;
  return 4;
}

/*! Turn "wide characters" as returned by some system calls
    (especially on Windows) into UTF-8.

    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value is the number of bytes that would be
    written, not counting the null terminator. If it is greater than or
    equal to \a dstlen the output was truncated (always at a character
    boundary); an array of size n+1, where n is the return value, has
    the space needed for the entire string. If \a dstlen is zero then
    nothing is written (\a dst is not touched) and this call just
    measures the storage space needed.

    \a srclen is the number of words in \a src to convert. With 16-bit
    words this is not necessarily the number of characters, because
    UTF-16 "surrogate pairs" occupy two words.

    With 16-bit words, surrogate pairs are converted to a single
    character and UTF-8 encoded (as 4 bytes). Mismatched halves of
    surrogate pairs are converted as though they are individual
    characters (3 bytes each).

    With 32-bit words, a word greater than 0x10ffff is an illegal
    character according to RFC 3629 and is converted as though it were
    0xFFFD (REPLACEMENT CHARACTER). Words in the range 0xd800 to 0xdfff,
    or ending with 0xfffe or 0xffff, are also illegal according to
    RFC 3629 but are encoded as though they were legal.

    The write pass and the measuring pass share one decoder, so the
    reported size always matches what a large enough buffer receives.
*/
unsigned utf8fromwc(char* dst, unsigned dstlen, const wchar_t* src, unsigned srclen) {
  unsigned i = 0;
  unsigned count = 0;

  if (dstlen) {
    for (;;) {
      unsigned ucs;
      unsigned len;

      // Invariant: count < dstlen, so the terminator always fits.
      if (i >= srclen) {
        dst[count] = 0;
        return count;
      }

      ucs = utf8fromwc_next(src, srclen, &i);
      len = utf8fromwc_len(ucs);

      // Need room for len bytes plus the terminator. Written as a
      // subtraction so the comparison cannot wrap around.
      if (len >= dstlen - count) {
        dst[count] = 0;
        count += len;
        break;
      }

      if (len == 1) {
        dst[count++] = (char)ucs;
      } else if (len == 2) {
        dst[count++] = (char)(0xc0 | (ucs >> 6));
        dst[count++] = (char)(0x80 | (ucs & 0x3F));
      } else if (len == 3) {
        dst[count++] = (char)(0xe0 | (ucs >> 12));
        dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
        dst[count++] = (char)(0x80 | (ucs & 0x3F));
      } else {
        dst[count++] = (char)(0xf0 | (ucs >> 18));
        dst[count++] = (char)(0x80 | ((ucs >> 12) & 0x3F));
        dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
        dst[count++] = (char)(0x80 | (ucs & 0x3F));
      }
    }
  }

  // dst is full (or absent): only measure what remains.
  while (i < srclen) {
    count += utf8fromwc_len(utf8fromwc_next(src, srclen, &i));
  }
  return count;
}

阅读(2770) | 评论(0) | 转发(0) |

上一篇：C语言函数实现字符串替换

下一篇：字符串函数去掉单个特殊字符

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6