Chinaunix首页 | 论坛 | 博客
  • 博客访问: 116578
  • 博文数量: 22
  • 博客积分: 2000
  • 博客等级: 大尉
  • 技术积分: 290
  • 用 户 组: 普通用户
  • 注册时间: 2007-04-08 20:21
文章分类
文章存档

2011年(9)

2009年(1)

2008年(12)

我的朋友

分类: C/C++

2008-07-31 13:44:48

/*! Turn "wide characters" as returned by some system calls
    (especially on Windows) into UTF-8.
 
    Up to \a dstlen bytes are written to \a dst, including a null
    terminator. The return value n is the number of bytes that would be
    written for the whole string, not counting the null terminator. If
    n is greater than or equal to \a dstlen the output was truncated;
    an array of size n+1 has the space needed for the entire string.
    If \a dstlen is zero then nothing is written and this call just
    measures the storage space needed.
 
    \a srclen is the number of words in \a src to convert. On Windows
    this is not necessarily the number of characters, because the
    UTF-16 encoding used there may contain "surrogate pairs".
    On Unix wchar_t is 32 bits and each location is a character.
 
    On Unix if a src word is greater than 0x10ffff then this is an
    illegal character according to RFC 3629. These are converted as
    though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
    range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
    illegal according to RFC 3629. However I encode these as though
    they are legal, so that utf8towc will return the original data.
 
    On Windows "surrogate pairs" are converted to a single character
    and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
    pairs are converted as though they are individual characters.
*/
 
/* Word type of the \a src array taken by utf8fromwc(): an unsigned short.
   NOTE(review): this overrides wchar_t on every platform, although the
   comment above states that wchar_t is 32 bits on Unix, and it would
   presumably clash with any standard header that also declares wchar_t.
   Confirm this is intended. */
typedef unsigned short int wchar_t;
 
// Number of UTF-8 bytes (1 to 4) needed to encode the code point ucs.
static unsigned utf8fromwc_length(unsigned ucs) {
  if (ucs < 0x80U) return 1;
  if (ucs < 0x800U) return 2;
  if (ucs < 0x10000U) return 3;
  return 4;
}

// Read the code point that starts at src[*pos] and advance *pos past it.
// Both the writing and the measuring path go through this helper, so they
// always agree on how many words a character consumes.
static unsigned utf8fromwc_next(const wchar_t* src, unsigned srclen, unsigned* pos) {
  unsigned ucs = src[(*pos)++];
#ifdef _WIN32
  // *pos now indexes the word after ucs: a high surrogate directly
  // followed by a low surrogate is merged into one code point. A
  // mismatched half is returned unchanged and encoded on its own.
  if (ucs >= 0xd800 && ucs <= 0xdbff && *pos < srclen &&
      src[*pos] >= 0xdc00 && src[*pos] <= 0xdfff) {
    unsigned ucs2 = src[(*pos)++];
    ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
  }
#else
  (void)srclen;
  // values above 0x10ffff are replaced by U+FFFD (REPLACEMENT CHARACTER)
  if (ucs > 0x10ffff) ucs = 0xfffd;
#endif
  return ucs;
}

// Store the utf8fromwc_length(ucs) bytes that encode ucs starting at out.
static void utf8fromwc_put(char* out, unsigned ucs) {
  if (ucs < 0x80U) {
    out[0] = (char)ucs;
  } else if (ucs < 0x800U) { // 2 bytes
    out[0] = (char)(0xc0 | (ucs >> 6));
    out[1] = (char)(0x80 | (ucs & 0x3F));
  } else if (ucs < 0x10000U) { // 3 bytes
    out[0] = (char)(0xe0 | (ucs >> 12));
    out[1] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    out[2] = (char)(0x80 | (ucs & 0x3F));
  } else { // 4 bytes
    out[0] = (char)(0xf0 | (ucs >> 18));
    out[1] = (char)(0x80 | ((ucs >> 12) & 0x3F));
    out[2] = (char)(0x80 | ((ucs >> 6) & 0x3F));
    out[3] = (char)(0x80 | (ucs & 0x3F));
  }
}

/*! Convert \a srclen words of \a src to UTF-8.

    At most \a dstlen bytes, including the null terminator, are stored
    in \a dst. A character is only stored when all of its bytes plus the
    terminator fit; once one does not fit, \a dst is terminated and the
    remaining input is only measured. If \a dstlen is zero \a dst is
    never touched.

    Returns the number of bytes the complete conversion needs, not
    counting the null terminator. A result greater than or equal to
    \a dstlen means the output was truncated.
*/
unsigned utf8fromwc(char* dst, unsigned dstlen, const wchar_t* src, unsigned srclen) {
  unsigned i = 0;
  unsigned count = 0;
  int writing = dstlen != 0;
  while (i < srclen) {
    unsigned ucs = utf8fromwc_next(src, srclen, &i);
    unsigned len = utf8fromwc_length(ucs);
    if (writing) {
      // count < dstlen holds while writing, so the subtraction cannot wrap
      if (dstlen - count <= len) {
        // no room for this character and the terminator: stop writing
        dst[count] = 0;
        writing = 0;
      } else {
        utf8fromwc_put(dst + count, ucs);
      }
    }
    count += len;
  }
  if (writing) dst[count] = 0;
  return count;
}
阅读(2744) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~