/*! Turn "wide characters" as returned by some system calls
(especially on Windows) into UTF-8.
Up to \a dstlen bytes are written to \a dst, including a null
terminator. The return value is the number of bytes that would be
written, not counting the null terminator. If this value n is greater
than or equal to \a dstlen, the output was truncated; a newly allocated
array of size n+1 has the space needed for the entire string. If \a dstlen is zero then
nothing is written and this call just measures the storage space
needed.
\a srclen is the number of words in \a src to convert. On Windows
this is not necessarily the number of characters, because the UTF-16
encoding used there may contain "surrogate pairs".
On Unix wchar_t is 32 bits and each location is a character.
On Unix if a src word is greater than 0x10ffff then this is an
illegal character according to RFC 3629. These are converted as
though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
illegal according to RFC 3629. However I encode these as though
they are legal, so that utf8towc will return the original data.
On Windows "surrogate pairs" are converted to a single character
and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
pairs are converted as though they are individual characters.
*/
/* Local definition of wchar_t as unsigned short; this is the element type of
   the \a src array taken by utf8fromwc().
   NOTE(review): this clashes with the wchar_t declared by <stddef.h> or
   <wchar.h> if either header is included in this translation unit, and it
   does not match the "On Unix wchar_t is 32 bits" statement in the
   utf8fromwc() documentation -- confirm that this typedef is intended. */
typedef unsigned short wchar_t;
/* Fetch the next code point from src, starting at *pos, and advance *pos
   past every word that was consumed.
   On Windows a high surrogate (0xd800..0xdbff) immediately followed by a low
   surrogate (0xdc00..0xdfff) is combined into one code point >= 0x10000;
   an unpaired half is returned unchanged. Values above 0x10ffff are
   replaced by 0xfffd (REPLACEMENT CHARACTER).
   The caller must guarantee *pos < srclen. */
static unsigned utf8fromwc_next(const wchar_t* src, unsigned srclen, unsigned* pos) {
  unsigned ucs = (unsigned)src[(*pos)++];
#ifdef _WIN32
  // *pos now indexes the word right after the one just read, so that is
  // the word to test for a low surrogate.
  if (ucs >= 0xd800U && ucs <= 0xdbffU && *pos < srclen) {
    unsigned low = (unsigned)src[*pos];
    if (low >= 0xdc00U && low <= 0xdfffU) {
      ++*pos;
      return 0x10000U + ((ucs & 0x3ffU) << 10) + (low & 0x3ffU);
    }
  }
#else
  (void)srclen; // only needed for the surrogate look-ahead
#endif
  if (ucs > 0x10ffffU) ucs = 0xfffdU;
  return ucs;
}

/* Number of UTF-8 bytes (1..4) used to encode ucs. */
static unsigned utf8fromwc_len(unsigned ucs) {
  if (ucs < 0x80U) return 1;
  if (ucs < 0x800U) return 2;
  if (ucs < 0x10000U) return 3;
  return 4;
}

/* Convert srclen words of src to UTF-8.
   dst     receives the output; it is not touched when dstlen is 0.
   dstlen  size of dst in bytes, including room for the null terminator.
   src     wide characters to convert.
   srclen  number of words in src.
   Returns the number of bytes the complete conversion needs, not counting
   the null terminator. Only whole UTF-8 sequences are stored: the first
   sequence that does not fit (together with the terminator) ends the
   output, dst is null terminated at that point, and the remaining input
   is only measured. */
unsigned utf8fromwc(char* dst, unsigned dstlen, const wchar_t* src, unsigned srclen) {
  unsigned i = 0;
  unsigned count = 0;
  if (dstlen) for (;;) {
    unsigned ucs;
    unsigned len;
    if (i >= srclen) {dst[count] = 0; return count;}
    ucs = utf8fromwc_next(src, srclen, &i);
    len = utf8fromwc_len(ucs);
    // count < dstlen always holds here, so the terminator fits.
    if (count + len >= dstlen) {dst[count] = 0; count += len; break;}
    switch (len) {
    case 1:
      dst[count++] = (char)ucs;
      break;
    case 2:
      dst[count++] = (char)(0xc0 | (ucs >> 6));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
      break;
    case 3:
      dst[count++] = (char)(0xe0 | (ucs >> 12));
      dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
      break;
    default: // 4 bytes
      dst[count++] = (char)(0xf0 | (ucs >> 18));
      dst[count++] = (char)(0x80 | ((ucs >> 12) & 0x3F));
      dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
      break;
    }
  }
  // dst is full (or dstlen was 0): measure the rest with the same decoder
  // so the reported size always matches what the encoder would write.
  while (i < srclen) {
    count += utf8fromwc_len(utf8fromwc_next(src, srclen, &i));
  }
  return count;
}
/* Blog page footer, not part of the source: Views (2744) | Comments (0) | Shares (0) */