Main Page | Modules | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages
csuctransform.h
Go to the documentation of this file.00001 /* 00002 Copyright (C) 2003 by Frank Richter 00003 00004 This library is free software; you can redistribute it and/or 00005 modify it under the terms of the GNU Library General Public 00006 License as published by the Free Software Foundation; either 00007 version 2 of the License, or (at your option) any later version. 00008 00009 This library is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00012 Library General Public License for more details. 00013 00014 You should have received a copy of the GNU Library General Public 00015 License along with this library; if not, write to the Free 00016 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 00017 */ 00018 00019 #ifndef __CS_CSUCTRANSFORM_H__ 00020 #define __CS_CSUCTRANSFORM_H__ 00021 00022 #include "csunicode.h" 00023 00035 #define CS_UC_MAX_UTF8_ENCODED 4 /* 6 to encode 32 bit */ 00036 00040 #define CS_UC_MAX_UTF16_ENCODED 2 00041 00045 #define CS_UC_MAX_UTF32_ENCODED 1 00046 00050 #define CS_UC_MAX_MAPPED 3 00051 00055 enum 00056 { 00062 csUcMapSimple = (1 << 0) 00063 }; 00064 00068 class csUnicodeTransform 00069 { 00070 public: 00071 #define FAIL(ret) \ 00072 { \ 00073 if (isValid) *isValid = false; \ 00074 ch = CS_UC_CHAR_REPLACER; \ 00075 return ret; \ 00076 } 00077 00078 #define SUCCEED \ 00079 if (isValid) *isValid = true; \ 00080 return chUsed; 00081 00082 #define GET_NEXT(next) \ 00083 if ((size_t)chUsed == strlen) \ 00084 { \ 00085 FAIL(chUsed); \ 00086 } \ 00087 next = *str++; \ 00088 if (next == 0) \ 00089 { \ 00090 FAIL(chUsed); \ 00091 } \ 00092 chUsed++; 00093 00112 inline static int UTF8Decode (const utf8_char* str, size_t strlen, 00113 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00114 { 00115 if (str == 0) 00116 { 00117 FAIL(0); 00118 } 00119 int chUsed = 0; 00120 00121 utf8_char curCh; 00122 GET_NEXT(curCh); 00123 if ((curCh & 0x80) == 0) 00124 { 00125 // easy case 00126 ch = curCh; 00127 SUCCEED; 00128 } 00129 else 00130 { 00131 // Count with how many bytes this char is encoded. 00132 int n = 0; 00133 while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; } 00134 00135 if ((n < 2) || (n > 6)) 00136 { 00137 // Invalid code: first char of a "sequence" must have 00138 // at least two and at most six MSBs set 00139 FAIL(1); 00140 } 00141 00142 ch = (curCh & ((1 << (8 - n)) - 1)); 00143 00144 for (int i = 1; i < n; i++) 00145 { 00146 GET_NEXT(curCh); 00147 if ((curCh & 0xc0) != 0x80) 00148 { 00149 FAIL(chUsed); 00150 } 00151 else 00152 { 00153 ch <<= 6; 00154 ch |= (curCh & 0x3f); 00155 } 00156 } 00157 00158 // Check if in Unicode range. 00159 if (ch > CS_UC_LAST_CHAR) 00160 { 00161 FAIL(chUsed); 00162 } 00163 00164 // Check for "overlong" codes. 00165 if ((ch < 0x80) && (n > 0)) 00166 { 00167 FAIL(chUsed); 00168 } 00169 else if ((ch < 0x800) && (n > 2)) 00170 { 00171 FAIL(chUsed); 00172 } 00173 else if ((ch < 0x10000) && (n > 3)) 00174 { 00175 FAIL(chUsed); 00176 } 00177 else if ((ch < 0x200000) && (n > 4)) 00178 { 00179 FAIL(chUsed); 00180 } 00181 /* 00182 else if ((ch < 0x4000000) && (n > 5)) 00183 { 00184 FAIL(chUsed); 00185 } 00186 else if ((ch < 0x80000000) && (n > 6)) 00187 { 00188 FAIL(chUsed); 00189 } 00190 */ 00191 00192 if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00193 || CS_UC_IS_SURROGATE(ch))) 00194 FAIL(chUsed); 00195 SUCCEED; 00196 } 00197 } 00198 00203 inline static int UTF16Decode (const utf16_char* str, size_t strlen, 00204 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00205 { 00206 if (str == 0) 00207 { 00208 FAIL(0); 00209 } 00210 int chUsed = 0; 00211 00212 utf16_char curCh; 00213 GET_NEXT(curCh); 00214 // Decode surrogate 00215 if (CS_UC_IS_SURROGATE (curCh)) 00216 { 00217 // Invalid code 00218 if (!CS_UC_IS_HIGH_SURROGATE (curCh)) 00219 { 00220 FAIL(chUsed); 00221 } 00222 ch = 0x10000 + ((curCh & 0x03ff) << 10); 00223 GET_NEXT(curCh); 00224 // Invalid code 00225 if (!CS_UC_IS_LOW_SURROGATE (curCh)) 00226 { 00227 // Fail with 1 so the char is handled upon the next Decode. 00228 FAIL(1); 00229 } 00230 ch |= (curCh & 0x3ff); 00231 } 00232 else 00233 { 00234 ch = curCh; 00235 } 00236 if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00237 || CS_UC_IS_SURROGATE(ch))) 00238 FAIL(chUsed); 00239 SUCCEED; 00240 } 00241 00246 inline static int UTF32Decode (const utf32_char* str, size_t strlen, 00247 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00248 { 00249 if (str == 0) 00250 { 00251 FAIL(0); 00252 } 00253 int chUsed = 0; 00254 00255 GET_NEXT(ch); 00256 if ((!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00257 || CS_UC_IS_SURROGATE(ch))) || (ch > CS_UC_LAST_CHAR)) 00258 FAIL(chUsed); 00259 SUCCEED; 00260 } 00261 00266 inline static int Decode (const utf8_char* str, size_t strlen, 00267 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00268 { 00269 return UTF8Decode (str, strlen, ch, isValid, returnNonChar); 00270 } 00275 inline static int Decode (const utf16_char* str, size_t strlen, 00276 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00277 { 00278 return UTF16Decode (str, strlen, ch, isValid, returnNonChar); 00279 } 00284 inline static int Decode (const utf32_char* str, size_t strlen, 00285 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00286 { 00287 return UTF32Decode (str, strlen, ch, isValid, returnNonChar); 00288 } 00289 00291 #undef FAIL 00292 #undef SUCCEED 00293 #undef GET_NEXT 00294 00297 #define _OUTPUT_CHAR(buf, chr) \ 00298 if (bufRemaining > 0) \ 00299 { \ 00300 if(buf) *buf++ = chr; \ 00301 bufRemaining--; \ 00302 } \ 00303 encodedLen++; 00304 00305 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr) 00306 00320 inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf, 00321 size_t bufsize, bool allowNonchars = false) 00322 { 00323 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00324 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00325 return 0; 00326 size_t bufRemaining = bufsize; 00327 int encodedLen = 0; 00328 00329 if (ch < 0x80) 00330 { 00331 OUTPUT_CHAR ((utf8_char)ch); 00332 } 00333 else if (ch < 0x800) 00334 { 00335 OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6))); 00336 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00337 } 00338 else if (ch < 0x10000) 00339 { 00340 OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12))); 00341 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00342 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00343 } 00344 else if (ch < 0x200000) 00345 { 00346 OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18))); 00347 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00348 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00349 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00350 } 00351 /* 00352 else if (ch < 0x4000000) 00353 { 00354 OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24))); 00355 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00356 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00357 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00358 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00359 } 00360 else if (ch < 0x80000000) 00361 { 00362 OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30))); 00363 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f))); 00364 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00365 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00366 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00367 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00368 } 00369 */ 00370 return encodedLen; 00371 } 00372 00377 inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf, 00378 size_t bufsize, bool allowNonchars = false) 00379 { 00380 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00381 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00382 return 0; 00383 size_t bufRemaining = bufsize; 00384 int encodedLen = 0; 00385 00386 if (ch < 0x10000) 00387 { 00388 OUTPUT_CHAR((utf16_char)ch); 00389 } 00390 else if (ch < 0x100000) 00391 { 00392 utf32_char ch_shifted = ch - 0x10000; 00393 OUTPUT_CHAR((utf16_char)((ch_shifted >> 10) 00394 | CS_UC_CHAR_HIGH_SURROGATE_FIRST)); 00395 OUTPUT_CHAR((utf16_char)((ch_shifted & 0x3ff) 00396 | CS_UC_CHAR_LOW_SURROGATE_FIRST)); 00397 } 00398 else 00399 return 0; 00400 00401 return encodedLen; 00402 } 00403 00408 inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf, 00409 size_t bufsize, bool allowNonchars = false) 00410 { 00411 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00412 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00413 return 0; 00414 size_t bufRemaining = bufsize; 00415 int encodedLen = 0; 00416 00417 OUTPUT_CHAR(ch); 00418 00419 return encodedLen; 00420 } 00421 00426 inline static int Encode (const utf32_char ch, utf8_char* buf, 00427 size_t bufsize, bool allowNonchars = false) 00428 { 00429 return EncodeUTF8 (ch, buf, bufsize, allowNonchars); 00430 } 00435 inline static int Encode (const utf32_char ch, utf16_char* buf, 00436 size_t bufsize, bool allowNonchars = false) 00437 { 00438 return EncodeUTF16 (ch, buf, bufsize, allowNonchars); 00439 } 00444 inline static int Encode (const utf32_char ch, utf32_char* buf, 00445 size_t bufsize, bool allowNonchars = false) 00446 { 00447 return EncodeUTF32 (ch, buf, bufsize, allowNonchars); 00448 } 00450 #undef OUTPUT_CHAR 00451 00454 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr) 00455 00456 #define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder) \ 00457 inline static size_t funcName (toType* dest, size_t destSize, \ 00458 const fromType* source, size_t srcSize = (size_t)-1) \ 00459 { \ 00460 if ((srcSize == 0) || (source == 0)) \ 00461 return 0; \ 00462 \ 00463 size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0; \ 00464 size_t encodedLen = 0; \ 00465 \ 00466 size_t srcChars = srcSize; \ 00467 \ 00468 if (srcSize == (size_t)-1) \ 00469 { \ 00470 srcChars = 0; \ 00471 const fromType* sptr = source; \ 00472 while (*sptr++ != 0) srcChars++; \ 00473 } \ 00474 \ 00475 while (srcChars > 0) \ 00476 { \ 00477 utf32_char ch; \ 00478 int scnt = decoder (source, srcChars, ch, 0); \ 00479 if (scnt == 0) break; \ 00480 int dcnt = encoder (ch, dest, bufRemaining); \ 00481 if (dcnt == 0) \ 00482 { \ 00483 dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining); \ 00484 } \ 00485 \ 00486 if ((size_t)dcnt >= bufRemaining) \ 00487 { \ 00488 if (dest && (destSize > 0)) dest += bufRemaining; \ 00489 bufRemaining = 0; \ 00490 } \ 00491 else \ 00492 { \ 00493 bufRemaining -= dcnt; \ 00494 if (dest && (destSize > 0)) dest += dcnt; \ 00495 } \ 00496 encodedLen += dcnt; \ 00497 if ((size_t)scnt >= srcChars) break; \ 00498 srcChars -= scnt; \ 00499 source += scnt; \ 00500 } \ 00501 \ 00502 if (dest) *dest = 0; \ 00503 \ 00504 return encodedLen + 1; \ 00505 } 00506 00522 UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16); 00527 UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32); 00528 00533 UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8); 00538 UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32); 00539 00544 UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8); 00549 UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16); 00552 #undef UCTF_CONVERTER 00553 #undef OUTPUT_CHAR 00554 #undef _OUTPUT_CHAR 00555 00556 #if (CS_WCHAR_T_SIZE == 1) 00557 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00558 const utf8_char* source, size_t srcSize) 00559 { 00560 size_t srcChars = srcSize; 00561 if (srcSize == (size_t)-1) 00562 { 00563 srcChars = 0; 00564 const utf8_char* sptr = source; 00565 while (*sptr++ != 0) srcChars++; 00566 } 00567 if ((dest != 0) && (destSize != 0)) 00568 { 00569 size_t len = MIN (destSize - 1, srcChars); 00570 memcpy (dest, source, size * sizeof (wchar_t)); 00571 *(dest + len) = 0; 00572 } 00573 return srcChars + 1; 00574 }; 00575 00576 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00577 const utf16_char* source, size_t srcSize) 00578 { 00579 return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize); 00580 }; 00581 00582 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00583 const utf32_char* source, size_t srcSize) 00584 { 00585 return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize); 00586 }; 00587 00588 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00589 const wchar_t* source, size_t srcSize) 00590 { 00591 size_t srcChars = srcSize; 00592 if (srcSize == (size_t)-1) 00593 { 00594 srcChars = 0; 00595 const wchar_t* sptr = source; 00596 while (*sptr++ != 0) srcChars++; 00597 } 00598 if ((dest != 0) && (destSize != 0)) 00599 { 00600 size_t len = MIN (destSize - 1, srcChars); 00601 memcpy (dest, source, len * sizeof (wchar_t)); 00602 *(dest + len) = 0; 00603 } 00604 return srcChars + 1; 00605 }; 00606 00607 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00608 const wchar_t* source, size_t srcSize) 00609 { 00610 return UTF8to16 (dest, destSize, source, srcSize); 00611 }; 00612 00613 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00614 const wchar_t* source, size_t srcSize) 00615 { 00616 return UTF8to32 (dest, destSize, source, srcSize); 00617 }; 00618 00619 inline static int Decode (const wchar_t* str, size_t strlen, 00620 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00621 { 00622 return UTF8Decode ((utf8_char*)str, strlen, ch, isValid, returnNonChar); 00623 } 00624 inline static int Encode (const utf32_char ch, wchar_t* buf, 00625 size_t bufsize, bool allowNonchars = false) 00626 { 00627 return EncodeUTF8 (ch, (utf8_char*)buf, bufsize, allowNonchars); 00628 } 00629 #elif (CS_WCHAR_T_SIZE == 2) 00630 // Methods below for doxygen documentation are here as the size '2' is 00631 // default. 00632 00639 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00640 const utf8_char* source, size_t srcSize) 00641 { 00642 return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize); 00643 }; 00644 00649 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00650 const utf16_char* source, size_t srcSize) 00651 { 00652 size_t srcChars = srcSize; 00653 if (srcSize == (size_t)-1) 00654 { 00655 srcChars = 0; 00656 const utf16_char* sptr = source; 00657 while (*sptr++ != 0) srcChars++; 00658 } 00659 if ((dest != 0) && (destSize != 0)) 00660 { 00661 size_t len = MIN (destSize - 1, srcChars); 00662 memcpy (dest, source, len * sizeof (wchar_t)); 00663 *(dest + len) = 0; 00664 } 00665 return srcChars + 1; 00666 }; 00667 00672 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00673 const utf32_char* source, size_t srcSize) 00674 { 00675 return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize); 00676 }; 00677 00682 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00683 const wchar_t* source, size_t srcSize) 00684 { 00685 return UTF16to8 (dest, destSize, (utf16_char*)source, srcSize); 00686 }; 00687 00692 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00693 const wchar_t* source, size_t srcSize) 00694 { 00695 size_t srcChars = srcSize; 00696 if (srcSize == (size_t)-1) 00697 { 00698 srcChars = 0; 00699 const wchar_t* sptr = source; 00700 while (*sptr++ != 0) srcChars++; 00701 } 00702 if ((dest != 0) && (destSize != 0)) 00703 { 00704 size_t len = MIN (destSize - 1, srcChars); 00705 memcpy (dest, source, len * sizeof (wchar_t)); 00706 *(dest + len) = 0; 00707 } 00708 return srcChars + 1; 00709 }; 00710 00715 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00716 const wchar_t* source, size_t srcSize) 00717 { 00718 return UTF16to32 (dest, destSize, (utf16_char*)source, srcSize); 00719 }; 00720 00721 /* Decode()/Encode() overloads for wchar_t. 00722 * - On VC7+, wchar_t may be an unsigned short or the special type __wchar_t. 00723 * - On VC6 wchar_t is always an unsigned short. __wchar_t does not exist. 00724 * Now there may be conflicts with the utf16_char overloads if wchar_t is 00725 * an unsigned short. On the other hand, we would like to support VC7+'s 00726 * built-in wchar_t as well. 00727 * So: on VC7+, provide overloads for __wchar_t, on VC6, don't compile this 00728 * code at all, on other compilers, provide overloads for wchar_t instead 00729 * (by re#definining __wchar_t). 00730 */ 00731 #if !defined(CS_COMPILER_MSVC) || (_MSC_VER > 1300) 00732 #if !defined(CS_COMPILER_MSVC) 00733 #define __wchar_t wchar_t 00734 #endif 00735 00739 inline static int Decode (const __wchar_t* str, size_t strlen, 00740 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00741 { 00742 return UTF16Decode ((utf16_char*)str, strlen, ch, isValid, returnNonChar); 00743 } 00748 inline static int Encode (const utf32_char ch, __wchar_t* buf, 00749 size_t bufsize, bool allowNonchars = false) 00750 { 00751 return EncodeUTF16 (ch, (utf16_char*)buf, bufsize, allowNonchars); 00752 } 00753 #ifdef __wchar_t 00754 #undef __wchar_t 00755 #endif 00756 #endif 00757 00758 #elif (CS_WCHAR_T_SIZE == 4) 00759 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00760 const utf8_char* source, size_t srcSize) 00761 { 00762 return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize); 00763 }; 00764 00765 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00766 const utf16_char* source, size_t srcSize) 00767 { 00768 return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize); 00769 }; 00770 00771 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00772 const utf32_char* source, size_t srcSize) 00773 { 00774 size_t srcChars = srcSize; 00775 if (srcSize == (size_t)-1) 00776 { 00777 srcChars = 0; 00778 const utf32_char* sptr = source; 00779 while (*sptr++ != 0) srcChars++; 00780 } 00781 if ((dest != 0) && (destSize != 0)) 00782 { 00783 size_t len = MIN (destSize - 1, srcChars); 00784 memcpy (dest, source, len * sizeof (wchar_t)); 00785 *(dest + len) = 0; 00786 } 00787 return srcChars + 1; 00788 }; 00789 00790 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00791 const wchar_t* source, size_t srcSize) 00792 { 00793 return UTF32to8 (dest, destSize, (utf32_char*)source, srcSize); 00794 }; 00795 00796 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00797 const wchar_t* source, size_t srcSize) 00798 { 00799 return UTF32to16 (dest, destSize, (utf32_char*)source, srcSize); 00800 }; 00801 00802 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00803 const wchar_t* source, size_t srcSize) 00804 { 00805 size_t srcChars = srcSize; 00806 if (srcSize == (size_t)-1) 00807 { 00808 srcChars = 0; 00809 const wchar_t* sptr = source; 00810 while (*sptr++ != 0) srcChars++; 00811 } 00812 if ((dest != 0) && (destSize != 0)) 00813 { 00814 size_t len = MIN (destSize - 1, srcChars); 00815 memcpy (dest, source, len * sizeof (wchar_t)); 00816 *(dest + len) = 0; 00817 } 00818 return srcChars + 1; 00819 }; 00820 00821 inline static int Decode (const wchar_t* str, size_t strlen, 00822 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00823 { 00824 return UTF32Decode ((utf32_char*)str, strlen, ch, isValid, returnNonChar); 00825 } 00826 inline static int Encode (const utf32_char ch, wchar_t* buf, 00827 size_t bufsize, bool allowNonchars = false) 00828 { 00829 return EncodeUTF32 (ch, (utf32_char*)buf, bufsize, allowNonchars); 00830 } 00831 #else 00832 #error Odd-sized, unsupported wchar_t! 00833 #endif 00834 00847 inline static int UTF8Skip (const utf8_char* str, size_t maxSkip) 00848 { 00849 if (maxSkip < 1) return 0; 00850 00851 if ((*str & 0x80) == 0) 00852 { 00853 return 1; 00854 } 00855 else 00856 { 00857 int n = 0; 00858 while ((n < 7) && ((*str & (1 << (7 - n))) != 0)) { n++; } 00859 00860 if ((n < 2) || (n > 6)) 00861 { 00862 return 1; 00863 } 00864 00865 int skip = 1; 00866 00867 for (; skip < n; skip++) 00868 { 00869 if (((str[skip] & 0xc0) != 0x80) || ((size_t)skip > maxSkip)) 00870 { 00871 break; 00872 } 00873 } 00874 return skip; 00875 } 00876 } 00877 00888 inline static int UTF8Rewind (const utf8_char* str, size_t maxRew) 00889 { 00890 if (maxRew < 1) return 0; 00891 00892 const utf8_char* pos = str - 1; 00893 00894 if ((*pos & 0x80) == 0) 00895 { 00896 return 1; 00897 } 00898 00899 // Skip backward to the first byte of the sequence. 00900 int skip = 1; 00901 while (((*pos & 0xc0) == 0x80) && ((size_t)skip < maxRew)) 00902 { 00903 skip++; 00904 pos--; 00905 } 00906 00907 return skip; 00908 } 00909 00915 inline static int UTF16Skip (const utf16_char* str, size_t maxSkip) 00916 { 00917 if (CS_UC_IS_HIGH_SURROGATE (*str)) 00918 return (int)(MIN(maxSkip, 2)); 00919 else 00920 return (int)(MIN(maxSkip, 1)); 00921 } 00922 00928 inline static int UTF16Rewind (const utf16_char* str, size_t maxRew) 00929 { 00930 if (maxRew < 1) return 0; 00931 00932 const utf16_char* pos = str - 1; 00933 if (!CS_UC_IS_SURROGATE(*pos)) 00934 return 1; 00935 else 00936 { 00937 if ((maxRew > 1) && (CS_UC_IS_HIGH_SURROGATE(*(pos - 1)))) 00938 return 2; 00939 else 00940 return 1; 00941 } 00942 } 00943 00949 inline static int UTF32Skip (const utf32_char* str, size_t maxSkip) 00950 { 00951 return (int)(MIN(maxSkip, 1)); 00952 } 00953 00959 inline static int UTF32Rewind (const utf32_char* str, size_t maxRew) 00960 { 00961 if (maxRew < 1) return 0; 00962 return 1; 00963 } 00978 static size_t MapToUpper (const utf32_char ch, utf32_char* dest, 00979 size_t destSize, uint flags = 0); 00984 static size_t MapToLower (const utf32_char ch, utf32_char* dest, 00985 size_t destSize, uint flags = 0); 00991 static size_t MapToFold (const utf32_char ch, utf32_char* dest, 00992 size_t destSize, uint flags = 0); 00994 }; 00995 00998 #endif 00999
Generated for Crystal Space by doxygen 1.4.4