Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

ucnv_utf.c

00001 /*  
00002 **********************************************************************
00003 *   Copyright (C) 2000, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  ucnv_utf.cpp
00007 *   encoding:   US-ASCII
00008 *   tab size:   8 (not used)
00009 *   indentation:4
00010 *
00011 *   created on: 2000feb03
00012 *   created by: Markus W. Scherer
00013 *
00014 *   Change history:
00015 *
00016 *   06/29/2000  helena      Major rewrite of the callback APIs.
00017 *   07/20/2000  george      Change the coding style to conform to the coding guidelines,
00018 *                           and a few miscellaneous bug fixes.
00019 *   11/15/2000  george      Added UTF-32
00020 */
00021 
00022 #include "cmemory.h"
00023 #include "unicode/utypes.h"
00024 #include "ucmp16.h"
00025 #include "ucmp8.h"
00026 #include "unicode/ucnv_err.h"
00027 #include "ucnv_bld.h"
00028 #include "unicode/ucnv.h"
00029 #include "ucnv_cnv.h"
00030 
00031 /* UTF-8 -------------------------------------------------------------------- */
00032 
00033 /* UTF-8 Conversion DATA
00034  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
00035  */
00036 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Code point limits and UTF-16 surrogate arithmetic constants used by the converters in this file. */
00037 static const uint32_t MAXIMUM_UCS2 = 0x0000FFFF;          /* largest value written as a single UChar */
00038 static const uint32_t MAXIMUM_UTF = 0x0010FFFF;           /* largest value the UTF-8 decoders accept */
00039 static const uint32_t MAXIMUM_UCS4 = 0x7FFFFFFF;          /* not referenced in the code visible here */
00040 static const int8_t HALF_SHIFT = 10;                      /* number of payload bits carried by each surrogate */
00041 static const uint32_t HALF_BASE = 0x0010000;              /* subtracted from a code point before it is split into surrogates */
00042 static const uint32_t HALF_MASK = 0x3FF;                  /* selects the low HALF_SHIFT bits (the low-surrogate payload) */
00043 static const uint32_t SURROGATE_HIGH_START = 0xD800;      /* first high (lead) surrogate */
00044 static const uint32_t SURROGATE_HIGH_END = 0xDBFF;        /* last high (lead) surrogate */
00045 static const uint32_t SURROGATE_LOW_START = 0xDC00;       /* first low (trail) surrogate */
00046 static const uint32_t SURROGATE_LOW_END = 0xDFFF;         /* last low (trail) surrogate */
00047 static const uint32_t SURROGATE_LOW_BASE = 9216; /* -SURROGATE_LOW_START + HALF_BASE, i.e. 0x10000 - 0xDC00 = 0x2400; added when combining a surrogate pair */
00048 
/*
 * Indexed by the total length of a UTF-8 sequence (1..6; index 0 is a placeholder).
 * The decoders accumulate a sequence as ch = (ch << 6) + byte, starting from the
 * raw lead byte, and then subtract the entry for that length.  Each entry is the
 * sum of the lead-byte marker bits and the 0x80 marker of every trail byte at
 * its shifted position, e.g. entry 2 is (0xC0 << 6) + 0x80 = 0x3080.
 * The 5- and 6-byte entries rely on 32-bit unsigned wrap-around.
 */
00049 static const uint32_t offsetsFromUTF8[7] = {0,
00050   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
00051   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
00052 };
00053 
00054 /* END OF UTF-8 Conversion DATA */
00055 
/*
 * Indexed by the first byte of a UTF-8 sequence; gives the total number of
 * bytes in that sequence.  A value of 0 marks a byte that cannot start a
 * sequence; the decoders below route such bytes to the error callback.
 * Each row below covers 32 byte values.
 */
00056 static const int8_t bytesFromUTF8[256] = {
      /* 0x00-0x7F: single-byte sequences */
00057   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00058   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00059   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00060   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x80-0xBF: trail bytes, never valid as the first byte */
00061   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00062   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0xC0-0xDF: two-byte sequences */
00063   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      /* 0xE0-0xEF: 3 bytes; 0xF0-0xF7: 4; 0xF8-0xFB: 5; 0xFC-0xFD: 6; 0xFE-0xFF: invalid */
00064   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
00065 };
00066 
00067 /* static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};*/
00068 
/* Non-zero when `utf8` is not a UTF-8 trail byte, i.e. its bits 7..6 are not binary 10. */
#define INVALID_UTF8_TAIL(utf8) ((((utf8) & 0xC0) ^ 0x80) != 0)
00070 
00077 static UBool
00078 T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
00079                                                   UErrorCode *err)
00080 {
00081     UConverter *converter = args->converter;
00082 
00083     if (U_SUCCESS(*err))
00084     {
00085         *err = U_ILLEGAL_CHAR_FOUND;
00086     }
00087 
00088     /* Make the toUBytes invalid */
00089     uprv_memcpy(converter->invalidCharBuffer,
00090                 converter->toUBytes,
00091                 converter->invalidCharLength);
00092 
00093     /* Call the ErrorFunction */
00094     args->converter->fromCharErrorBehaviour(converter->toUContext,
00095                                             args,
00096                                             converter->invalidCharBuffer,
00097                                             converter->invalidCharLength,
00098                                             UCNV_ILLEGAL,
00099                                             err);
00100 
00101     return (UBool)U_FAILURE(*err);
00102 }
00103 
00104 static UBool
00105 T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
00106                                                         int32_t currentOffset,
00107                                                         UErrorCode *err)
00108 {
00109     int32_t *saveOffsets = args->offsets;
00110     UBool result;
00111     
00112     result = T_UConverter_toUnicode_InvalidChar_Callback(args, err);
00113 
00114     while (saveOffsets < args->offsets)
00115     {
00116         *(saveOffsets++) = currentOffset;
00117     }
00118     return result;
00119 }
00120 
/*
 * Converts UTF-8 bytes from [args->source, args->sourceLimit) into UChars in
 * [args->target, args->targetLimit) and advances args->source / args->target
 * past what was consumed / written.  Does nothing if *err is already a failure.
 *
 * Input that ends in the middle of a sequence:
 *   - args->flush set:   *err becomes U_TRUNCATED_CHAR_FOUND (unless already a failure).
 *   - args->flush clear: the partial state is parked in the converter
 *     (toUnicodeStatus = accumulated value, toULength = expected length,
 *     invalidCharLength = bytes consumed so far) and picked up again at the
 *     top of the next call.
 *
 * Values above MAXIMUM_UCS2 are written as a surrogate pair; when only the
 * first half fits, the second half goes to converter->UCharErrorBuffer and
 * *err becomes U_BUFFER_OVERFLOW_ERROR.
 *
 * A sequence is reported through T_UConverter_toUnicode_InvalidChar_Callback
 * when the lead byte has table length 0, when a non-trail byte appears where a
 * trail byte is expected, or when the decoded value exceeds MAXIMUM_UTF.
 * Conversion stops if that callback leaves *err as a failure.
 *
 * NOTE(review): the only value check is "<= MAXIMUM_UTF"; overlong forms and
 * encoded surrogate code points are not rejected here -- confirm that is intended.
 * NOTE(review): toUBytes[i] is written for i up to inBytes - 1 (the length
 * table goes up to 6); the capacity of toUBytes is declared elsewhere -- confirm.
 */
00121 U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
00122                                   UErrorCode * err)
00123 {
00124     const unsigned char *mySource = (unsigned char *) args->source;
00125     UChar *myTarget = args->target;
00126     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
00127     const UChar *targetLimit = args->targetLimit;
00128     unsigned char *toUBytes = args->converter->toUBytes;
00129     UBool invalidTailChar = FALSE;
00130     uint32_t ch, ch2 = 0, i;
00131     uint32_t inBytes;  /* Total number of bytes in the current UTF8 sequence */
00132   
00133     if (U_FAILURE(*err))
00134     {
00135         return;
00136     }
00137 
00138     /* Restore size of current sequence */
          /* A non-zero toUnicodeStatus means the previous call stopped inside a sequence;
           * jump straight into the trail-byte loop with the saved state. */
00139     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
00140     {
00141         inBytes = args->converter->toULength;       /* restore # of bytes to consume */
00142         i = args->converter->invalidCharLength;     /* restore # of bytes consumed */
00143 
00144         ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
00145         args->converter->toUnicodeStatus = 0;
00146         goto morebytes;
00147     }
00148 
00149 
00150     while (mySource < sourceLimit && myTarget < targetLimit)
00151     {
00152         ch = *(mySource++);
00153         if (ch < 0x80)        /* Simple case */
00154         {
00155             *(myTarget++) = (UChar) ch;
00156         }
00157         else
00158         {
00159             /* store the first char */
00160             toUBytes[0] = (char)ch;
00161             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
00162             i = 1;
00163 
00164 morebytes:
                  /* Collect trail bytes; every consumed byte is also kept in toUBytes for error reporting. */
00165             while (i < inBytes)
00166             {
00167                 if (mySource < sourceLimit)
00168                 {
00169                     toUBytes[i] = (char) (ch2 = *(mySource++));
00170                     if (INVALID_UTF8_TAIL(ch2))
00171                     {
                              /* Leaves the loop with i < inBytes, which selects the error branch below. */
00172                         *err = U_TRUNCATED_CHAR_FOUND;
00173                         invalidTailChar = TRUE;
00174                         break;
00175                     }
00176                     ch = (ch << 6) + ch2;
00177                     i++;
00178                 }
00179                 else
00180                 {
                          /* Ran out of input in the middle of a sequence. */
00181                     if (args->flush)
00182                     {
00183                         if (U_SUCCESS(*err))
00184                         {
00185                             *err = U_TRUNCATED_CHAR_FOUND;
00186                         }
00187                     }
00188                     else
00189                     {    /* stores a partially calculated target*/
00190                         args->converter->toUnicodeStatus = ch;
00191                         args->converter->toULength = (int8_t) inBytes;
00192                         args->converter->invalidCharLength = (int8_t) i;
00193                     }
00194                     goto donefornow;
00195                 }
00196             }
00197 
00198             /* Remove the accumulated high bits */
00199             ch -= offsetsFromUTF8[inBytes];
00200 
00201             if (i == inBytes && ch <= MAXIMUM_UTF)
00202             {
00203                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
00204                 if (ch <= MAXIMUM_UCS2) 
00205                 {
00206                     /* fits in 16 bits */
00207                     *(myTarget++) = (UChar) ch;
00208                 }
00209                 else
00210                 {
00211                     /* write out the surrogates */
00212                     ch -= HALF_BASE;
00213                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
00214                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
00215                     if (myTarget < targetLimit)
00216                     {
00217                         *(myTarget++) = (UChar)ch;
00218                     }
00219                     else
00220                     {
00221                         /* Put in overflow buffer (not handled here) */
00222                         args->converter->UCharErrorBuffer[0] = (UChar) ch;
00223                         args->converter->UCharErrorBufferLength = 1;
00224                         *err = U_BUFFER_OVERFLOW_ERROR;
00225                         break;
00226                     }
00227                 }
00228             }
00229             else
00230             {
                      /* Publish the current positions so the callback sees (and may advance) them. */
00231                 args->source = (const char *) mySource;
00232                 args->target = myTarget;
00233                 args->converter->invalidCharLength = (int8_t)i;
00234                 if (T_UConverter_toUnicode_InvalidChar_Callback(args, err))
00235                 {
00236                     /* Stop if the error wasn't handled */
00237                     break;
00238                 }
00239                 args->converter->invalidCharLength = 0;
                      /* Pick the positions back up; the callback may have moved them. */
00240                 mySource = (unsigned char *) args->source;
00241                 myTarget = args->target;
00242                 if (invalidTailChar)
00243                 {
00244                     /* Treat the tail as ASCII*/
                          /* NOTE(review): ch2 is emitted unchanged even when it is >= 0x80
                           * (it could itself be a lead byte) -- confirm this is intended. */
00245                     if (myTarget < targetLimit)
00246                     {
00247                         *(myTarget++) = (UChar) ch2;
00248                         invalidTailChar = FALSE;
00249                     }
00250                     else
00251                     {
00252                         /* Put in overflow buffer (not handled here) */
00253                         args->converter->UCharErrorBuffer[0] = (UChar) ch2;
00254                         args->converter->UCharErrorBufferLength = 1;
00255                         *err = U_BUFFER_OVERFLOW_ERROR;
00256                         break;
00257                     }
00258                 }
00259             }
00260         }
00261     }
00262 
00263 donefornow:
00264     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
00265     {
00266         /* End of target buffer */
00267         *err = U_BUFFER_OVERFLOW_ERROR;
00268     }
00269 
00270     args->target = myTarget;
00271     args->source = (const char *) mySource;
00272 }
00273 
00274 U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
00275                                                 UErrorCode * err)
00276 {
00277     const unsigned char *mySource = (unsigned char *) args->source;
00278     UChar *myTarget = args->target;
00279     int32_t *myOffsets = args->offsets;
00280     int32_t offsetNum = 0;
00281     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
00282     const UChar *targetLimit = args->targetLimit;
00283     unsigned char *toUBytes = args->converter->toUBytes;
00284     UBool invalidTailChar = FALSE;
00285     uint32_t ch, ch2 = 0, i;
00286     uint32_t inBytes;
00287 
00288     /* Restore size of current sequence */
00289     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
00290     {
00291         inBytes = args->converter->toULength;       /* restore # of bytes to consume */
00292         i = args->converter->invalidCharLength;     /* restore # of bytes consumed */
00293 
00294         ch = args->converter->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
00295         args->converter->toUnicodeStatus = 0;
00296         goto morebytes;
00297     }
00298 
00299     while (mySource < sourceLimit && myTarget < targetLimit)
00300     {
00301         ch = *(mySource++);
00302         if (ch < 0x80)        /* Simple case */
00303         {
00304             *(myTarget++) = (UChar) ch;
00305             *(myOffsets++) = offsetNum++;
00306         }
00307         else
00308         {
00309             toUBytes[0] = (char)ch;
00310             inBytes = bytesFromUTF8[ch];
00311             i = 1;
00312 
00313 morebytes:
00314             while (i < inBytes)
00315             {
00316                 if (mySource < sourceLimit)
00317                 {
00318                     toUBytes[i] = (char) (ch2 = *(mySource++));
00319                     if (INVALID_UTF8_TAIL(ch2))
00320                     {
00321                         *err = U_TRUNCATED_CHAR_FOUND;
00322                         invalidTailChar = TRUE;
00323                         break;
00324                     }
00325                     ch = (ch << 6) + ch2;
00326                     i++;
00327                 }
00328                 else
00329                 {
00330                     if (args->flush)
00331                     {
00332                         if (U_SUCCESS(*err)) 
00333                         {
00334                             *err = U_TRUNCATED_CHAR_FOUND;
00335                             args->converter->toUnicodeStatus = 0;
00336                         }
00337                     }
00338                     else
00339                     {
00340                         args->converter->toUnicodeStatus = ch;
00341                         args->converter->toULength = (int8_t)inBytes;
00342                         args->converter->invalidCharLength = (int8_t)i;
00343                     }
00344                     goto donefornow;
00345                 }
00346             }
00347 
00348             /* Remove the acummulated high bits */
00349             ch -= offsetsFromUTF8[inBytes];
00350 
00351             if (i == inBytes && ch <= MAXIMUM_UTF)
00352             {
00353                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
00354                 if (ch <= MAXIMUM_UCS2) 
00355                 {
00356                     /* fits in 16 bits */
00357                     *(myTarget++) = (UChar) ch;
00358                     *(myOffsets++) = offsetNum;
00359                 }
00360                 else
00361                 {
00362                     /* write out the surrogates */
00363                     *(myOffsets++) = offsetNum;
00364                     ch -= HALF_BASE;
00365                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
00366                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
00367                     if (myTarget < targetLimit)
00368                     {
00369                         *(myTarget++) = (UChar)ch;
00370                         *(myOffsets++) = offsetNum;
00371                     }
00372                     else
00373                     {
00374                         args->converter->UCharErrorBuffer[0] = (UChar) ch;
00375                         args->converter->UCharErrorBufferLength = 1;
00376                         *err = U_BUFFER_OVERFLOW_ERROR;
00377                     }
00378                 }
00379                 offsetNum += i;
00380             }
00381             else
00382             {
00383                 UBool useOffset;
00384 
00385                 args->source = (const char *) mySource;
00386                 args->target = myTarget;
00387                 args->offsets = myOffsets;
00388                 args->converter->invalidCharLength = (int8_t)i;
00389                 if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
00390                  offsetNum, err))
00391                 {
00392                     /* Stop if the error wasn't handled */
00393                     break;
00394                 }
00395 
00396                 args->converter->invalidCharLength = 0;
00397                 mySource = (unsigned char *) args->source;
00398                 myTarget = args->target;
00399 
00400                 useOffset = (UBool)(myOffsets != args->offsets);
00401                 myOffsets = args->offsets;
00402                 offsetNum += i;
00403 
00404                 if (invalidTailChar)
00405                 {
00406                     /* Treat the tail as ASCII*/
00407                     if (myTarget < targetLimit)
00408                     {
00409                         *(myTarget++) = (UChar) ch2;
00410                         *myOffsets = offsetNum++;
00411                         if (useOffset)
00412                         {
00413                             /* Increment when the target was consumed */
00414                             myOffsets++;
00415                         }
00416                         invalidTailChar = FALSE;
00417                     }
00418                     else
00419                     {
00420                         /* Put in overflow buffer (not handled here) */
00421                         args->converter->UCharErrorBuffer[0] = (UChar) ch2;
00422                         args->converter->UCharErrorBufferLength = 1;
00423                         *err = U_BUFFER_OVERFLOW_ERROR;
00424                         break;
00425                     }
00426                 }
00427             }
00428         }
00429     }
00430 
00431 donefornow:
00432     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
00433     {   /* End of target buffer */
00434         *err = U_BUFFER_OVERFLOW_ERROR;
00435     }
00436 
00437     args->target = myTarget;
00438     args->source = (const char *) mySource;
00439     args->offsets = myOffsets;
00440 }
00441 
00442 U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
00443                                     UErrorCode * err)
00444 {
00445     const UChar *mySource = args->source;
00446     unsigned char *myTarget = (unsigned char *) args->target;
00447     const UChar *sourceLimit = args->sourceLimit;
00448     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
00449     uint32_t ch, ch2;
00450     int16_t indexToWrite;
00451     char temp[4];
00452 
00453     if (args->converter->fromUnicodeStatus && myTarget < targetLimit)
00454     {
00455         ch = args->converter->fromUnicodeStatus;
00456         args->converter->fromUnicodeStatus = 0;
00457         goto lowsurogate;
00458     }
00459 
00460     while (mySource < sourceLimit && myTarget < targetLimit)
00461     {
00462         ch = *(mySource++);
00463 
00464         if (ch < 0x80)        /* Single byte */
00465         {
00466             *(myTarget++) = (char) ch;
00467         }
00468         else if (ch < 0x800)  /* Double byte */
00469         {
00470             *(myTarget++) = (char) ((ch >> 6) | 0xc0);
00471             if (myTarget < targetLimit)
00472             {
00473                 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
00474             }
00475             else
00476             {
00477                 args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
00478                 args->converter->charErrorBufferLength = 1;
00479                 *err = U_BUFFER_OVERFLOW_ERROR;
00480             }
00481         }
00482         else
00483         /* Check for surogates */
00484         {
00485             if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END))
00486             {
00487 lowsurogate:
00488                 if (mySource < sourceLimit)
00489                 {
00490                     ch2 = *mySource;
00491                     if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END))
00492                     {
00493                         /* If there were two surrogates, combine them otherwise treat them normally */
00494                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
00495                         mySource++;
00496                     }
00497                 }
00498                 else if (!args->flush)
00499                 {
00500                     args->converter->fromUnicodeStatus = ch;
00501                     break;
00502                 }
00503             }
00504 
00505             if (ch < 0x10000)
00506             {
00507                 indexToWrite = 2;
00508                 temp[2] = (char) ((ch >> 12) | 0xe0);
00509             }
00510             else
00511             {
00512                 indexToWrite = 3;
00513                 temp[3] = (char) ((ch >> 18) | 0xf0);
00514                 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
00515             }
00516             temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
00517             temp[0] = (char) ((ch & 0x3f) | 0x80);
00518 
00519             for (; indexToWrite >= 0; indexToWrite--)
00520             {
00521                 if (myTarget < targetLimit)
00522                 {
00523                     *(myTarget++) = temp[indexToWrite];
00524                 }
00525                 else
00526                 {
00527                     args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
00528                     *err = U_BUFFER_OVERFLOW_ERROR;
00529                 }
00530             }
00531         }
00532     }
00533 
00534     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
00535     {
00536         *err = U_BUFFER_OVERFLOW_ERROR;
00537     }
00538 
00539     args->target = (char *) myTarget;
00540     args->source = mySource;
00541 }
00542 
00543 U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
00544                                                   UErrorCode * err)
00545 {
00546     const UChar *mySource = args->source;
00547     unsigned char *myTarget = (unsigned char *) args->target;
00548     int32_t *myOffsets = args->offsets;
00549     const UChar *sourceLimit = args->sourceLimit;
00550     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
00551     uint32_t ch, ch2;
00552     int32_t offsetNum = 0;
00553     int16_t indexToWrite;
00554     char temp[4];
00555 
00556     if (args->converter->fromUnicodeStatus && myTarget < targetLimit)
00557     {
00558         ch = args->converter->fromUnicodeStatus;
00559         args->converter->fromUnicodeStatus = 0;
00560         goto lowsurogate;
00561     }
00562 
00563     while (mySource < sourceLimit && myTarget < targetLimit)
00564     {
00565         ch = *(mySource++);
00566 
00567         if (ch < 0x80)        /* Single byte */
00568         {
00569             *(myOffsets++) = offsetNum++;
00570             *(myTarget++) = (char) ch;
00571         }
00572         else if (ch < 0x800)  /* Double byte */
00573         {
00574             *(myOffsets++) = offsetNum;
00575             *(myTarget++) = (char) ((ch >> 6) | 0xc0);
00576             if (myTarget < targetLimit)
00577             {
00578                 *(myOffsets++) = offsetNum++;
00579                 *(myTarget++) = (char) ((ch & 0x3f) | 0x80);
00580             }
00581             else
00582             {
00583                 args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
00584                 args->converter->charErrorBufferLength = 1;
00585                 *err = U_BUFFER_OVERFLOW_ERROR;
00586             }
00587         }
00588         else
00589         /* Check for surogates */
00590         {
00591             if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END))
00592             {
00593 lowsurogate:
00594                 if (mySource < sourceLimit)
00595                 {
00596                     ch2 = *mySource;
00597                     if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END))
00598                     {
00599                         /* If there were two surrogates, combine them otherwise treat them normally */
00600                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
00601                         mySource++;
00602                     }
00603                 }
00604                 else if (!args->flush)
00605                 {
00606                     args->converter->fromUnicodeStatus = ch;
00607                     break;
00608                 }
00609             }
00610 
00611             if (ch < 0x10000)
00612             {
00613                 indexToWrite = 2;
00614                 temp[2] = (char) ((ch >> 12) | 0xe0);
00615             }
00616             else
00617             {
00618                 indexToWrite = 3;
00619                 temp[3] = (char) ((ch >> 18) | 0xf0);
00620                 temp[2] = (char) (((ch >> 12) & 0x3f) | 0x80);
00621             }
00622             temp[1] = (char) (((ch >> 6) & 0x3f) | 0x80);
00623             temp[0] = (char) ((ch & 0x3f) | 0x80);
00624 
00625             for (; indexToWrite >= 0; indexToWrite--)
00626             {
00627                 if (myTarget < targetLimit)
00628                 {
00629                     *(myOffsets++) = offsetNum;
00630                     *(myTarget++) = temp[indexToWrite];
00631                 }
00632                 else
00633                 {
00634                     args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
00635                     *err = U_BUFFER_OVERFLOW_ERROR;
00636                 }
00637             }
00638             offsetNum += (ch >= 0x10000) + 1;
00639         }
00640     }
00641 
00642     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
00643     {
00644         *err = U_BUFFER_OVERFLOW_ERROR;
00645     }
00646 
00647     args->target = (char *) myTarget;
00648     args->source = mySource;
00649 }
00650 
00651 U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
00652                                                UErrorCode* err)
00653 {
00654     /*safe keeps a ptr to the beginning in case we need to step back*/
00655     char const *sourceInitial = args->source;
00656     uint16_t extraBytesToWrite;
00657     uint8_t myByte;
00658     UChar32 ch;
00659     int8_t isLegalSequence = 1;
00660 
00661     /*Input boundary check*/
00662     if (args->source >= args->sourceLimit) 
00663     {
00664         *err = U_INDEX_OUTOFBOUNDS_ERROR;
00665         return 0xffff;
00666     }
00667 
00668     myByte = (uint8_t)*(args->source++);
00669     if (myByte < 0x80)
00670     {
00671         return (UChar32)myByte;
00672     }
00673     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
00674     if (extraBytesToWrite == 0) {
00675         goto CALL_ERROR_FUNCTION;
00676     }
00677 
00678     /*The byte sequence is longer than the buffer area passed*/
00679 
00680     if ((args->source + extraBytesToWrite - 1) > args->sourceLimit)
00681     {
00682         *err = U_TRUNCATED_CHAR_FOUND;
00683         return 0xffff;
00684     }
00685     else
00686     {
00687         ch = myByte << 6;
00688         switch(extraBytesToWrite)
00689         {     
00690           /* note: code falls through cases! (sic)*/ 
00691         case 6:
00692             ch += (myByte = (uint8_t)*(args->source++));
00693             ch <<= 6;
00694             if ((myByte & 0xC0) != 0x80) 
00695             {
00696                 isLegalSequence = 0;
00697                 break;
00698             }
00699         case 5:
00700             ch += (myByte = *(args->source++));
00701             ch <<= 6;
00702             if ((myByte & 0xC0) != 0x80) 
00703             {
00704                 isLegalSequence = 0;
00705                 break;
00706             }
00707         case 4:
00708             ch += (myByte = *(args->source++));
00709             ch <<= 6;
00710             if ((myByte & 0xC0) != 0x80) 
00711             {
00712                 isLegalSequence = 0;
00713                 break;
00714             }
00715         case 3:
00716             ch += (myByte = *(args->source++));
00717             ch <<= 6;
00718             if ((myByte & 0xC0) != 0x80) 
00719             {
00720                 isLegalSequence = 0;
00721                 break;
00722             }
00723         case 2:
00724             ch += (myByte = *(args->source++));
00725             if ((myByte & 0xC0) != 0x80) 
00726             {
00727                 isLegalSequence = 0;
00728             }
00729         };
00730     }
00731     ch -= offsetsFromUTF8[extraBytesToWrite];
00732 
00733     if (isLegalSequence)
00734         return ch; /* return the code point */
00735 
00736 CALL_ERROR_FUNCTION:
00737     {
00738     UChar myUChar = (UChar)0xffff; /* ### TODO: this is a hack until we prepare the callbacks for code points */
00739     UChar* myUCharPtr = &myUChar;
00740     
00741     *err = U_ILLEGAL_CHAR_FOUND;
00742     
00743     /*It is very likely that the ErrorFunctor will write to the
00744      *internal buffers */
00745     args->target = myUCharPtr;
00746     args->targetLimit = myUCharPtr + 1;
00747     args->converter->fromCharErrorBehaviour(args->converter->toUContext,
00748                                     args,
00749                                     sourceInitial,
00750                                     args->source-sourceInitial,
00751                                     UCNV_ILLEGAL,
00752                                     err);
00753 
00754     /*makes the internal caching transparent to the user*/
00755     if (*err == U_BUFFER_OVERFLOW_ERROR)
00756         *err = U_ZERO_ERROR;
00757 
00758     return (UChar32)myUChar;
00759     }
00760 } 
00761 
/*
 * Implementation table for the UTF-8 converter, initialized positionally.
 * The meaning of each slot is fixed by the UConverterImpl declaration, which
 * is not part of this file; the NULL entries are slots this converter leaves unset.
 */
00762 static const UConverterImpl _UTF8Impl={
00763     UCNV_UTF8,
00764 
00765     NULL,
00766     NULL,
00767 
00768     NULL,
00769     NULL,
00770     NULL,
00771 
00772     T_UConverter_toUnicode_UTF8,                    /* UTF-8 bytes -> UChars */
00773     T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC,      /* same, also filling args->offsets */
00774     T_UConverter_fromUnicode_UTF8,                  /* UChars -> UTF-8 bytes */
00775     T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,    /* same, also writing offsets */
00776     T_UConverter_getNextUChar_UTF8,                 /* decode a single code point */
00777 
00778     NULL,
00779     NULL
00780 };
00781 
00782 /* Todo: verify that UTF-8 == (ccsid (ibm-codepage) 1208) for unicode version 2.0 and 3.0 */
/*
 * Static description of the UTF-8 converter, initialized positionally; the
 * field meanings come from the UConverterStaticData declaration, which is not
 * part of this file.
 * NOTE(review): 1208 is the IBM ccsid named in the Todo above.  "1, 4" look
 * like the minimum/maximum bytes per character, and { 0xef, 0xbf, 0xbd } with
 * length 3 is the UTF-8 encoding of U+FFFD, presumably the substitution
 * character -- confirm against the struct declaration.
 */
00783 const UConverterStaticData _UTF8StaticData={
00784   sizeof(UConverterStaticData),
00785 "UTF8",
00786     1208, UCNV_IBM, UCNV_UTF8, 1, 4,
00787     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
00788 };
00789 
00790 
/*
 * Shared-data record for the UTF-8 converter: it points at _UTF8StaticData
 * and at the _UTF8Impl function table.  The remaining positional values are
 * defined by UConverterSharedData, declared elsewhere.
 * NOTE(review): ~((uint32_t) 0) is presumably a "never released" reference
 * count for this statically allocated record -- confirm.
 */
00791 const UConverterSharedData _UTF8Data={
00792     sizeof(UConverterSharedData), ~((uint32_t) 0),
00793     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, 
00794     0
00795 };
00796 
00797 /* UTF-16BE ----------------------------------------------------------------- */
00798 
00799 U_CFUNC void T_UConverter_toUnicode_UTF16_BE (UConverterToUnicodeArgs * args,
00800                                       UErrorCode * err)
00801 {
00802     const unsigned char *mySource = (unsigned char *) args->source;
00803     UChar *myTarget = args->target;
00804     int32_t mySourceIndex = 0;
00805     int32_t myTargetIndex = 0;
00806     int32_t targetLength = args->targetLimit - myTarget;
00807     int32_t sourceLength = args->sourceLimit - (char *) mySource;
00808     UChar mySourceChar = 0x0000;
00809     UChar oldmySourceChar = 0x0000;
00810 
00811     while (mySourceIndex < sourceLength)
00812     {
00813         if (myTargetIndex < targetLength)
00814         {
00815             /*gets the corresponding UChar */
00816             mySourceChar = (unsigned char) mySource[mySourceIndex++];
00817             oldmySourceChar = mySourceChar;
00818             if (args->converter->toUnicodeStatus == 0)
00819             {
00820                 args->converter->toUnicodeStatus = 
00821                     (unsigned char) mySourceChar == 0 ? 0xFFFF : mySourceChar;
00822             }
00823             else
00824             {
00825                 if (args->converter->toUnicodeStatus != 0xFFFF)
00826                     mySourceChar = (UChar) ((args->converter->toUnicodeStatus << 8) | mySourceChar);
00827                 args->converter->toUnicodeStatus = 0;
00828 
00829                 myTarget[myTargetIndex++] = mySourceChar;
00830             }
00831         }
00832         else
00833         {
00834             *err = U_BUFFER_OVERFLOW_ERROR;
00835             break;
00836         }
00837     }
00838 
00839     if (U_SUCCESS(*err) && args->flush
00840       && (mySourceIndex == sourceLength)
00841       && (args->converter->toUnicodeStatus != 0x00))
00842     {
00843         if (U_SUCCESS(*err)) 
00844         {
00845             *err = U_TRUNCATED_CHAR_FOUND;
00846             args->converter->toUnicodeStatus = 0x00;
00847         }
00848     }
00849 
00850     args->target += myTargetIndex;
00851     args->source += mySourceIndex;
00852 }
00853 
00854 U_CFUNC void  T_UConverter_fromUnicode_UTF16_BE (UConverterFromUnicodeArgs * args,
00855                                          UErrorCode * err)
00856 {
00857     const UChar *mySource = args->source;
00858     unsigned char *myTarget = (unsigned char *) args->target;
00859     int32_t mySourceIndex = 0;
00860     int32_t myTargetIndex = 0;
00861     int32_t targetLength = args->targetLimit - (char *) myTarget;
00862     int32_t sourceLength = args->sourceLimit - mySource;
00863     UChar mySourceChar;
00864 
00865     /*writing the char to the output stream */
00866     while (mySourceIndex < sourceLength)
00867     {
00868         if (myTargetIndex < targetLength)
00869         {
00870             mySourceChar = (UChar) mySource[mySourceIndex++];
00871             myTarget[myTargetIndex++] = (char) (mySourceChar >> 8);
00872             if (myTargetIndex < targetLength)
00873             {
00874                 myTarget[myTargetIndex++] = (char) mySourceChar;
00875             }
00876             else
00877             {
00878                 args->converter->charErrorBuffer[0] = (char) mySourceChar;
00879                 args->converter->charErrorBufferLength = 1;
00880                 *err = U_BUFFER_OVERFLOW_ERROR;
00881             }
00882         }
00883         else
00884         {
00885             *err = U_BUFFER_OVERFLOW_ERROR;
00886             break;
00887         }
00888     }
00889 
00890     args->target += myTargetIndex;
00891     args->source += mySourceIndex;
00892 }
00893 
00894 U_CFUNC UChar32 T_UConverter_getNextUChar_UTF16_BE(UConverterToUnicodeArgs* args,
00895                                                    UErrorCode* err)
00896 {
00897     UChar32 myUChar;
00898     uint16_t first;
00899     /*Checks boundaries and set appropriate error codes*/
00900     if (args->source+2 > args->sourceLimit) 
00901     {
00902         if (args->source >= args->sourceLimit)
00903         {
00904             /*Either caller has reached the end of the byte stream*/
00905             *err = U_INDEX_OUTOFBOUNDS_ERROR;
00906         }
00907         else
00908         {
00909             /* a character was cut in half*/
00910             *err = U_TRUNCATED_CHAR_FOUND;
00911         }
00912         return 0xffff;
00913     }
00914 
00915     /*Gets the corresponding codepoint*/
00916     first = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*((args->source)+1)));
00917     myUChar = first;
00918     args->source += 2;
00919 
00920     if(UTF_IS_FIRST_SURROGATE(first)) {
00921         uint16_t second;
00922 
00923         if (args->source+2 > args->sourceLimit) {
00924             *err = U_TRUNCATED_CHAR_FOUND;
00925             return 0xffff;
00926         }
00927 
00928         /* get the second surrogate and assemble the code point */
00929         second = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*(args->source+1)));
00930 
00931         /* ignore unmatched surrogates and just deliver the first one in such a case */
00932         if(UTF_IS_SECOND_SURROGATE(second)) {
00933             /* matched pair, get pair value */
00934             myUChar = UTF16_GET_PAIR_VALUE(first, second);
00935             args->source += 2;
00936         }
00937     }
00938 
00939     return myUChar;
00940 } 
00941 
00942 static const UConverterImpl _UTF16BEImpl={
00943     UCNV_UTF16_BigEndian,
00944 
00945     NULL,
00946     NULL,
00947 
00948     NULL,
00949     NULL,
00950     NULL,
00951 
00952     T_UConverter_toUnicode_UTF16_BE,
00953     NULL,
00954     T_UConverter_fromUnicode_UTF16_BE,
00955     NULL,
00956     T_UConverter_getNextUChar_UTF16_BE,
00957 
00958     NULL,
00959     NULL
00960 };
00961 
00962 /* Todo: verify that UTF-16BE == (ccsid (ibm-codepage) 1200) for unicode version 2.0 and 3.0 */
00963 const UConverterStaticData _UTF16BEStaticData={
00964   sizeof(UConverterStaticData),
00965 "UTF16_BigEndian",
00966     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
00967     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
00968 };
00969 
00970 
00971 const UConverterSharedData _UTF16BEData={
00972     sizeof(UConverterSharedData), ~((uint32_t) 0),
00973     NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, 
00974     0
00975 };
00976 
00977 /* UTF-16LE ----------------------------------------------------------------- */
00978 
00979 U_CFUNC void  T_UConverter_toUnicode_UTF16_LE (UConverterToUnicodeArgs * args,
00980                                        UErrorCode * err)
00981 {
00982     const unsigned char *mySource = (unsigned char *) args->source;
00983     UChar *myTarget = args->target;
00984     int32_t mySourceIndex = 0;
00985     int32_t myTargetIndex = 0;
00986     int32_t targetLength = args->targetLimit - myTarget;
00987     int32_t sourceLength = args->sourceLimit - (char *) mySource;
00988     UChar mySourceChar = 0x0000;
00989 
00990     while (mySourceIndex < sourceLength)
00991     {
00992         if (myTargetIndex < targetLength)
00993         {
00994             /*gets the corresponding UniChar */
00995             mySourceChar = (unsigned char) mySource[mySourceIndex++];
00996 
00997             if (args->converter->toUnicodeStatus == 0x00)
00998             {
00999                 args->converter->toUnicodeStatus = (unsigned char) mySourceChar == 0x00 ? 0xFFFF : mySourceChar;
01000             }
01001             else
01002             {
01003                 if (args->converter->toUnicodeStatus == 0xFFFF) {
01004                     mySourceChar = (UChar) (mySourceChar << 8);
01005                 }
01006                 else
01007                 {
01008                     mySourceChar <<= 8;
01009                     mySourceChar |= (UChar) (args->converter->toUnicodeStatus);
01010                 }
01011                 args->converter->toUnicodeStatus = 0x00;
01012                 myTarget[myTargetIndex++] = mySourceChar;
01013             }
01014         }
01015         else
01016         {
01017             *err = U_BUFFER_OVERFLOW_ERROR;
01018             break;
01019         }
01020     }
01021 
01022 
01023     if (U_SUCCESS(*err) && args->flush
01024       && (mySourceIndex == sourceLength)
01025       && (args->converter->toUnicodeStatus != 0x00))
01026     {
01027         if (U_SUCCESS(*err)) 
01028         {
01029           *err = U_TRUNCATED_CHAR_FOUND; 
01030           args->converter->toUnicodeStatus = 0x00;
01031         }
01032     }
01033 
01034     args->target += myTargetIndex;
01035     args->source += mySourceIndex;
01036 }
01037 
01038 U_CFUNC void T_UConverter_fromUnicode_UTF16_LE (UConverterFromUnicodeArgs * args,
01039                                           UErrorCode * err)
01040 {
01041     const UChar *mySource = args->source;
01042     unsigned char *myTarget = (unsigned char *) args->target;
01043     int32_t mySourceIndex = 0;
01044     int32_t myTargetIndex = 0;
01045     int32_t targetLength = args->targetLimit - (char *) myTarget;
01046     int32_t sourceLength = args->sourceLimit - mySource;
01047     UChar mySourceChar;
01048 
01049     /*writing the char to the output stream */
01050     while (mySourceIndex < sourceLength)
01051     {
01052         if (myTargetIndex < targetLength)
01053         {
01054             mySourceChar = (UChar) mySource[mySourceIndex++];
01055             myTarget[myTargetIndex++] = (char) mySourceChar;
01056             if (myTargetIndex < targetLength)
01057             {
01058                 myTarget[myTargetIndex++] = (char) (mySourceChar >> 8);
01059             }
01060             else
01061             {
01062                 args->converter->charErrorBuffer[0] = (char) (mySourceChar >> 8);
01063                 args->converter->charErrorBufferLength = 1;
01064                 *err = U_BUFFER_OVERFLOW_ERROR;
01065             }
01066         }
01067         else
01068         {
01069             *err = U_BUFFER_OVERFLOW_ERROR;
01070             break;
01071         }
01072     }
01073 
01074     args->target += myTargetIndex;
01075     args->source += mySourceIndex;
01076 }
01077 
01078 U_CFUNC UChar32 T_UConverter_getNextUChar_UTF16_LE(UConverterToUnicodeArgs* args,
01079                                                    UErrorCode* err)
01080 {
01081     UChar32 myUChar;
01082     uint16_t first;
01083     /*Checks boundaries and set appropriate error codes*/
01084     if (args->source+2 > args->sourceLimit) 
01085     {
01086         if (args->source >= args->sourceLimit)
01087         {
01088             /*Either caller has reached the end of the byte stream*/
01089             *err = U_INDEX_OUTOFBOUNDS_ERROR;
01090         }
01091         else
01092         {
01093             /* a character was cut in half*/
01094             *err = U_TRUNCATED_CHAR_FOUND;
01095         }
01096 
01097         return 0xffff;
01098     }
01099 
01100     /*Gets the corresponding codepoint*/
01101     first = (uint16_t)(((uint16_t)*((args->source)+1) << 8) | ((uint8_t)(*(args->source))));
01102     myUChar=first;
01103     /*updates the source*/
01104     args->source += 2;  
01105 
01106     if (UTF_IS_FIRST_SURROGATE(first))
01107     {
01108         uint16_t second;
01109 
01110         if (args->source+2 > args->sourceLimit)
01111         {
01112            *err = U_TRUNCATED_CHAR_FOUND;
01113             return 0xffff;
01114         }
01115 
01116         /* get the second surrogate and assemble the code point */
01117         second = (uint16_t)(((uint16_t)*(args->source+1) << 8) |((uint8_t)(*(args->source))));
01118 
01119         /* ignore unmatched surrogates and just deliver the first one in such a case */
01120         if(UTF_IS_SECOND_SURROGATE(second))
01121         {
01122             /* matched pair, get pair value */
01123             myUChar = UTF16_GET_PAIR_VALUE(first, second);
01124             args->source += 2;
01125         }
01126     }
01127 
01128     return myUChar;
01129 } 
01130 
01131 static const UConverterImpl _UTF16LEImpl={
01132     UCNV_UTF16_LittleEndian,
01133 
01134     NULL,
01135     NULL,
01136 
01137     NULL,
01138     NULL,
01139     NULL,
01140 
01141     T_UConverter_toUnicode_UTF16_LE,
01142     NULL,
01143     T_UConverter_fromUnicode_UTF16_LE,
01144     NULL,
01145     T_UConverter_getNextUChar_UTF16_LE,
01146 
01147     NULL,
01148     NULL
01149 };
01150 
01151 
01152 /* Todo: verify that UTF-16LE == (ccsid (ibm-codepage) 1200) for unicode version 2.0 and 3.0 */
01153 const UConverterStaticData _UTF16LEStaticData={
01154     sizeof(UConverterStaticData),
01155     "UTF16_LittleEndian",
01156     1200, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
01157     { 0xfd, 0xff, 0, 0 },2,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
01158 };
01159 
01160 
01161 const UConverterSharedData _UTF16LEData={
01162     sizeof(UConverterSharedData), ~((uint32_t) 0),
01163     NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, 
01164     0
01165 };
01166 
01167 /* UTF-32BE ----------------------------------------------------------------- */
01168 
01169 void T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
01170                                      UErrorCode * err)
01171 {
01172     const unsigned char *mySource = (unsigned char *) args->source;
01173     UChar *myTarget = args->target;
01174     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
01175     const UChar *targetLimit = args->targetLimit;
01176     unsigned char *toUBytes = args->converter->toUBytes;
01177     uint32_t ch, i;
01178 
01179     /* UTF-8 returns here for only non-offset, this needs to change.*/
01180     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
01181     {
01182         i = args->converter->toULength;       /* restore # of bytes consumed */
01183 
01184         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
01185         args->converter->toUnicodeStatus = 0;
01186         goto morebytes;
01187     }
01188 
01189     while (mySource < sourceLimit && myTarget < targetLimit)
01190     {
01191         i = 0;
01192         ch = 0;
01193 morebytes:
01194         while (i < sizeof(uint32_t))
01195         {
01196             if (mySource < sourceLimit)
01197             {
01198                 ch = (ch << 8) | (uint8_t)(*mySource);
01199                 toUBytes[i++] = (char) *(mySource++);
01200             }
01201             else
01202             {
01203                 if (args->flush)
01204                 {
01205                     if (U_SUCCESS(*err))
01206                     {
01207                         *err = U_TRUNCATED_CHAR_FOUND;
01208                         args->converter->toUnicodeStatus = MAXIMUM_UCS4;
01209                     }
01210                 }
01211                 else
01212                 {   /* stores a partially calculated target*/
01213                     /* + 1 to make 0 a valid character */
01214                     args->converter->toUnicodeStatus = ch + 1;
01215                     args->converter->toULength = (int8_t) i;
01216                 }
01217                 goto donefornow;
01218             }
01219         }
01220 
01221         if (ch <= MAXIMUM_UTF)
01222         {
01223             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
01224             if (ch <= MAXIMUM_UCS2) 
01225             {
01226                 /* fits in 16 bits */
01227                 *(myTarget++) = (UChar) ch;
01228             }
01229             else
01230             {
01231                 /* write out the surrogates */
01232                 ch -= HALF_BASE;
01233                 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
01234                 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
01235                 if (myTarget < targetLimit)
01236                 {
01237                     *(myTarget++) = (UChar)ch;
01238                 }
01239                 else
01240                 {
01241                     /* Put in overflow buffer (not handled here) */
01242                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
01243                     args->converter->UCharErrorBufferLength = 1;
01244                     *err = U_BUFFER_OVERFLOW_ERROR;
01245                     break;
01246                 }
01247             }
01248         }
01249         else
01250         {
01251             args->source = (const char *) mySource;
01252             args->target = myTarget;
01253             args->converter->invalidCharLength = (int8_t)i;
01254             if (T_UConverter_toUnicode_InvalidChar_Callback(args, err))
01255             {
01256                 /* Stop if the error wasn't handled */
01257                 break;
01258             }
01259             args->converter->invalidCharLength = 0;
01260             mySource = (unsigned char *) args->source;
01261             myTarget = args->target;
01262         }
01263     }
01264 
01265 donefornow:
01266     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
01267     {
01268         /* End of target buffer */
01269         *err = U_BUFFER_OVERFLOW_ERROR;
01270     }
01271 
01272     args->target = myTarget;
01273     args->source = (const char *) mySource;
01274 }
01275 
01276 void T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
01277                                        UErrorCode * err)
01278 {
01279     const UChar *mySource = args->source;
01280     unsigned char *myTarget = (unsigned char *) args->target;
01281     const UChar *sourceLimit = args->sourceLimit;
01282     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
01283     UChar32 ch, ch2;
01284     unsigned int indexToWrite;
01285     unsigned char temp[sizeof(uint32_t)];
01286 
01287     temp[0] = 0;
01288 
01289     if (args->converter->fromUnicodeStatus)
01290     {
01291         ch = args->converter->fromUnicodeStatus;
01292         args->converter->fromUnicodeStatus = 0;
01293         goto lowsurogate;
01294     }
01295 
01296     while (mySource < sourceLimit && myTarget < targetLimit)
01297     {
01298         ch = *(mySource++);
01299 
01300         if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
01301         {
01302 lowsurogate:
01303             if (mySource < sourceLimit)
01304             {
01305                 ch2 = *mySource;
01306                 if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
01307                 {
01308                     ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
01309                     mySource++;
01310                 }
01311             }
01312             else if (!args->flush)
01313             {
01314                 /* ran out of source */
01315                 args->converter->fromUnicodeStatus = ch;
01316                 break;
01317             }
01318         }
01319 
01320         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
01321         /* Todo: Can the & part be left off implicitly? Does it really save time? */
01322         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
01323         temp[2] = (uint8_t) (ch >> 8 & 0xFF);
01324         temp[3] = (uint8_t) (ch & 0xFF);
01325 
01326         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
01327         {
01328             if (myTarget < targetLimit)
01329             {
01330                 *(myTarget++) = temp[indexToWrite];
01331             }
01332             else
01333             {
01334                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
01335                 *err = U_BUFFER_OVERFLOW_ERROR; /* Todo: is this needed because of ending if */
01336             }
01337         }
01338     }
01339 
01340     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
01341     {
01342         *err = U_BUFFER_OVERFLOW_ERROR;
01343     }
01344 
01345     args->target = (char *) myTarget;
01346     args->source = mySource;
01347 }
01348 
01349 UChar32 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
01350                                                    UErrorCode* err)
01351 {
01352     *err = U_UNSUPPORTED_ERROR;
01353     return 0;
01354 }
01355 
01356 static const UConverterImpl _UTF32BEImpl = {
01357     UCNV_UTF32_BigEndian,
01358 
01359     NULL,
01360     NULL,
01361 
01362     NULL,
01363     NULL,
01364     NULL,
01365 
01366     T_UConverter_toUnicode_UTF32_BE,
01367     NULL,
01368 /*    T_UConverter_toUnicode_UTF32_BE_OFFSETS_LOGIC, */
01369     T_UConverter_fromUnicode_UTF32_BE,
01370     NULL,
01371 /*    T_UConverter_fromUnicode_UTF32_BE_OFFSETS_LOGIC, */
01372     T_UConverter_getNextUChar_UTF32_BE,
01373 
01374     NULL,
01375     NULL
01376 };
01377 
01379 const UConverterStaticData _UTF32BEStaticData = {
01380   sizeof(UConverterStaticData),
01381 "UTF32_BigEndian",
01382     1200, UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
01383     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
01384     {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
01385 };
01386 
01387 const UConverterSharedData _UTF32BEData = {
01388     sizeof(UConverterSharedData), ~((uint32_t) 0),
01389     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, 
01390     0
01391 };
01392 
01393 /* UTF-32LE ---------------------------------------------------------- */
01394 
01395 void T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
01396                                       UErrorCode * err)
01397 {
01398     const unsigned char *mySource = (unsigned char *) args->source;
01399     UChar *myTarget = args->target;
01400     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
01401     const UChar *targetLimit = args->targetLimit;
01402     unsigned char *toUBytes = args->converter->toUBytes;
01403     uint32_t ch, i;
01404 
01405     /* UTF-8 returns here for only non-offset, this needs to change.*/
01406     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
01407     {
01408         i = args->converter->toULength;       /* restore # of bytes consumed */
01409 
01410         /* Stores the previously calculated ch from a previous call*/
01411         ch = args->converter->toUnicodeStatus - 1;
01412         args->converter->toUnicodeStatus = 0;
01413         goto morebytes;
01414     }
01415 
01416     while (mySource < sourceLimit && myTarget < targetLimit)
01417     {
01418         i = 0;
01419         ch = 0;
01420 morebytes:
01421         while (i < sizeof(uint32_t))
01422         {
01423             if (mySource < sourceLimit)
01424             {
01425                 ch |= ((uint8_t)(*mySource)) << (i * 8);
01426                 toUBytes[i++] = (char) *(mySource++);
01427             }
01428             else
01429             {
01430                 if (args->flush)
01431                 {
01432                     if (U_SUCCESS(*err))
01433                     {
01434                         *err = U_TRUNCATED_CHAR_FOUND;
01435                         args->converter->toUnicodeStatus = 0;
01436                     }
01437                 }
01438                 else
01439                 {   /* stores a partially calculated target*/
01440                     /* + 1 to make 0 a valid character */
01441                     args->converter->toUnicodeStatus = ch + 1;
01442                     args->converter->toULength = (int8_t) i;
01443                 }
01444                 goto donefornow;
01445             }
01446         }
01447 
01448         if (ch <= MAXIMUM_UTF)
01449         {
01450             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
01451             if (ch <= MAXIMUM_UCS2) 
01452             {
01453                 /* fits in 16 bits */
01454                 *(myTarget++) = (UChar) ch;
01455             }
01456             else
01457             {
01458                 /* write out the surrogates */
01459                 ch -= HALF_BASE;
01460                 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
01461                 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
01462                 if (myTarget < targetLimit)
01463                 {
01464                     *(myTarget++) = (UChar)ch;
01465                 }
01466                 else
01467                 {
01468                     /* Put in overflow buffer (not handled here) */
01469                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
01470                     args->converter->UCharErrorBufferLength = 1;
01471                     *err = U_BUFFER_OVERFLOW_ERROR;
01472                     break;
01473                 }
01474             }
01475         }
01476         else
01477         {
01478             args->source = (const char *) mySource;
01479             args->target = myTarget;
01480             args->converter->invalidCharLength = (int8_t)i;
01481             if (T_UConverter_toUnicode_InvalidChar_Callback(args, err))
01482             {
01483                 /* Stop if the error wasn't handled */
01484                 break;
01485             }
01486             args->converter->invalidCharLength = 0;
01487             mySource = (unsigned char *) args->source;
01488             myTarget = args->target;
01489         }
01490     }
01491 
01492 donefornow:
01493     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
01494     {
01495         /* End of target buffer */
01496         *err = U_BUFFER_OVERFLOW_ERROR;
01497     }
01498 
01499     args->target = myTarget;
01500     args->source = (const char *) mySource;
01501 }
01502 
01503 void  T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
01504                                          UErrorCode * err)
01505 {
01506     const UChar *mySource = args->source;
01507     unsigned char *myTarget = (unsigned char *) args->target;
01508     const UChar *sourceLimit = args->sourceLimit;
01509     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
01510     UChar32 ch, ch2;
01511     unsigned int indexToWrite;
01512     unsigned char temp[sizeof(uint32_t)];
01513 
01514     temp[3] = 0;
01515 
01516     if (args->converter->fromUnicodeStatus)
01517     {
01518         ch = args->converter->fromUnicodeStatus;
01519         args->converter->fromUnicodeStatus = 0;
01520         goto lowsurogate;
01521     }
01522 
01523     while (mySource < sourceLimit && myTarget < targetLimit)
01524     {
01525         ch = *(mySource++);
01526 
01527         if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
01528         {
01529 lowsurogate:
01530             if (mySource < sourceLimit)
01531             {
01532                 ch2 = *mySource;
01533                 if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
01534                 {
01535                     ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
01536                     mySource++;
01537                 }
01538             }
01539             else if (!args->flush)
01540             {
01541                 /* ran out of source */
01542                 args->converter->fromUnicodeStatus = ch;
01543                 break;
01544             }
01545         }
01546 
01547         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
01548         /* Todo: Can the & part be left off implicitly? Does it really save time? */
01549         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
01550         temp[1] = (uint8_t) (ch >> 8 & 0xFF);
01551         temp[0] = (uint8_t) (ch & 0xFF);
01552 
01553         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
01554         {
01555             if (myTarget < targetLimit)
01556             {
01557                 *(myTarget++) = temp[indexToWrite];
01558             }
01559             else
01560             {
01561                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
01562                 *err = U_BUFFER_OVERFLOW_ERROR; /* Todo: is this needed because of ending if */
01563             }
01564         }
01565     }
01566 
01567     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
01568     {
01569         *err = U_BUFFER_OVERFLOW_ERROR;
01570     }
01571 
01572     args->target = (char *) myTarget;
01573     args->source = mySource;
01574 }
01575 
01576 UChar32 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
01577                                                    UErrorCode* err)
01578 {
01579     *err = U_UNSUPPORTED_ERROR;
01580     return 0;
01581 }
01582 
01583 static const UConverterImpl _UTF32LEImpl = {
01584     UCNV_UTF32_LittleEndian,
01585 
01586     NULL,
01587     NULL,
01588 
01589     NULL,
01590     NULL,
01591     NULL,
01592 
01593     T_UConverter_toUnicode_UTF32_LE,
01594     NULL,
01595 /*    T_UConverter_toUnicode_UTF32_LE_OFFSETS_LOGIC, */
01596     T_UConverter_fromUnicode_UTF32_LE,
01597     NULL,
01598 /*    T_UConverter_fromUnicode_UTF32_LE_OFFSETS_LOGIC, */
01599     T_UConverter_getNextUChar_UTF32_LE,
01600 
01601     NULL,
01602     NULL
01603 };
01604 
01606 const UConverterStaticData _UTF32LEStaticData = {
01607   sizeof(UConverterStaticData),
01608 "UTF32_LittleEndian",
01609     1200, UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
01610     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
01611     {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
01612 };
01613 
01614 
01615 const UConverterSharedData _UTF32LEData = {
01616     sizeof(UConverterSharedData), ~((uint32_t) 0),
01617     NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, 
01618     0
01619 };

Generated at Tue Dec 5 10:48:02 2000 for ICU by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000