Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

ucnvmbcs.c

00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 2000, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  ucnvmbcs.c
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 2000jul03
00014 *   created by: Markus W. Scherer
00015 *
00016 *   The current code in this file replaces the previous implementation
00017 *   of conversion code from multi-byte codepages to Unicode and back.
00018 *   This implementation supports the following:
00019 *   - legacy variable-length codepages with up to 4 bytes per character
00020 *   - all Unicode code points (up to 0x10ffff)
00021 *   - efficient distinction of unassigned vs. illegal byte sequences
00022 *   - it is possible in fromUnicode() to directly deal with simple
00023 *     stateful encodings
00024 *   - it is possible to convert Unicode code points other than U+0000
00025 *     to a single zero byte (but not as a fallback)
00026 *
00027 *   Remaining limitations in fromUnicode:
00028 *   - byte sequences must not have leading zero bytes
00029 *   - no fallback mapping from Unicode to a zero byte
00030 *   - limitation to up to 4 bytes per character
00031 */
00032 
00033 #include "unicode/utypes.h"
00034 #include "unicode/ucnv.h"
00035 #include "unicode/ucnv_cb.h"
00036 #include "ucnv_bld.h"
00037 #include "ucnvmbcs.h"
00038 #include "ucnv_cnv.h"
00039 #include "cstring.h"
00040 
00041 /*
00042  * Converting stateless codepage data
00043  * (or codepage data with simple states) to Unicode.
00044  *
00045  * Data structure and algorithm for converting from complex legacy codepages
00046  * to Unicode. (Designed before 2000-may-22.)
00047  *
00048  * The basic idea is that the structure of legacy codepages can be described
00049  * with state tables.
00050  * When reading a byte stream, each input byte causes a state transition.
00051  * Some transitions result in the output of a code point, some result in
00052  * "unassigned" or "illegal" output.
00053  * This is used here for character conversion.
00054  *
00055  * The data structure begins with a state table consisting of a row
00056  * per state, with 256 entries (columns) per row for each possible input
00057  * byte value.
00058  * Each entry is 32 bits wide, with the lower 7 bits containing the next state.
00059  * State 0 is the initial state.
00060  *
00061  * Bit 31 of each entry indicates whether the state is
00062  * terminal (bit 31 set) or not.
00063  *
00064  * Most of the time, the offset values of subsequent states are added
00065  * up to a scalar value. This value will eventually be the index of
00066  * the Unicode code point in a table that follows the state table.
00067  * The effect is that the code points for final state table rows
00068  * are contiguous. The code points of final state rows follow each other
00069  * in the order of the references to those final states by previous
00070  * states, etc.
00071  *
00072  * For some terminal states, the offset is itself the output Unicode
00073  * code point (16 bits for a BMP code point or 20 bits for a code point
00074  * that is written as a surrogate pair).
00075  * For others, the code point in the Unicode table is stored with either
00076  * one or two code units: one for BMP code points, two for a pair of
00077  * surrogates.
00078  * All code points for a final table take up the same number of code
00079  * units, regardless of whether they all actually _use_ the same number
00080  * of code units. This is necessary for simple array access.
00081  *
00082  * An additional feature comes in with what in ICU is called "fallback"
00083  * mappings:
00084  * In addition to round-trippable, precise, 1:1 mappings, there are often
00085  * mappings defined between similar, though not the same, characters.
00086  * Typically, such mappings occur only in fromUnicode mapping tables because
00087  * Unicode has a superset repertoire of most other codepages. However, it
00088  * is possible to provide such mappings in the toUnicode tables, too.
00089  * In this case, the fallback mappings are partly integrated into the
00090  * general state tables because the structure of the encoding includes their
00091  * byte sequences. They are optional mappings when the main mapping is
00092  * "unassigned", and are looked up by the scalar offset of the main mapping
00093  * in a separate table. Only when the main mapping does not have such a
00094  * scalar offset, i.e., in the case of action codes 5 or 6 below (valid-direct),
00095  * would there need to be some different mechanism. Therefore, there are
00096  * separate action codes 3 and 4 (fallback-direct) especially for that.
00097  * The "unassigned" action code 2 cannot be used for fallback lookups because
00098  * it also does not result in a scalar offset. This means that fallback mappings
00099  * require to fit into either fallback-direct action codes or valid-single or
00100  * valid-pair codes that result in scalar offsets.
00101  * "Unassigned" really means "structurally unassigned".
00102  *
00103  * The interpretation of the bits in each entry is as follows:
00104  *
00105  * Bit 31 not set, not a terminal entry:
00106  * 30..7  offset delta, to be added up
00107  *  6..0  next state
00108  *
00109  * Bit 31 set, terminal entry:
00110  * 30..27 action code:
00111  *        0  illegal byte sequence
00112  *           26..7  not used, 0
00113  *        1  state change only
00114  *           26..7  not used, 0
00115  *           useful for state changes in simple stateful encodings,
00116  *           at Shift-In/Shift-Out codes
00117  *        2  unassigned byte sequence
00118  *           26..7  not used, 0
00119  *                  this does not contain a final offset delta because the main
00120  *                  purpose of this action code is to save scalar offset values;
00121  *                  therefore, fallback values cannot be assigned to byte
00122  *                  sequences that result in this action code - use codes 5 or 6
00123  *        3  valid byte sequence (fallback)
00124  *           22..7  16-bit Unicode BMP code point as fallback result
00125  *        4  valid byte sequence (fallback)
00126  *           26..7  20-bit Unicode surrogate code point as fallback result
00127  *
00128  *        action codes 5, 6, 7, and 8 result in precise-mapping Unicode code points
00129  *        5  valid byte sequence
00130  *           22..7  16-bit Unicode BMP code point
00131  *                  never U+fffe or U+ffff (use action codes 0, 2, 3 or 4 for that)
00132  *        6  valid byte sequence
00133  *           26..7  20-bit Unicode surrogate code point
00134  *                  never U+fffe or U+ffff (use action codes 0, 2, 3 or 4 for that)
00135  *
00136  *        action codes 7 and 8 may result in U+fffe (unassigned), in which case the
00137  *        final offset is to be looked up in a special fallback table
00138  *        7  valid byte sequence
00139  *           26..16 not used, 0
00140  *           15..7  final offset delta
00141  *                  pointing to one 16-bit code unit
00142  *                  which may be U+fffe (unassigned) or U+ffff (illegal)
00143  *        8  valid byte sequence
00144  *           26..16 not used, 0
00145  *           15..7  final offset delta
00146  *                  pointing to two 16-bit code units
00147  *                  (UTF-16 surrogates)
00148  *                  the first code unit either is a lead surrogate and indicates
00149  *                  an assigned surrogate pair, or it is a single unit
00150  *                  which may be U+fffe (unassigned) or U+ffff (illegal)
00151  *           (the final offset deltas are at most 255 * 2,
00152  *            times 2 because of storing code unit pairs)
00153  *        9..15 reserved for future use
00154  *           current implementations will only perform a state change
00155  *           and ignore bits 26..7
00156  *  6..0  next state (regardless of action code)
00157  *
00158  * An encoding with contiguous ranges of unassigned byte sequences, like
00159  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
00160  * at least two states for the trail bytes:
00161  * One trail byte state that results in code points, and one that only
00162  * has "unassigned" and "illegal" terminal states.
00163  *
00164  * Note: partly by accident, this data structure supports simple stateless
00165  * encodings without any additional logic.
00166  * Especially simple Shift-In/Shift-Out schemes could be handled with
00167  * appropriate state tables (especially EBCDIC_STATEFUL!).
00168  */
00169 
00170 /* prototypes --------------------------------------------------------------- */
00171 
00172 U_CFUNC void
00173 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
00174                                 UErrorCode *pErrorCode);
00175 
00176 U_CFUNC UChar32
00177 _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
00178                               uint8_t b, UBool useFallback);
00179 
00180 U_CFUNC void
00181 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
00182                                   UErrorCode *pErrorCode);
00183 
00184 static void
00185 fromUCallback(UConverter *cnv,
00186               void *context, UConverterFromUnicodeArgs *pArgs,
00187               const UChar *codeUnits, int32_t length, UChar32 codePoint,
00188               UConverterCallbackReason reason, UErrorCode *pErrorCode);
00189 
00190 static void
00191 toUCallback(UConverter *cnv,
00192             void *context, UConverterToUnicodeArgs *pArgs,
00193             const char *codeUnits, int32_t length,
00194             UConverterCallbackReason reason, UErrorCode *pErrorCode);
00195 
00196 /* GB 18030 data ------------------------------------------------------------ */
00197 
00198 /* helper macros for linear values for GB 18030 four-byte sequences */
/*
 * Linear (scalar) value of the GB 18030 four-byte sequence a b c d.
 * The formula weights the second and fourth byte with a radix of 10 and the
 * third byte with a radix of 126; the byte values are used as they are, so
 * only differences between two linear values are meaningful.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

/* linear value of the four-byte sequence 81 30 81 30 */
#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * Linear value of a four-byte sequence that is packed, first byte in the most
 * significant position, into a single 32-bit value.
 * The argument is fully parenthesized so that an expression such as
 * LINEAR(hi|lo) expands correctly and not only a plain literal.
 */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)

/*
 * Some ranges of GB 18030 where both the Unicode code points and the
 * GB four-byte sequences are contiguous and are handled algorithmically by
 * the special callback functions below.
 * Each row is { Unicode start, Unicode end, GB linear start, GB linear end };
 * both ends are inclusive, so (end-start) is the same for both halves of a row.
 */
static const uint32_t
gb18030Ranges[13][4]={
    { 0x10000, 0x10ffff, LINEAR(0x90308130), LINEAR(0xe3329a35) },
    { 0x9fa6,  0xdfff,   LINEAR(0x82358f34), LINEAR(0x83389837) },
    { 0x0452,  0x200f,   LINEAR(0x8130d239), LINEAR(0x8136a530) },
    { 0xe865,  0xf92b,   LINEAR(0x83389838), LINEAR(0x8431cc32) },
    { 0x2643,  0x2e80,   LINEAR(0x8137a838), LINEAR(0x8138fd37) },
    { 0xfa2a,  0xfe2f,   LINEAR(0x8431e336), LINEAR(0x8432cc35) },
    { 0x3ce1,  0x4055,   LINEAR(0x8231d439), LINEAR(0x8232af33) },
    { 0x361b,  0x3917,   LINEAR(0x8230a634), LINEAR(0x8230f238) },
    { 0x49b8,  0x4c76,   LINEAR(0x8234a132), LINEAR(0x8234e734) },
    { 0x4160,  0x4336,   LINEAR(0x8232c938), LINEAR(0x8232f838) },
    { 0x478e,  0x4946,   LINEAR(0x8233e839), LINEAR(0x82349639) },
    { 0x44d7,  0x464b,   LINEAR(0x8233a430), LINEAR(0x8233c932) },
    { 0xffe6,  0xffff,   LINEAR(0x8432e932), LINEAR(0x8432eb37) }
};
00227 
00228 /* MBCS setup functions ----------------------------------------------------- */
00229 
00230 U_CFUNC void
00231 _MBCSLoad(UConverterSharedData *sharedData,
00232           const uint8_t *raw,
00233           UErrorCode *pErrorCode) {
00234     UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
00235     _MBCSHeader *header=(_MBCSHeader *)raw;
00236 
00237     if(header->version[0]!=1) {
00238         *pErrorCode=U_INVALID_TABLE_FORMAT;
00239         return;
00240     }
00241 
00242     mbcsTable->countStates=(uint8_t)header->countStates;
00243     mbcsTable->countToUFallbacks=header->countToUFallbacks;
00244     mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader));
00245     mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
00246     mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
00247 
00248     mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
00249     mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
00250     mbcsTable->outputType=(uint8_t)header->flags;
00251 }
00252 
00253 U_CFUNC void
00254 _MBCSReset(UConverter *cnv) {
00255     /* toUnicode */
00256     cnv->toUnicodeStatus=0;
00257     cnv->mode=0;
00258     cnv->toULength=0;
00259 
00260     /* fromUnicode */
00261     cnv->fromUSurrogateLead=0;
00262 }
00263 
00264 U_CFUNC void
00265 _MBCSOpen(UConverter *cnv,
00266           const char *name,
00267           const char *locale,
00268           uint32_t options,
00269           UErrorCode *pErrorCode) {
00270     _MBCSReset(cnv);
00271     if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) {
00272         /* set a flag for GB 18030 mode, which changes the callback behavior */
00273         cnv->extraInfo=(void *)gb18030Ranges;
00274     }
00275 }
00276 
00277 /* MBCS-to-Unicode conversion functions ------------------------------------- */
00278 
00279 static UChar32
00280 _MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
00281     const _MBCSToUFallback *toUFallbacks;
00282     uint32_t i, start, limit;
00283 
00284     limit=mbcsTable->countToUFallbacks;
00285     if(limit>0) {
00286         /* do a binary search for the fallback mapping */
00287         toUFallbacks=mbcsTable->toUFallbacks;
00288         start=0;
00289         while(start<limit-1) {
00290             i=(start+limit)/2;
00291             if(offset<toUFallbacks[i].offset) {
00292                 limit=i;
00293             } else {
00294                 start=i;
00295             }
00296         }
00297 
00298         /* did we really find it? */
00299         if(offset==toUFallbacks[start].offset) {
00300             return toUFallbacks[start].codePoint;
00301         }
00302     }
00303 
00304     return 0xfffe;
00305 }
00306 
/*
 * Convert bytes from pArgs->source (up to pArgs->sourceLimit) into UTF-16
 * code units at pArgs->target (up to pArgs->targetLimit) by running the
 * state table of the converter's shared MBCS data.
 *
 * - If the table has exactly one state, the work is delegated to
 *   _MBCSSingleToUnicodeWithOffsets().
 * - If pArgs->offsets is not NULL, one source index is written for each
 *   output code unit; the index is -1 for a character that began in the
 *   previous buffer.
 * - The state of an incomplete character is carried across calls in
 *   cnv->toUnicodeStatus (accumulated offset), cnv->mode (state) and
 *   cnv->toULength (number of bytes kept in cnv->toUBytes).
 * - Illegal and unassigned sequences set U_ILLEGAL_CHAR_FOUND or
 *   U_INVALID_CHAR_FOUND and are handed to toUCallback().
 * - *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full, and
 *   to U_TRUNCATED_CHAR_FOUND when pArgs->flush is set, all input has been
 *   consumed and a byte sequence is still incomplete.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
 */
00307 U_CFUNC void
00308 _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
00309                           UErrorCode *pErrorCode) {
00310     /* set up the local pointers */
00311     UConverter *cnv;
00312     const uint8_t *source, *sourceLimit;
00313     UChar *target;
00314     const UChar *targetLimit;
00315     int32_t *offsets;
00316 
00317     const int32_t (*stateTable)[256];
00318     const uint16_t *unicodeCodeUnits;
00319 
00320     uint32_t offset;
00321     uint8_t state;
00322     int8_t byteIndex;
00323     uint8_t *bytes;
00324 
00325     int32_t sourceIndex, nextSourceIndex;
00326 
00327     int32_t entry;
00328     UChar c;
00329     uint8_t b;
00330     UConverterCallbackReason reason;
00331 
00332     /* use optimized function if possible */
00333     cnv=pArgs->converter;
00334     if(cnv->sharedData->table->mbcs.countStates==1) {
00335         _MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
00336         return;
00337     }
00338 
00339     /* set up the local pointers */
00340     source=(const uint8_t *)pArgs->source;
00341     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
00342     target=pArgs->target;
00343     targetLimit=pArgs->targetLimit;
00344     offsets=pArgs->offsets;
00345 
00346     stateTable=cnv->sharedData->table->mbcs.stateTable;
00347     unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
00348 
00349     /* get the converter state from UConverter */
00350     offset=cnv->toUnicodeStatus;
00351     state=(uint8_t)(cnv->mode);
00352     byteIndex=cnv->toULength;
00353     bytes=cnv->toUBytes;
00354 
00355     /* sourceIndex=-1 if the current character began in the previous buffer */
00356     sourceIndex=byteIndex==0 ? 0 : -1;
00357     nextSourceIndex=0;
00358 
00359     /* conversion loop */
00360     while(source<sourceLimit) {
00361         /*
00362          * This following test is to see if available input would overflow the output.
00363          * It does not catch output of more than one code unit that
00364          * overflows as a result of a surrogate pair or callback output
00365          * from the last source byte.
00366          * Therefore, those situations also test for overflows and will
00367          * then break the loop, too.
00368          */
00369         if(target<targetLimit) {
                  /* remember the byte in case the sequence must be reported to the callback */
00370             bytes[byteIndex++]=b=*source++;
00371             ++nextSourceIndex;
00372             entry=stateTable[state][b];
                  /* entry is signed: a non-negative value means that bit 31 is not set */
00373             if(entry>=0) {
00374                 /*
00375                  * bit 31 is not set, bits:
00376                  * 30..7  offset delta
00377                  *  6..0  next state
00378                  */
00379                 state=(uint8_t)(entry&0x7f);
00380                 offset+=entry>>7;
00381             } else {
00382                 /*
00383                  * bit 31 is set, bits:
00384                  * 30..27 action code
00385                  *        (do not mask out bit 31 for speed, include it in action values)
00386                  * 26..7  depend on the action code
00387                  *  6..0  next state
00388                  */
00389 
00390                 /* set the next state early so that we can reuse the entry variable */
00391                 state=(uint8_t)(entry&0x7f); /* typically 0 */
00392 
00393                 /* switch per action code */
                      /*
                       * After the unsigned shift by 27, bit 31 of the entry ends up as
                       * bit 4 (value 16) of the switch value; this is why every case
                       * label is written as 16|MBCS_STATE_xxx.
                       */
00394                 switch((uint32_t)entry>>27U) {
00395                 case 16|MBCS_STATE_ILLEGAL:
00396                     /* bits 26..7 are not used, 0 */
00397                     /* callback(illegal) */
00398                     reason=UCNV_ILLEGAL;
00399                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
00400                     goto callback;
00401                 case 16|MBCS_STATE_CHANGE_ONLY:
00402                     /* bits 26..7 are not used, 0 */
00403                     /*
00404                      * This serves as a state change without any output.
00405                      * It is useful for reading simple stateful encodings,
00406                      * for example using just Shift-In/Shift-Out codes.
00407                      * The 20 unused bits (26..7) may later be used for more sophisticated
00408                      * state transitions.
00409                      */
00410                     break;
00411                 case 16|MBCS_STATE_UNASSIGNED:
00412                     /* bits 26..7 are not used, 0 */
00413                     /* callback(unassigned) */
00414                     reason=UCNV_UNASSIGNED;
00415                     *pErrorCode=U_INVALID_CHAR_FOUND;
00416                     goto callback;
00417                 case 16|MBCS_STATE_FALLBACK_DIRECT_16:
00418                     /* bits 26..23 are not used, 0 */
00419                     /* bits 22..7 contain the Unicode BMP code point */
00420                     if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
00421                         /* callback(unassigned) */
00422                         reason=UCNV_UNASSIGNED;
00423                         *pErrorCode=U_INVALID_CHAR_FOUND;
00424                         goto callback;
00425                     }
00426                     /* fall through to the MBCS_STATE_VALID_DIRECT_16 branch */
00427                 case 16|MBCS_STATE_VALID_DIRECT_16:
00428                     /* bits 26..23 are not used, 0 */
00429                     /* bits 22..7 contain the Unicode BMP code point */
00430                     /* output BMP code point */
                          /* the cast to UChar keeps only the low 16 bits of the shifted entry */
00431                     *target++=(UChar)(entry>>7);
00432                     if(offsets!=NULL) {
00433                         *offsets++=sourceIndex;
00434                     }
00435                     break;
00436                 case 16|MBCS_STATE_FALLBACK_DIRECT_20:
00437                     /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
00438                     if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
00439                         /* callback(unassigned) */
00440                         reason=UCNV_UNASSIGNED;
00441                         *pErrorCode=U_INVALID_CHAR_FOUND;
00442                         goto callback;
00443                     }
00444                     /* fall through to the MBCS_STATE_VALID_DIRECT_20 branch */
00445                 case 16|MBCS_STATE_VALID_DIRECT_20:
00446                     /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
00447                     entry=(entry>>7)&0xfffff;
00448                     /* output surrogate pair */
00449                     *target++=(UChar)(0xd800|(UChar)(entry>>10));
00450                     if(offsets!=NULL) {
00451                         *offsets++=sourceIndex;
00452                     }
00453                     c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
00454                     if(target<targetLimit) {
00455                         *target++=c;
00456                         if(offsets!=NULL) {
00457                             *offsets++=sourceIndex;
00458                         }
00459                     } else {
00460                         /* target overflow */
                              /* keep the trail surrogate in the converter's UChar error buffer */
00461                         cnv->UCharErrorBuffer[0]=c;
00462                         cnv->UCharErrorBufferLength=1;
00463                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00464 
00465                         offset=0;
00466                         byteIndex=0;
00467                         goto endloop;
00468                     }
00469                     break;
00470                 case 16|MBCS_STATE_VALID_16:
00471                     /* bits 26..16 are not used, 0 */
00472                     /* bits 15..7 contain the final offset delta to one 16-bit code unit */
00473                     offset+=(uint16_t)entry>>7;
00474                     c=unicodeCodeUnits[offset];
00475                     if(c<0xfffe) {
00476                         /* output BMP code point */
00477                         *target++=c;
00478                         if(offsets!=NULL) {
00479                             *offsets++=sourceIndex;
00480                         }
00481                     } else if(c==0xfffe) {
                              /* unassigned: try the fallback table, keyed by the scalar offset */
00482                         if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
00483                             goto output32;
00484                         }
00485                         /* callback(unassigned) */
00486                         reason=UCNV_UNASSIGNED;
00487                         *pErrorCode=U_INVALID_CHAR_FOUND;
00488                         goto callback;
00489                     } else {
00490                         /* callback(illegal) */
00491                         reason=UCNV_ILLEGAL;
00492                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
00493                         goto callback;
00494                     }
00495                     break;
00496                 case 16|MBCS_STATE_VALID_16_PAIR:
00497                     /* bits 26..16 are not used, 0 */
00498                     /* bits 15..7 contain the final offset delta to two 16-bit code units */
00499                     offset+=(uint16_t)entry>>7;
00500                     c=unicodeCodeUnits[offset++];
00501                     if(UTF_IS_FIRST_SURROGATE(c)) {
00502                         *target++=c;
00503                         if(offsets!=NULL) {
00504                             *offsets++=sourceIndex;
00505                         }
00506                         if(target<targetLimit) {
00507                             *target++=unicodeCodeUnits[offset];
00508                             if(offsets!=NULL) {
00509                                 *offsets++=sourceIndex;
00510                             }
00511                         } else {
00512                             /* target overflow */
00513                             cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
00514                             cnv->UCharErrorBufferLength=1;
00515                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00516 
00517                             offset=0;
00518                             byteIndex=0;
00519                             goto endloop;
00520                         }
00521                     } else if(c<0xfffe) {
00522                         /* output BMP code point */
00523                         *target++=c;
00524                         if(offsets!=NULL) {
00525                             *offsets++=sourceIndex;
00526                         }
00527                     } else if(c==0xfffe) {
00528                         /*
00529                          * For the fallback, we need to restore the offset that
00530                          * we had before the unicodeCodeUnits[offset++] above that incremented it!
00531                          */
00532                         if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset-1))!=0xfffe) {
00533                             goto output32;
00534                         }
00535                         /* callback(unassigned) */
00536                         reason=UCNV_UNASSIGNED;
00537                         *pErrorCode=U_INVALID_CHAR_FOUND;
00538                         goto callback;
00539                     } else {
00540                         /* callback(illegal) */
00541                         reason=UCNV_ILLEGAL;
00542                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
00543                         goto callback;
00544                     }
00545                     break;
00546                 default:
00547                     /* reserved, must never occur */
00548                     /* bits 26..7 are not used, 0 */
00549                     break;
00550                 }
00551 
00552                 /* normal end of action codes: prepare for a new character */
00553                 offset=0;
00554                 byteIndex=0;
00555                 sourceIndex=nextSourceIndex;
00556                 continue;
00557 
00558                 /*
00559                  * Markus Scherer 2000-jul-05
00560                  *
00561                  * The following is extremely ugly, and I apologize for it:
00562                  * Several places in the above switch statement need to call
00563                  * a callback function or output a 32-bit code point,
00564                  * each of which is an involved process with
00565                  * a couple dozen of statements.
00566                  *
00567                  * I could do this in a function call, but I fear that then
00568                  * the compiler does not keep the frequently used variables in
00569                  * registers because the function call would need them on the stack
00570                  * for input and output.
00571                  *
00572                  * I could do this with a macro, but that is harder to debug and
00573                  * bloats the compiled code.
00574                  *
00575                  * I could just copy and paste the code, but that would also bloat
00576                  * the program size, make the pieces harder to maintain, and make
00577                  * the switch statement extremely long and clumsy.
00578                  *
00579                  * Therefore, those places goto here and do it all in one place,
00580                  * while the normal processing has a continue above and skips this
00581                  * part.
00582                  * This actually _saves_ goto statements, too:
00583                  * Since it is not possible in C to break a loop from within a switch
00584                  * statement, the callback code in the switch statement would have to
00585                  * goto behind the loop. Here, it can break if necessary.
00586                  */
00587 
00588 output32:
00589                 /* output a 32-bit (21-bit) Unicode code point stored in entry */
00590                 if(entry<=0xffff) {
00591                     /* output BMP code point */
00592                     *target++=(UChar)entry;
00593                     if(offsets!=NULL) {
00594                         *offsets++=sourceIndex;
00595                     }
00596                 } else {
00597                     /* output surrogate pair */
                          /*
                           * entry holds the full code point here (0x10000 is not subtracted),
                           * so the lead surrogate is 0xd7c0+(entry>>10),
                           * with 0xd7c0==0xd800-(0x10000>>10).
                           */
00598                     *target++=(UChar)(0xd7c0+(entry>>10));
00599                     if(offsets!=NULL) {
00600                         *offsets++=sourceIndex;
00601                     }
00602                     c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
00603                     if(target<targetLimit) {
00604                         *target++=c;
00605                         if(offsets!=NULL) {
00606                             *offsets++=sourceIndex;
00607                         }
00608                     } else {
00609                         /* target overflow */
00610                         cnv->UCharErrorBuffer[0]=c;
00611                         cnv->UCharErrorBufferLength=1;
00612                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00613 
00614                         offset=0;
00615                         byteIndex=0;
                              /* we are outside of the switch here: this break leaves the conversion loop */
00616                         break;
00617                     }
00618                 }
00619 
00620                 /* same as normal end of action codes: prepare for a new character */
00621                 offset=0;
00622                 byteIndex=0;
00623                 sourceIndex=nextSourceIndex;
00624                 continue;
00625 
00626 callback:
00627                 /* call the callback function with all the preparations and post-processing */
00628                 /* update the arguments structure */
00629                 pArgs->source=(const char *)source;
00630                 pArgs->target=target;
00631                 pArgs->offsets=offsets;
00632 
00633                 /* copy the current bytes to invalidCharBuffer */
00634                 for(b=0; b<(uint8_t)byteIndex; ++b) {
00635                     cnv->invalidCharBuffer[b]=(char)bytes[b];
00636                 }
00637                 cnv->invalidCharLength=byteIndex;
00638 
00639                 /* set the converter state in UConverter to deal with the next character */
00640                 cnv->toUnicodeStatus=0;
00641                 cnv->mode=state;
00642                 cnv->toULength=0;
00643 
00644                 /* call the callback function */
00645                 toUCallback(cnv, cnv->toUContext, pArgs, (const char *)bytes, byteIndex, reason, pErrorCode);
00646 
00647                 /* get the converter state from UConverter */
00648                 offset=cnv->toUnicodeStatus;
00649                 state=(uint8_t)cnv->mode;
00650                 byteIndex=cnv->toULength;
00651 
00652                 /* update target and deal with offsets if necessary */
00653                 offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
00654                 target=pArgs->target;
00655 
00656                 /* update the source pointer and index */
                      /* NOTE(review): nextSourceIndex is not advanced by the same amount here - confirm that this is intended */
00657                 sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
00658                 source=(const uint8_t *)pArgs->source;
00659 
00660                 /*
00661                  * If the callback overflowed the target, then we need to
00662                  * stop here with an overflow indication.
00663                  */
00664                 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
00665                     break;
00666                 } else if(cnv->UCharErrorBufferLength>0) {
00667                     /* target is full */
00668                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00669                     break;
00670                 } else if(U_FAILURE(*pErrorCode)) {
00671                     /* break on error */
00672                     offset=0;
00673                     state=0;
00674                     byteIndex=0;
00675                     break;
00676                 }
00677 
00678                 /*
00679                  * We do not need to repeat the statements from the normal
00680                  * end of the action codes because we already updated all the
00681                  * necessary variables.
00682                  */
00683             }
00684         } else {
00685             /* target is full */
00686             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00687             break;
00688         }
00689     }
00690 endloop:
00691 
      /* flushing with all input consumed: report a truncated sequence and reset; otherwise save the state */
00692     if(pArgs->flush && source>=sourceLimit) {
00693         /* reset the state for the next conversion */
00694         if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
00695             /* a character byte sequence remains incomplete */
00696             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
00697         }
00698         cnv->toUnicodeStatus=0;
00699         cnv->mode=0;
00700         cnv->toULength=0;
00701     } else {
00702         /* set the converter state back into UConverter */
00703         cnv->toUnicodeStatus=offset;
00704         cnv->mode=state;
00705         cnv->toULength=byteIndex;
00706     }
00707 
00708     /* write back the updated pointers */
00709     pArgs->source=(const char *)source;
00710     pArgs->target=target;
00711     pArgs->offsets=offsets;
00712 }
00713 
/*
 * ToUnicode entry point without an implementation of its own:
 * it passes pArgs and pErrorCode unchanged to _MBCSToUnicodeWithOffsets(),
 * so conversion, converter state handling and error reporting are exactly
 * those of that function.
 */
00714 U_CFUNC void
00715 _MBCSToUnicode(UConverterToUnicodeArgs *pArgs,
00716                UErrorCode *pErrorCode) {
00717     _MBCSToUnicodeWithOffsets(pArgs, pErrorCode);
00718 }
00719 
00720 /* This version of _MBCSToUnicode() is optimized for single-byte, single-state codepages. */
      /*
       * Every source byte is looked up in stateTable[0] only and is handled as a
       * complete character: it yields one UChar, a surrogate pair, or a call to
       * the toUnicode callback (reserved action codes yield nothing).
       *
       * pArgs      - converter, source/sourceLimit, target/targetLimit and offsets
       *              are read on entry; source, target and offsets are written
       *              back on exit.
       * pErrorCode - set to U_ILLEGAL_CHAR_FOUND or U_INVALID_CHAR_FOUND before the
       *              callback is invoked, and to U_BUFFER_OVERFLOW_ERROR when the
       *              target is full or output is pending in UCharErrorBuffer.
       */
00721 U_CFUNC void
00722 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
00723                                 UErrorCode *pErrorCode) {
00724     /* set up the local pointers */
00725     UConverter *cnv;
00726     const uint8_t *source, *sourceLimit;
00727     UChar *target;
00728     const UChar *targetLimit;
00729     int32_t *offsets;
00730 
00731     const int32_t (*stateTable)[256];
00732 
00733     int32_t sourceIndex, nextSourceIndex;
00734 
00735     int32_t entry;
00736     UChar c;
00737     uint8_t b;
00738     UConverterCallbackReason reason;
00739 
00740     /* set up the local pointers */
00741     cnv=pArgs->converter;
00742     source=(const uint8_t *)pArgs->source;
00743     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
00744     target=pArgs->target;
00745     targetLimit=pArgs->targetLimit;
00746     offsets=pArgs->offsets;
00747 
00748     stateTable=cnv->sharedData->table->mbcs.stateTable;
00749 
00750     /* both indexes start at 0; this function reads no partial character state from the converter */
00751     sourceIndex=0;
00752     nextSourceIndex=0;
00753 
00754     /* conversion loop */
00755     while(source<sourceLimit) {
00756         /*
00757          * This following test is to see if available input would overflow the output.
00758          * It does not catch output of more than one code unit that
00759          * overflows as a result of a surrogate pair or callback output
00760          * from the last source byte.
00761          * Therefore, those situations also test for overflows and will
00762          * then break the loop, too.
00763          */
00764         if(target<targetLimit) {
00765             b=*source++;
00766             ++nextSourceIndex;
00767             entry=stateTable[0][b];
00768             /* entry<0 */
00769             /*
00770              * bit 31 is set, bits:
00771              * 30..27 action code
00772              *        (do not mask out bit 31 for speed, include it in action values)
00773              * 26..7  depend on the action code
00774              *  6..0  next state
00775              */
00776 
00777             /* switch per action code */
00778             switch((uint32_t)entry>>27U) {
00779             case 16|MBCS_STATE_ILLEGAL:
00780                 /* bits 26..7 are not used, 0 */
00781                 /* callback(illegal) */
00782                 reason=UCNV_ILLEGAL;
00783                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
00784                 goto callback;
00785             case 16|MBCS_STATE_UNASSIGNED:
00786                 /* bits 26..7 are not used, 0 */
00787                 /* callback(unassigned) */
00788                 reason=UCNV_UNASSIGNED;
00789                 *pErrorCode=U_INVALID_CHAR_FOUND;
00790                 goto callback;
00791             case 16|MBCS_STATE_FALLBACK_DIRECT_16:
00792                 /* bits 26..23 are not used, 0 */
00793                 /* bits 22..7 contain the Unicode BMP code point */
00794                 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
00795                     /* callback(unassigned) */
00796                     reason=UCNV_UNASSIGNED;
00797                     *pErrorCode=U_INVALID_CHAR_FOUND;
00798                     goto callback;
00799                 }
00800                 /* fall through to the MBCS_STATE_VALID_DIRECT_16 branch */
00801             case 16|MBCS_STATE_VALID_DIRECT_16:
00802                 /* bits 26..23 are not used, 0 */
00803                 /* bits 22..7 contain the Unicode BMP code point */
00804                 /* output BMP code point */
00805                 *target++=(UChar)(entry>>7);
00806                 if(offsets!=NULL) {
00807                     *offsets++=sourceIndex;
00808                 }
00809                 break;
00810             case 16|MBCS_STATE_FALLBACK_DIRECT_20:
00811                 /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
00812                 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
00813                     /* callback(unassigned) */
00814                     reason=UCNV_UNASSIGNED;
00815                     *pErrorCode=U_INVALID_CHAR_FOUND;
00816                     goto callback;
00817                 }
00818                 /* fall through to the MBCS_STATE_VALID_DIRECT_20 branch */
00819             case 16|MBCS_STATE_VALID_DIRECT_20:
00820                 /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
00821                 entry=(entry>>7)&0xfffff;
00822                 /* output surrogate pair */
00823                 *target++=(UChar)(0xd800|(UChar)(entry>>10));
00824                 if(offsets!=NULL) {
00825                     *offsets++=sourceIndex;
00826                 }
00827                 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
00828                 if(target<targetLimit) {
00829                     *target++=c;
00830                     if(offsets!=NULL) {
00831                         *offsets++=sourceIndex;
00832                     }
00833                 } else {
00834                     /* target overflow: keep the trail surrogate in UCharErrorBuffer and stop */
00835                     cnv->UCharErrorBuffer[0]=c;
00836                     cnv->UCharErrorBufferLength=1;
00837                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00838                     goto endloop;
00839                 }
00840                 break;
00841             default:
00842                 /* reserved, must never occur */
00843                 /* bits 26..7 are not used, 0 */
                      /* if it does occur, the byte is consumed without any output or error */
00844                 break;
00845             }
00846 
00847             /* normal end of action codes: prepare for a new character */
00848             sourceIndex=nextSourceIndex;
00849             continue;
00850 
00851 callback:
00852             /* call the callback function with all the preparations and post-processing */
00853             /* update the arguments structure */
00854             pArgs->source=(const char *)source;
00855             pArgs->target=target;
00856             pArgs->offsets=offsets;
00857 
00858             /* copy the current bytes to invalidCharBuffer */
00859             cnv->invalidCharBuffer[0]=b;
00860             cnv->invalidCharLength=1;
00861 
00862             /* call the callback function */
00863             toUCallback(cnv, cnv->toUContext, pArgs, (const char *)&b, 1, reason, pErrorCode);
00864 
00865             /* update target and deal with offsets if necessary */
00866             offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
00867             target=pArgs->target;
00868 
00869             /* update the source pointer and index */
                  /*
                   * NOTE(review): sourceIndex accounts for source bytes that the
                   * callback consumed, but nextSourceIndex is not advanced here, and
                   * the next normal character resets sourceIndex from it. Confirm
                   * whether offsets stay correct when a callback moves pArgs->source.
                   */
00870             sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
00871             source=(const uint8_t *)pArgs->source;
00872 
00873             /*
00874              * If the callback overflowed the target, then we need to
00875              * stop here with an overflow indication.
00876              */
00877             if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
00878                 break;
00879             } else if(cnv->UCharErrorBufferLength>0) {
00880                 /* target is full */
00881                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00882                 break;
00883             } else if(U_FAILURE(*pErrorCode)) {
00884                 /* break on error */
00885                 break;
00886             }
00887 
00888             /*
00889              * We do not need to repeat the statements from the normal
00890              * end of the action codes because we already updated all the
00891              * necessary variables.
00892              */
00893         } else {
00894             /* target is full */
00895             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00896             break;
00897         }
00898     }
00899 endloop:
00900 
00901     /* write back the updated pointers */
00902     pArgs->source=(const char *)source;
00903     pArgs->target=target;
00904     pArgs->offsets=offsets;
00905 }
00906 
00907 /*
00908  * This is a simple, interim implementation of GetNextUChar()
00909  * that allows concentrating on testing one single implementation
00910  * of the ToUnicode conversion before it gets copied to
00911  * multiple versions that are then optimized for their needs
00912  * (with vs. without offsets and getNextUChar).
00913  * ### TODO: implement this directly similar to ToUnicode()
       *
       * The input is fed to _MBCSToUnicode() one byte at a time, with a local
       * buffer of UTF_MAX_CHAR_LENGTH UChars as the target, until a call
       * produces output.
       *
       * pArgs      - source is advanced past the consumed bytes. target,
       *              targetLimit and flush are overwritten; on return target and
       *              targetLimit refer to this function's local buffer and must
       *              not be used by the caller. sourceLimit is restored to the
       *              caller's value after every conversion step.
       * pErrorCode - a failure other than U_BUFFER_OVERFLOW_ERROR is passed
       *              through; U_BUFFER_OVERFLOW_ERROR is reset to U_ZERO_ERROR
       *              when output was produced; U_INDEX_OUTOFBOUNDS_ERROR is set
       *              when the input ends without any output.
       *
       * Returns the result of ucnv_getUChar32KeepOverflow() for the produced
       * UChars, or 0xffff in both error cases above.
00914  */
00915 U_CFUNC UChar32
00916 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
00917                   UErrorCode *pErrorCode) {
00918     UChar buffer[UTF_MAX_CHAR_LENGTH];
00919     const char *realLimit=pArgs->sourceLimit;
00920 
00921     pArgs->target=buffer;
00922     pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
00923 
00924     while(pArgs->source<realLimit) {
00925         /* feed in one byte at a time to make sure to get only one character out */
00926         pArgs->sourceLimit=pArgs->source+1;
00927         pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
00928         _MBCSToUnicode(pArgs, pErrorCode);
              /*
               * Undo the one-byte window immediately so that every return path
               * hands the caller's own sourceLimit back in pArgs.
               */
              pArgs->sourceLimit=realLimit;
00929         if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
00930             return 0xffff;
00931         } else {
00932             int32_t length=pArgs->target-buffer;
00933 #if 0
00934             /*
00935              *     markus 2000-oct-26
00936              *
00937              * This version of the exit condition is commented out because of
00938              * a clarification of the semantics of ucnv_getNextUChar() (see updated javadoc):
00939              *
00940              * Codepages that provide direct encodings of supplementary Unicode code points (U+10000 and up)
00941              * should return single surrogates without combining them into pairs if single surrogates
00942              * are encoded. This group of codepages includes UTF-8, UTF-32, and GB 18030.
00943              *
00944              * Codepages that provide direct encodings only of single surrogates
00945              * must attempt to match pairs of them into supplementary code points.
00946              * Single surrogates are returned only if they are not part of matched pairs.
00947              * This group of codepages includes SCSU, LMBCS, and UTF-16.
00948              *
00949              * Currently, there is no MBCS codepage in the second group. SCSU, LMBCS, and UTF-16
00950              * are implemented with separate code.
00951              *
00952              * Therefore, this feature is removed here.
00953              * It might need to be added back in later when some MBCS codepages are created that
00954              * fall into the second group. In this case, a flag in the .cnv file will be necessary
00955              * to indicate this. makeconv would need to set this flag based on whether the codepage
00956              * contains only mappings for single surrogates but
00957              * not directly for any supplementary code points.
00958              */
00959             if(/* some output and (source consumed or not a surrogate or a surrogate pair [UTF-16 specific]) */
00960                length>0 &&
00961                (pArgs->flush || !UTF_IS_FIRST_SURROGATE(buffer[0]) || length==2)
00962 #endif
00963             if(length>0) {
                      /* an overflow only means that more than fit into buffer was produced; not an error here */
00964                 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
00965                     *pErrorCode=U_ZERO_ERROR;
00966                 }
00967                 return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, length);
00968             }
00969         }
00970     }
00971 
00972     /* no output because of empty input or only state changes and skipping callbacks */
00973     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
00974     return 0xffff;
00975 }
00976 
00977 /*
00978  * This is a simple version of getNextUChar() that is used
00979  * by other converter implementations.
00980  * It does not use state from the converter, nor error codes.
00981  *
       * sharedData  - supplies the state table and the Unicode code unit table
       * pSource     - in/out: advanced past the bytes that were consumed
       * sourceLimit - end of the input
       * useFallback - passed to TO_U_USE_FALLBACK() to decide whether a fallback
       *               mapping is returned or reported as unassigned
       *
00982  * Return value:
00983  * U+fffe   unassigned
       *          (also for empty input and for input that ends after state changes only)
00984  * U+ffff   illegal
       *          (also when the input ends in the middle of a byte sequence)
00985  * otherwise the Unicode code point
00986  */
00987 U_CFUNC UChar32
00988 _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
00989                         const char **pSource, const char *sourceLimit,
00990                         UBool useFallback) {
00991     const uint8_t *source;
00992 
00993     const int32_t (*stateTable)[256];
00994     const uint16_t *unicodeCodeUnits;
00995 
00996     uint32_t offset;
00997     uint8_t state;
00998 
00999     int32_t entry;
01000 
01001     /* set up the local pointers */
01002     source=(const uint8_t *)*pSource;
01003     if(source>=(const uint8_t *)sourceLimit) {
01004         /* no input at all: "unassigned" */
01005         return 0xfffe;
01006     }
01007 
01008     /* use optimized function if possible */
          /* with a single state, exactly one byte is consumed from *pSource and looked up directly */
01009     if(sharedData->table->mbcs.countStates==1) {
01010         return _MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)(*(*pSource)++), useFallback);
01011     }
01012 
01013     stateTable=sharedData->table->mbcs.stateTable;
01014     unicodeCodeUnits=sharedData->table->mbcs.unicodeCodeUnits;
01015 
01016     /* converter state */
          /* always starts from the initial state; nothing is carried over between calls */
01017     offset=0;
01018     state=0;
01019 
01020     /* conversion loop */
01021     do {
01022         entry=stateTable[state][*source++];
01023         if(entry>=0) {
01024             /*
01025              * bit 31 is not set, bits:
01026              * 30..7  offset delta
01027              *  6..0  next state
01028              */
01029             state=(uint8_t)(entry&0x7f);
01030             offset+=entry>>7;
01031         } else {
01032             /*
01033              * bit 31 is set, bits:
01034              * 30..27 action code
01035              *        (do not mask out bit 31 for speed, include it in action values)
01036              * 26..7  depend on the action code
01037              *  6..0  next state
01038              */
01039 
                  /* the byte sequence is complete: report the consumed bytes before any return below */
01040             *pSource=(const char *)source;
01041 
01042             /* switch per action code */
01043             switch((uint32_t)entry>>27U) {
01044             case 16|MBCS_STATE_ILLEGAL:
01045                 /* bits 26..7 are not used, 0 */
01046                 return 0xffff;
01047             case 16|MBCS_STATE_CHANGE_ONLY:
01048                 /* bits 26..7 are not used, 0 */
01049                 /*
01050                  * This serves as a state change without any output.
01051                  * It is useful for reading simple stateful encodings,
01052                  * for example using just Shift-In/Shift-Out codes.
01053                  * The 21 unused bits may later be used for more sophisticated
01054                  * state transitions.
01055                  */
01056                 if(source==(const uint8_t *)sourceLimit) {
01057                     /* if there are only state changes, then return "unassigned" */
01058                     return 0xfffe;
01059                 }
01060                 break;
01061             case 16|MBCS_STATE_UNASSIGNED:
01062                 /* bits 26..7 are not used, 0 */
01063                 return 0xfffe;
01064             case 16|MBCS_STATE_FALLBACK_DIRECT_16:
01065                 /* bits 26..23 are not used, 0 */
01066                 /* bits 22..7 contain the Unicode BMP code point */
01067                 if(!TO_U_USE_FALLBACK(useFallback)) {
01068                     return 0xfffe;
01069                 }
01070                 /* fall through to the MBCS_STATE_VALID_DIRECT_16 branch */
01071             case 16|MBCS_STATE_VALID_DIRECT_16:
01072                 /* bits 26..23 are not used, 0 */
01073                 /* bits 22..7 contain the Unicode BMP code point */
01074                 /* output BMP code point */
01075                 return (UChar)(entry>>7);
01076             case 16|MBCS_STATE_FALLBACK_DIRECT_20:
01077                 /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
01078                 if(!TO_U_USE_FALLBACK(useFallback)) {
01079                     return 0xfffe;
01080                 }
01081                 /* fall through to the MBCS_STATE_VALID_DIRECT_20 branch */
01082             case 16|MBCS_STATE_VALID_DIRECT_20:
01083                 /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
01084                 return 0x10000+((entry>>7)&0xfffff);
01085             case 16|MBCS_STATE_VALID_16:
01086                 /* bits 26..16 are not used, 0 */
01087                 /* bits 15..7 contain the final offset delta to one 16-bit code unit */
01088                 offset+=(uint16_t)entry>>7;
01089                 entry=unicodeCodeUnits[offset];
01090                 if(entry!=0xfffe) {
01091                     return (UChar32)entry;
01092                 } else {
                          /*
                           * 0xfffe in the table: the result is whatever _MBCSGetFallback()
                           * returns for this offset. Note that useFallback is not consulted
                           * on this path - TODO confirm that this is intended.
                           */
01093                     return _MBCSGetFallback(&sharedData->table->mbcs, offset);
01094                 }
01095             case 16|MBCS_STATE_VALID_16_PAIR:
01096                 /* bits 26..16 are not used, 0 */
01097                 /* bits 15..7 contain the final offset delta to two 16-bit code units */
01098                 offset+=(uint16_t)entry>>7;
01099                 entry=unicodeCodeUnits[offset++];
01100                 if(UTF_IS_FIRST_SURROGATE(entry)) {
                          /* offset now indexes the second code unit, which is used as the trail surrogate */
01101                     return UTF16_GET_PAIR_VALUE(entry, unicodeCodeUnits[offset]);
01102                 } else if(entry!=0xfffe) {
01103                     /* output BMP code point */
01104                     return (UChar32)entry;
01105                 } else {
01106                     /*
01107                      * For the fallback, we need to restore the offset that
01108                      * we had before the unicodeCodeUnits[offset++] above that incremented it!
01109                      */
01110                     return _MBCSGetFallback(&sharedData->table->mbcs, offset-1);
01111                 }
01112             default:
01113                 /* reserved, must never occur */
01114                 /* bits 26..7 are not used, 0 */
                      /* if it does occur, it is handled like a state change without output */
01115                 break;
01116             }
01117 
01118             /* state change only - prepare for a new character */
01119             state=(uint8_t)(entry&0x7f); /* typically 0 */
01120             offset=0;
01121         }
01122     } while(source<(const uint8_t *)sourceLimit);
01123 
          /* the input ended before a final state table entry was reached: all input is consumed, "illegal" */
01124     *pSource=(const char *)source;
01125     return 0xffff;
01126 }
01127 
01128 /* This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. */
      /*
       * Looks up the single byte b in stateTable[0] and decodes the entry directly.
       *
       * sharedData  - supplies the state table
       * b           - the input byte
       * useFallback - passed to TO_U_USE_FALLBACK() to decide whether a fallback
       *               mapping is returned or reported as unassigned
       *
       * Return value:
       * U+fffe   unassigned, or a fallback mapping while fallbacks are not used
       * U+ffff   illegal, or a reserved action code
       * otherwise the Unicode code point
       */
01129 U_CFUNC UChar32
01130 _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
01131                               uint8_t b, UBool useFallback) {
01132     int32_t entry;
01133 
01134     entry=sharedData->table->mbcs.stateTable[0][b];
01135     /* entry<0 */
01136     /*
01137      * bit 31 is set, bits:
01138      * 30..27 action code
01139      *        (do not mask out bit 31 for speed, include it in action values)
01140      * 26..7  depend on the action code
01141      *  6..0  next state
01142      */
01143 
01144     /* switch per action code */
01145     switch((uint32_t)entry>>27U) {
01146     case 16|MBCS_STATE_ILLEGAL:
01147         /* bits 26..7 are not used, 0 */
01148         return 0xffff;
01149     case 16|MBCS_STATE_UNASSIGNED:
01150         /* bits 26..7 are not used, 0 */
01151         return 0xfffe;
01152     case 16|MBCS_STATE_FALLBACK_DIRECT_16:
01153         /* bits 26..23 are not used, 0 */
01154         /* bits 22..7 contain the Unicode BMP code point */
01155         if(!TO_U_USE_FALLBACK(useFallback)) {
01156             return 0xfffe;
01157         }
01158         /* fall through to the MBCS_STATE_VALID_DIRECT_16 branch */
01159     case 16|MBCS_STATE_VALID_DIRECT_16:
01160         /* bits 26..23 are not used, 0 */
01161         /* bits 22..7 contain the Unicode BMP code point */
01162         /* output BMP code point */
01163         return (UChar)(entry>>7);
01164     case 16|MBCS_STATE_FALLBACK_DIRECT_20:
01165         /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
01166         if(!TO_U_USE_FALLBACK(useFallback)) {
01167             return 0xfffe;
01168         }
01169         /* fall through to the MBCS_STATE_VALID_DIRECT_20 branch */
01170     case 16|MBCS_STATE_VALID_DIRECT_20:
01171         /* bits 26..7 contain the Unicode surrogate code point minus 0x10000 */
01172         return 0x10000+((entry>>7)&0xfffff);
01173     default:
01174         /* reserved, must never occur */
01175         /* bits 26..7 are not used, 0 */
              /* reported as "illegal" if it does occur */
01176         return 0xffff;
01177     }
01178 }
01179 
01180 /* MBCS-from-Unicode conversion functions ----------------------------------- */
01181 
01182 U_CFUNC void
01183 _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
01184                             UErrorCode *pErrorCode) {
01185     UConverter *cnv;
01186     const UChar *source, *sourceLimit;
01187     uint8_t *target;
01188     int32_t targetCapacity;
01189     int32_t *offsets;
01190 
01191     const uint16_t *table;
01192     const uint8_t *bytes;
01193     uint8_t outputType;
01194 
01195     UChar32 c;
01196 
01197     int32_t sourceIndex, nextSourceIndex;
01198 
01199     UConverterCallbackReason reason;
01200     uint32_t i;
01201     uint32_t value;
01202     int32_t length;
01203 
01204     /* use optimized function if possible */
01205     cnv=pArgs->converter;
01206     outputType=cnv->sharedData->table->mbcs.outputType;
01207     if(outputType==MBCS_OUTPUT_1) {
01208         _MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
01209         return;
01210     }
01211 
01212     /* set up the local pointers */
01213     source=pArgs->source;
01214     sourceLimit=pArgs->sourceLimit;
01215     target=(uint8_t *)pArgs->target;
01216     targetCapacity=pArgs->targetLimit-pArgs->target;
01217     offsets=pArgs->offsets;
01218 
01219     table=cnv->sharedData->table->mbcs.fromUnicodeTable;
01220     bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
01221 
01222     /* get the converter state from UConverter */
01223     c=cnv->fromUSurrogateLead;
01224 
01225     /* sourceIndex=-1 if the current character began in the previous buffer */
01226     sourceIndex= c==0 ? 0 : -1;
01227     nextSourceIndex=0;
01228 
01229     /* conversion loop */
01230     /*
01231      * This is another piece of ugly code:
01232      * A goto into the loop if the converter state contains a first surrogate
01233      * from the previous function call.
01234      * It saves me from checking if(c==0) in each loop iteration
01235      * and from duplicating the trail-surrogate-handling code in the else
01236      * branch of that check.
01237      * I could not find any other way to get around this other than
01238      * using a function call for the conversion and callback, which would
01239      * be even more inefficient.
01240      *
01241      * Markus Scherer 2000-jul-19
01242      */
01243     if(c!=0 && targetCapacity>0) {
01244         goto getTrail;
01245     }
01246 
01247     while(source<sourceLimit) {
01248         /*
01249          * This following test is to see if available input would overflow the output.
01250          * It does not catch output of more than one byte that
01251          * overflows as a result of a multi-byte character or callback output
01252          * from the last source character.
01253          * Therefore, those situations also test for overflows and will
01254          * then break the loop, too.
01255          */
01256         if(targetCapacity>0) {
01257             /*
01258              * Get a correct Unicode code point:
01259              * a single UChar for a BMP code point or
01260              * a matched surrogate pair for a "surrogate code point".
01261              */
01262             c=*source++;
01263             ++nextSourceIndex;
01264             if(UTF_IS_SURROGATE(c)) {
01265                 if(UTF_IS_SURROGATE_FIRST(c)) {
01266 getTrail:
01267                     if(source<sourceLimit) {
01268                         /* test the following code unit */
01269                         UChar trail=*source;
01270                         if(UTF_IS_SECOND_SURROGATE(trail)) {
01271                             ++source;
01272                             ++nextSourceIndex;
01273                             c=UTF16_GET_PAIR_VALUE(c, trail);
01274                             /* convert this surrogate code point */
01275                             /* exit this condition tree */
01276                         } else {
01277                             /* this is an unmatched lead code unit (1st surrogate) */
01278                             /* callback(illegal) */
01279                             reason=UCNV_ILLEGAL;
01280                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
01281                             goto callback;
01282                         }
01283                     } else {
01284                         /* no more input */
01285                         break;
01286                     }
01287                 } else {
01288                     /* this is an unmatched trail code unit (2nd surrogate) */
01289                     /* callback(illegal) */
01290                     reason=UCNV_ILLEGAL;
01291                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
01292                     goto callback;
01293                 }
01294             }
01295 
01296             /* convert the Unicode code point in c into codepage bytes */
01297 
01298             /*
01299              * The basic lookup is a triple-stage compact array lookup:
01300              *
01301              * Bits 21..10 (0x440 different values because Unicode code points
01302              * reach up to 0x10ffff) are used as an index into table[],
01303              * then bits 9..4 are added to that and together multiplied by 2
01304              * to be used as an index into a second table that starts at table+0x440.
01305              *
01306              * In that second table, there will be two 16-bit values
01307              * (and therefore we multiplied by two in the previous step):
01308              * One 16-bit value stores a bit for each of the 16 Unicode code points
01309              * that are grouped here to indicate if it is assigned or not.
01310              * If it is not assigned, there may still be a codepage character
01311              * stored in the third stage: a fallback value. It is used only when
01312              * fallbacks are turned on for the converter. If the code point is
01313              * unassigned and fallbacks not used or there is no fallback character
01314              * (all bytes 0), then the callback function is called.
01315              *
01316              * The second value in the second table (stage) is an index into
01317              * the third table. It is multiplied by 16*(bytes stored per character)
01318              * to get to the first of 16 characters. At last, bits 3..0 of
01319              * the Unicode code point are multiplied by (bytes stored per character)
01320              * and added to that index for the address of the output codepage
01321              * character.
01322              *
01323              * For EUC encodings that use only either 0x8e or 0x8f as the first
01324              * byte of their longest byte sequences, the first two bytes in
01325              * this third stage indicate with their 7th bits whether these bytes
01326              * are to be written directly or actually need to be preceded by
01327              * one of the two Single-Shift codes. With this, the third stage
01328              * stores one byte fewer per character than the actual maximum length of
01329              * EUC byte sequences.
01330              *
01331              * Other than that, leading zero bytes are removed and the other
01332              * bytes output. A single zero byte may be output if the "assigned"
01333              * bit in stage 2 was on or also if the Unicode code point is U+0000.
01334              * The data structure does not support zero byte output as a fallback
01335              * for other code points, and also does not allow output of leading zeros.
01336              */
01337             i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
01338 
01339             /* is this code point assigned, or do we use fallbacks? */
01340             if((table[i++]&(1<<(c&0xf)))!=0 || UCNV_FROM_U_USE_FALLBACK(cnv, c)) {
01341                 const uint8_t *p=bytes;
01342 
01343                 /* get the bytes and the length for the output */
01344                 switch(outputType) {
01345                 case MBCS_OUTPUT_1:
01346                     p+=(16*(uint32_t)table[i]+(c&0xf));
01347                     value=*p;
01348                     length=1;
01349                     break;
01350                 case MBCS_OUTPUT_2:
01351                     p+=(16*(uint32_t)table[i]+(c&0xf))*2;
01352 #                   if U_IS_BIG_ENDIAN
01353                         value=*(uint16_t *)p;
01354 #                   else
01355                         value=((uint32_t)*p<<8)|p[1];
01356 #                   endif
01357                     if(value<=0xff) {
01358                         length=1;
01359                     } else {
01360                         length=2;
01361                     }
01362                     break;
01363                 case MBCS_OUTPUT_3:
01364                     p+=(16*(uint32_t)table[i]+(c&0xf))*3;
01365                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
01366                     if(value<=0xff) {
01367                         length=1;
01368                     } else if(value<=0xffff) {
01369                         length=2;
01370                     } else {
01371                         length=3;
01372                     }
01373                     break;
01374                 case MBCS_OUTPUT_4:
01375                     p+=(16*(uint32_t)table[i]+(c&0xf))*4;
01376 #                   if U_IS_BIG_ENDIAN
01377                         value=*(uint32_t *)p;
01378 #                   else
01379                         value=((uint32_t)*p<<24)|((uint32_t)p[1]<<16)|((uint32_t)p[2]<<8)|p[3];
01380 #                   endif
01381                     if(value<=0xff) {
01382                         length=1;
01383                     } else if(value<=0xffff) {
01384                         length=2;
01385                     } else if(value<=0xffffff) {
01386                         length=3;
01387                     } else {
01388                         length=4;
01389                     }
01390                     break;
01391                 case MBCS_OUTPUT_3_EUC:
01392                     p+=(16*(uint32_t)table[i]+(c&0xf))*2;
01393 #                   if U_IS_BIG_ENDIAN
01394                         value=*(uint16_t *)p;
01395 #                   else
01396                         value=((uint32_t)*p<<8)|p[1];
01397 #                   endif
01398                     /* EUC 16-bit fixed-length representation */
01399                     if(value<=0xff) {
01400                         length=1;
01401                     } else if((value&0x8000)==0) {
01402                         value|=0x8e8000;
01403                         length=3;
01404                     } else if((value&0x80)==0) {
01405                         value|=0x8f0080;
01406                         length=3;
01407                     } else {
01408                         length=2;
01409                     }
01410                     break;
01411                 case MBCS_OUTPUT_4_EUC:
01412                     p+=(16*(uint32_t)table[i]+(c&0xf))*3;
01413                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
01414                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
01415                     if(value<=0xff) {
01416                         length=1;
01417                     } else if(value<=0xffff) {
01418                         length=2;
01419                     } else if((value&0x800000)==0) {
01420                         value|=0x8e800000;
01421                         length=4;
01422                     } else if((value&0x8000)==0) {
01423                         value|=0x8f008000;
01424                         length=4;
01425                     } else {
01426                         length=3;
01427                     }
01428                     break;
01429                 default:
01430                     /* must not occur */
01431                     /*
01432                      * To avoid compiler warnings that value & length may be
01433                      * used without having been initialized, we set them here.
01434                      * In reality, this is unreachable code.
01435                      * Not having a default branch also causes warnings with
01436                      * some compilers.
01437                      */
01438                     value=0;
01439                     length=0;
01440                     break;
01441                 }
01442 
01443                 /* is the codepage value really an "unassigned" indicator? */
01444                 if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
01445                     /*
01446                      * We allow a 0 byte output if the Unicode code point is
01447                      * U+0000 and also if the "assigned" bit is set for this entry.
01448                      * There is no way with this data structure for fallback output
01449                      * for other than U+0000 to be a zero byte.
01450                      */
01451                     /* callback(unassigned) */
01452                     reason=UCNV_UNASSIGNED;
01453                     *pErrorCode=U_INVALID_CHAR_FOUND;
01454                     goto callback;
01455                 }
01456             } else {
01457                 /* callback(unassigned) */
01458                 reason=UCNV_UNASSIGNED;
01459                 *pErrorCode=U_INVALID_CHAR_FOUND;
01460                 goto callback;
01461             }
01462 
01463             /* write the output character bytes from value and length */
01464             /* from the first if in the loop we know that targetCapacity>0 */
01465             if(length<=targetCapacity) {
01466                 if(offsets==NULL) {
01467                     switch(length) {
01468                         /* each branch falls through to the next one */
01469                     case 4:
01470                         *target++=(uint8_t)(value>>24);
01471                     case 3:
01472                         *target++=(uint8_t)(value>>16);
01473                     case 2:
01474                         *target++=(uint8_t)(value>>8);
01475                     case 1:
01476                         *target++=(uint8_t)value;
01477                     default:
01478                         /* will never occur */
01479                         break;
01480                     }
01481                 } else {
01482                     switch(length) {
01483                         /* each branch falls through to the next one */
01484                     case 4:
01485                         *target++=(uint8_t)(value>>24);
01486                         *offsets++=sourceIndex;
01487                     case 3:
01488                         *target++=(uint8_t)(value>>16);
01489                         *offsets++=sourceIndex;
01490                     case 2:
01491                         *target++=(uint8_t)(value>>8);
01492                         *offsets++=sourceIndex;
01493                     case 1:
01494                         *target++=(uint8_t)value;
01495                         *offsets++=sourceIndex;
01496                     default:
01497                         /* will never occur */
01498                         break;
01499                     }
01500                 }
01501                 targetCapacity-=length;
01502             } else {
01503                 uint8_t *p;
01504 
01505                 /*
01506                  * We actually do this backwards here:
01507                  * In order to save an intermediate variable, we output
01508                  * first to the overflow buffer what does not fit into the
01509                  * regular target.
01510                  */
01511                 /* we know that 1<=targetCapacity<length<=4 */
01512                 length-=targetCapacity;
01513                 p=(uint8_t *)cnv->charErrorBuffer;
01514                 switch(length) {
01515                     /* each branch falls through to the next one */
01516                 case 3:
01517                     *p++=(uint8_t)(value>>16);
01518                 case 2:
01519                     *p++=(uint8_t)(value>>8);
01520                 case 1:
01521                     *p=(uint8_t)value;
01522                 default:
01523                     /* will never occur */
01524                     break;
01525                 }
01526                 cnv->charErrorBufferLength=(int8_t)length;
01527 
01528                 /* now output what fits into the regular target */
01529                 value>>=8*length; /* length was reduced by targetCapacity */
01530                 switch(targetCapacity) {
01531                     /* each branch falls through to the next one */
01532                 case 3:
01533                     *target++=(uint8_t)(value>>16);
01534                     if(offsets!=NULL) {
01535                         *offsets++=sourceIndex;
01536                     }
01537                 case 2:
01538                     *target++=(uint8_t)(value>>8);
01539                     if(offsets!=NULL) {
01540                         *offsets++=sourceIndex;
01541                     }
01542                 case 1:
01543                     *target++=(uint8_t)value;
01544                     if(offsets!=NULL) {
01545                         *offsets++=sourceIndex;
01546                     }
01547                 default:
01548                     /* will never occur */
01549                     break;
01550                 }
01551 
01552                 /* target overflow */
01553                 targetCapacity=0;
01554                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
01555                 c=0;
01556                 break;
01557             }
01558 
01559             /* normal end of conversion: prepare for a new character */
01560             c=0;
01561             sourceIndex=nextSourceIndex;
01562             continue;
01563 
01564             /*
01565              * This is the same ugly trick as in ToUnicode(), for the
01566              * same reasons...
01567              */
01568 callback:
01569             /* call the callback function with all the preparations and post-processing */
01570             /* update the arguments structure */
01571             pArgs->source=source;
01572             pArgs->target=(char *)target;
01573             pArgs->offsets=offsets;
01574 
01575             /* set the converter state in UConverter to deal with the next character */
01576             cnv->fromUSurrogateLead=0;
01577 
01578             /* write the code point as code units */
01579             i=0;
01580             UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
01581             cnv->invalidUCharLength=(int8_t)i;
01582 
01583             /* call the callback function */
01584             fromUCallback(cnv, cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
01585 
01586             /* get the converter state from UConverter */
01587             c=cnv->fromUSurrogateLead;
01588 
01589             /* update target and deal with offsets if necessary */
01590             offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
01591             target=(uint8_t *)pArgs->target;
01592 
01593             /* update the source pointer and index */
01594             sourceIndex=nextSourceIndex+(pArgs->source-source);
01595             source=pArgs->source;
01596             targetCapacity=(uint8_t *)pArgs->targetLimit-target;
01597 
01598             /*
01599              * If the callback overflowed the target, then we need to
01600              * stop here with an overflow indication.
01601              */
01602             if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
01603                 break;
01604             } else if(cnv->charErrorBufferLength>0) {
01605                 /* target is full */
01606                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
01607                 break;
01608             } else if(U_FAILURE(*pErrorCode)) {
01609                 /* break on error */
01610                 c=0;
01611                 break;
01612             }
01613 
01614             /*
01615              * We do not need to repeat the statements from the normal
01616              * end of the conversion because we already updated all the
01617              * necessary variables.
01618              */
01619         } else {
01620             /* target is full */
01621             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
01622             break;
01623         }
01624     }
01625 
01626     if(pArgs->flush && source>=sourceLimit) {
01627         /* reset the state for the next conversion */
01628         if(c!=0 && U_SUCCESS(*pErrorCode)) {
01629             /* a character byte sequence remains incomplete */
01630             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
01631         }
01632         cnv->fromUSurrogateLead=0;
01633     } else {
01634         /* set the converter state back into UConverter */
01635         cnv->fromUSurrogateLead=(UChar)c;
01636     }
01637 
01638     /* write back the updated pointers */
01639     pArgs->source=source;
01640     pArgs->target=(char *)target;
01641     pArgs->offsets=offsets;
01642 }
01643 
/*
 * Conversion from Unicode without a separate implementation:
 * this function only forwards its two arguments, unchanged, to
 * _MBCSFromUnicodeWithOffsets(), which does all of the work and
 * reports errors through pErrorCode.
 */
01644 U_CFUNC void
01645 _MBCSFromUnicode(UConverterFromUnicodeArgs *pArgs,
01646                  UErrorCode *pErrorCode) {
01647     _MBCSFromUnicodeWithOffsets(pArgs, pErrorCode);
01648 }
01649 
01650 /*
 * This version of _MBCSFromUnicode() is optimized for single-byte codepages.
 *
 * It reads UTF-16 code units from pArgs->source up to pArgs->sourceLimit and
 * writes exactly one byte per converted code point to pArgs->target.
 * If pArgs->offsets is not NULL, the source index of the code point is
 * written there for each output byte.
 *
 * A lead surrogate that was left over from a previous call is picked up from
 * cnv->fromUSurrogateLead; a lead surrogate at the end of the input is stored
 * back there unless the input is being flushed.
 *
 * Error codes set by this function itself:
 * - U_ILLEGAL_CHAR_FOUND    unmatched surrogate (before calling the callback)
 * - U_INVALID_CHAR_FOUND    unassigned code point (before calling the callback)
 * - U_BUFFER_OVERFLOW_ERROR the target is full
 * - U_TRUNCATED_CHAR_FOUND  flush with a pending lead surrogate and no other error
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
 *
 * NOTE(review): outputType is loaded from the shared data but never consulted;
 * the lookup below always uses the MBCS_OUTPUT_1 layout (one byte per entry).
 */
01651 U_CFUNC void
01652 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
01653                                   UErrorCode *pErrorCode) {
01654     UConverter *cnv;
01655     const UChar *source, *sourceLimit;
01656     uint8_t *target;
01657     int32_t targetCapacity;
01658     int32_t *offsets;
01659 
01660     const uint16_t *table;
01661     const uint8_t *bytes;
01662     uint8_t outputType;
01663 
01664     UChar32 c;
01665 
01666     int32_t sourceIndex, nextSourceIndex;
01667 
01668     UConverterCallbackReason reason;
01669     uint32_t i;
01670     uint32_t value;
01671 
01672     /* set up the local pointers */
01673     cnv=pArgs->converter;
01674     source=pArgs->source;
01675     sourceLimit=pArgs->sourceLimit;
01676     target=(uint8_t *)pArgs->target;
01677     targetCapacity=pArgs->targetLimit-pArgs->target;
01678     offsets=pArgs->offsets;
01679 
01680     table=cnv->sharedData->table->mbcs.fromUnicodeTable;
01681     bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
01682     outputType=cnv->sharedData->table->mbcs.outputType;
01683 
01684     /* get the converter state from UConverter: a pending lead surrogate, or 0 */
01685     c=cnv->fromUSurrogateLead;
01686 
01687     /* sourceIndex=-1 if the current character began in the previous buffer */
01688     sourceIndex= c==0 ? 0 : -1;
01689     nextSourceIndex=0;
01690 
01691     /* conversion loop */
    /*
     * With a pending lead surrogate and room in the target, jump straight
     * into the loop body to look for the trail surrogate.
     * getTrail itself checks source<sourceLimit, which this jump bypasses.
     */
01692     if(c!=0 && targetCapacity>0) {
01693         goto getTrail;
01694     }
01695 
01696     while(source<sourceLimit) {
01697         /*
01698          * This following test is to see if available input would overflow the output.
01699          * It does not catch output of more than one byte that
01700          * overflows as a result of a multi-byte character or callback output
01701          * from the last source character.
01702          * Therefore, those situations also test for overflows and will
01703          * then break the loop, too.
01704          */
01705         if(targetCapacity>0) {
01706             /*
01707              * Get a correct Unicode code point:
01708              * a single UChar for a BMP code point or
01709              * a matched surrogate pair for a "surrogate code point".
01710              */
01711             c=*source++;
01712             ++nextSourceIndex;
01713             if(UTF_IS_SURROGATE(c)) {
01714                 if(UTF_IS_SURROGATE_FIRST(c)) {
01715 getTrail:
01716                     if(source<sourceLimit) {
01717                         /* test the following code unit */
01718                         UChar trail=*source;
01719                         if(UTF_IS_SECOND_SURROGATE(trail)) {
01720                             ++source;
01721                             ++nextSourceIndex;
01722                             c=UTF16_GET_PAIR_VALUE(c, trail);
01723                             /* convert this surrogate code point */
01724                             /* exit this condition tree */
01725                         } else {
01726                             /* this is an unmatched lead code unit (1st surrogate) */
01727                             /* callback(illegal) */
01728                             reason=UCNV_ILLEGAL;
01729                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
01730                             goto callback;
01731                         }
01732                     } else {
01733                         /* no more input; c keeps the lead surrogate for the code after the loop */
01734                         break;
01735                     }
01736                 } else {
01737                     /* this is an unmatched trail code unit (2nd surrogate) */
01738                     /* callback(illegal) */
01739                     reason=UCNV_ILLEGAL;
01740                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
01741                     goto callback;
01742                 }
01743             }
01744 
01745             /* convert the Unicode code point in c into codepage bytes */
            /*
             * table[c>>10] is added to bits 9..4 of c; the sum selects a pair
             * of 16-bit words starting at i: first a bit set with one
             * "assigned" flag per code point (bit c&0xf), then a block number
             * for the bytes array.
             * 0x440 equals 0x110000>>10 - presumably the number of entries
             * that precede these pairs in the table; TODO confirm.
             */
01746             i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
01747 
01748             /* is this code point assigned, or do we use fallbacks? */
01749             if((table[i++]&(1<<(c&0xf)))!=0 || UCNV_FROM_U_USE_FALLBACK(cnv, c)) {
01750                 const uint8_t *p=bytes;
01751 
01752                 /* MBCS_OUTPUT_1: blocks of 16 one-byte results */
01753                 p+=(16*(uint32_t)table[i]+(c&0xf));
01754                 value=*p;
01755 
01756                 /* is the codepage value really an "unassigned" indicator? */
01757                 if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
01758                     /*
01759                      * We allow a 0 byte output if the Unicode code point is
01760                      * U+0000 and also if the "assigned" bit is set for this entry.
01761                      * There is no way with this data structure for fallback output
01762                      * for other than U+0000 to be a zero byte.
01763                      */
01764                     /* callback(unassigned) */
01765                     reason=UCNV_UNASSIGNED;
01766                     *pErrorCode=U_INVALID_CHAR_FOUND;
01767                     goto callback;
01768                 }
01769             } else {
01770                 /* callback(unassigned) */
01771                 reason=UCNV_UNASSIGNED;
01772                 *pErrorCode=U_INVALID_CHAR_FOUND;
01773                 goto callback;
01774             }
01775 
01776             /* write the output character bytes from value and length */
01777             /* length==1 */
01778             /* this is easy because we know that there is enough space */
01779             *target++=(uint8_t)value;
01780             if(offsets!=NULL) {
01781                 *offsets++=sourceIndex;
01782             }
01783             --targetCapacity;
01784 
01785             /* normal end of conversion: prepare for a new character */
01786             c=0;
01787             sourceIndex=nextSourceIndex;
01788             continue;
01789 
            /* this label is only reached via goto; the continue above skips it */
01790 callback:
01791             /* call the callback function with all the preparations and post-processing */
01792             /* update the arguments structure */
01793             pArgs->source=source;
01794             pArgs->target=(char *)target;
01795             pArgs->offsets=offsets;
01796 
01797             /* set the converter state in UConverter to deal with the next character */
01798             cnv->fromUSurrogateLead=0;
01799 
01800             /* write the code point as code units */
01801             i=0;
01802             UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
01803             cnv->invalidUCharLength=(int8_t)i;
01804 
01805             /* call the callback function */
01806             fromUCallback(cnv, cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
01807 
01808             /* get the converter state from UConverter */
01809             c=cnv->fromUSurrogateLead;
01810 
01811             /* update target and deal with offsets if necessary */
01812             offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
01813             target=(uint8_t *)pArgs->target;
01814 
01815             /* update the source pointer and index */
01816             sourceIndex=nextSourceIndex+(pArgs->source-source);
01817             source=pArgs->source;
01818             targetCapacity=(uint8_t *)pArgs->targetLimit-target;
01819 
01820             /*
01821              * If the callback overflowed the target, then we need to
01822              * stop here with an overflow indication.
01823              */
01824             if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
01825                 break;
01826             } else if(cnv->charErrorBufferLength>0) {
01827                 /* target is full */
01828                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
01829                 break;
01830             } else if(U_FAILURE(*pErrorCode)) {
01831                 /* break on error */
01832                 c=0;
01833                 break;
01834             }
01835 
01836             /*
01837              * We do not need to repeat the statements from the normal
01838              * end of the conversion because we already updated all the
01839              * necessary variables.
01840              */
01841         } else {
01842             /* target is full */
01843             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
01844             break;
01845         }
01846     }
01847 
01848     if(pArgs->flush && source>=sourceLimit) {
01849         /* reset the state for the next conversion */
01850         if(c!=0 && U_SUCCESS(*pErrorCode)) {
01851             /* a character byte sequence remains incomplete */
01852             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
01853         }
01854         cnv->fromUSurrogateLead=0;
01855     } else {
01856         /* set the converter state back into UConverter */
01857         cnv->fromUSurrogateLead=(UChar)c;
01858     }
01859 
01860     /* write back the updated pointers */
01861     pArgs->source=source;
01862     pArgs->target=(char *)target;
01863     pArgs->offsets=offsets;
01864 }
01865 
01866 /*
01867  * This is another simple conversion function for internal use by other
01868  * conversion implementations.
01869  * It does not use the converter state nor call callbacks.
01870  * It converts one single Unicode code point into codepage bytes, encoded
01871  * as one 32-bit value. The function returns the number of bytes in *pValue:
01872  * 1..4 the number of bytes in *pValue
01873  * 0    unassigned (*pValue undefined)
01874  * -1   illegal (currently not used, *pValue undefined)
01875  *
01876  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
01877  * the second to last byte in bits 15..8, etc.
01878  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
01879  */
01880 U_CFUNC int32_t
01881 _MBCSFromUChar32(UConverterSharedData *sharedData,
01882                  UChar32 c, uint32_t *pValue,
01883                  UBool useFallback) {
01884     const uint16_t *table=sharedData->table->mbcs.fromUnicodeTable;
01885     uint32_t i;
01886     uint32_t value;
01887     int32_t length;
01888 
01889     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
01890     i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
01891 
01892     /* is this code point assigned, or do we use fallbacks? */
01893     if((table[i++]&(1<<(c&0xf)))!=0 || FROM_U_USE_FALLBACK(useFallback, c)) {
01894         const uint8_t *p=sharedData->table->mbcs.fromUnicodeBytes;
01895 
01896         /* get the bytes and the length for the output */
01897         switch(sharedData->table->mbcs.outputType) {
01898         case MBCS_OUTPUT_1:
01899             p+=(16*(uint32_t)table[i]+(c&0xf));
01900             value=*p;
01901             length=1;
01902             break;
01903         case MBCS_OUTPUT_2:
01904             p+=(16*(uint32_t)table[i]+(c&0xf))*2;
01905 #           if U_IS_BIG_ENDIAN
01906                 value=*(uint16_t *)p;
01907 #           else
01908                 value=((uint32_t)*p<<8)|p[1];
01909 #           endif
01910             if(value<=0xff) {
01911                 length=1;
01912             } else {
01913                 length=2;
01914             }
01915             break;
01916         case MBCS_OUTPUT_3:
01917             p+=(16*(uint32_t)table[i]+(c&0xf))*3;
01918             value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
01919             if(value<=0xff) {
01920                 length=1;
01921             } else if(value<=0xffff) {
01922                 length=2;
01923             } else {
01924                 length=3;
01925             }
01926             break;
01927         case MBCS_OUTPUT_4:
01928             p+=(16*(uint32_t)table[i]+(c&0xf))*4;
01929 #           if U_IS_BIG_ENDIAN
01930                 value=*(uint32_t *)p;
01931 #           else
01932                 value=((uint32_t)*p<<24)|((uint32_t)p[1]<<16)|((uint32_t)p[2]<<8)|p[3];
01933 #           endif
01934             if(value<=0xff) {
01935                 length=1;
01936             } else if(value<=0xffff) {
01937                 length=2;
01938             } else if(value<=0xffffff) {
01939                 length=3;
01940             } else {
01941                 length=4;
01942             }
01943             break;
01944         case MBCS_OUTPUT_3_EUC:
01945             p+=(16*(uint32_t)table[i]+(c&0xf))*2;
01946 #           if U_IS_BIG_ENDIAN
01947                 value=*(uint16_t *)p;
01948 #           else
01949                 value=((uint32_t)*p<<8)|p[1];
01950 #           endif
01951             /* EUC 16-bit fixed-length representation */
01952             if(value<=0xff) {
01953                 length=1;
01954             } else if((value&0x8000)==0) {
01955                 value|=0x8e8000;
01956                 length=3;
01957             } else if((value&0x80)==0) {
01958                 value|=0x8f0080;
01959                 length=3;
01960             } else {
01961                 length=2;
01962             }
01963             break;
01964         case MBCS_OUTPUT_4_EUC:
01965             p+=(16*(uint32_t)table[i]+(c&0xf))*3;
01966             value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
01967             /* EUC 16-bit fixed-length representation applied to the first two bytes */
01968             if(value<=0xff) {
01969                 length=1;
01970             } else if(value<=0xffff) {
01971                 length=2;
01972             } else if((value&0x800000)==0) {
01973                 value|=0x8e800000;
01974                 length=4;
01975             } else if((value&0x8000)==0) {
01976                 value|=0x8f008000;
01977                 length=4;
01978             } else {
01979                 length=3;
01980             }
01981             break;
01982         default:
01983             /* must not occur */
01984             return -1;
01985         }
01986 
01987         /* is the codepage value really an "unassigned" indicator? */
01988         if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
01989             /*
01990              * We allow a 0 byte output if the Unicode code point is
01991              * U+0000 and also if the "assigned" bit is set for this entry.
01992              * There is no way with this data structure for fallback output
01993              * for other than U+0000 to be a zero byte.
01994              */
01995             return 0;
01996         } else {
01997             *pValue=value;
01998             return length;
01999         }
02000     } else {
02001         return 0;
02002     }
02003 }
02004 
02005 /* miscellaneous ------------------------------------------------------------ */
02006 
02007 static void
02008 _MBCSGetStarters(const UConverter* cnv,
02009                  UBool starters[256],
02010                  UErrorCode *pErrorCode) {
02011     const int32_t *state0=cnv->sharedData->table->mbcs.stateTable[0];
02012     int i;
02013 
02014     for(i=0; i<256; ++i) {
02015         /* all bytes that cause a state transition from state 0 are lead bytes */
02016         starters[i]= (UBool)(state0[i]>=0);
02017     }
02018 }
02019 
02020 /*
02021  * This is an internal function that allows other converter implementations
02022  * to check whether a byte is a lead byte.
02023  */
02024 U_CFUNC UBool
02025 _MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
02026     return (UBool)(sharedData->table->mbcs.stateTable[0][(uint8_t)byte]>=0);
02027 }
02028 
/*
 * Table of implementation functions for converters of type UCNV_MBCS.
 * _MBCSData below points to this table.
 *
 * The initializers are positional, so their meaning is fixed by the
 * declaration of UConverterImpl, which is not in this file.
 * NOTE(review): judging only by the function names and the blank-line
 * grouping, the slots are: type; load/unload; open/close/reset;
 * the conversion functions; getStarters and one more optional function.
 * Confirm against the UConverterImpl declaration before editing.
 * A NULL slot supplies no function for that position.
 *
 * _MBCSSingleFromUnicodeWithOffsets() is not referenced here.
 */
02029 static const UConverterImpl _MBCSImpl={
02030     UCNV_MBCS,
02031 
02032     _MBCSLoad,
02033     NULL,
02034 
02035     _MBCSOpen,
02036     NULL,
02037     _MBCSReset,
02038 
02039     _MBCSToUnicode,
02040     _MBCSToUnicodeWithOffsets,
02041     _MBCSFromUnicode,
02042     _MBCSFromUnicodeWithOffsets,
02043     _MBCSGetNextUChar,
02044 
02045     _MBCSGetStarters,
02046     NULL
02047 };
02048 
02049 
02050 /* Static data is in tools/makeconv/ucnvstat.c for data-based
02051  * converters. Be sure to update it as well.
02052  */
02053 
/*
 * Statically allocated shared data for the MBCS converter type.
 * The first initializer is the size of the structure itself, and the
 * structure refers to the function table _MBCSImpl.
 * NOTE(review): the remaining initializers are positional; their meaning
 * is fixed by the declaration of UConverterSharedData, which is not in
 * this file - confirm there before changing any of them.
 */
02054 const UConverterSharedData _MBCSData={
02055     sizeof(UConverterSharedData), 1,
02056     NULL, NULL, NULL, FALSE, &_MBCSImpl, 
02057     0
02058 };
02059 
02060 /* GB 18030 special handling ------------------------------------------------ */
02061 
02062 /* ### IMPORTANT: THIS IS ALPHA-VERSION SUPPORT CODE FOR GB 18030 AND MAY CHANGE WITHOUT NOTICE */
02063 
02064 /* for the definitions of the LINEAR macros and of gb18030Ranges, see near the beginning of the file */
02065 
02066 /* the callback functions handle GB 18030 specially */
02067 static void
02068 fromUCallback(UConverter *cnv,
02069               void *context, UConverterFromUnicodeArgs *pArgs,
02070               const UChar *codeUnits, int32_t length, UChar32 codePoint,
02071               UConverterCallbackReason reason, UErrorCode *pErrorCode) {
02072     if(cnv->extraInfo==gb18030Ranges && (reason==UCNV_UNASSIGNED || reason==UCNV_ILLEGAL)) {
02073         const uint32_t *range;
02074         int i;
02075 
02076         range=gb18030Ranges[0];
02077         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
02078             if(range[0]<=(uint32_t)codePoint && (uint32_t)codePoint<=range[1]) {
02079                 uint32_t linear;
02080                 char bytes[4];
02081 
02082                 /* found the Unicode code point, output the four-byte sequence for it */
02083                 *pErrorCode=U_ZERO_ERROR;
02084 
02085                 /* get the linear value of the first GB 18030 code in this range */
02086                 linear=range[2]-LINEAR_18030_BASE;
02087 
02088                 /* add the offset from the beginning of the range */
02089                 linear+=((uint32_t)codePoint-range[0]);
02090 
02091                 /* turn this into a four-byte sequence */
02092                 bytes[3]=(const char)(0x30+linear%10); linear/=10;
02093                 bytes[2]=(const char)(0x81+linear%126); linear/=126;
02094                 bytes[1]=(const char)(0x30+linear%10); linear/=10;
02095                 bytes[0]=(const char)(0x81+linear);
02096 
02097                 /* output this sequence */
02098                 ucnv_cbFromUWriteBytes(pArgs, bytes, 4, 0, pErrorCode);
02099                 return;
02100             }
02101         }
02102     }
02103 
02104     /* call the normal callback function */
02105     cnv->fromUCharErrorBehaviour(context, pArgs, codeUnits, length, codePoint, reason, pErrorCode);
02106 }
02107 
02108 static void
02109 toUCallback(UConverter *cnv,
02110             void *context, UConverterToUnicodeArgs *pArgs,
02111             const char *codeUnits, int32_t length,
02112             UConverterCallbackReason reason, UErrorCode *pErrorCode) {
02113     if(cnv->extraInfo==gb18030Ranges && reason==UCNV_UNASSIGNED && length==4) {
02114         const uint32_t *range;
02115         uint32_t linear;
02116         int i;
02117 
02118         linear=LINEAR_18030((uint8_t)codeUnits[0], (uint8_t)codeUnits[1], (uint8_t)codeUnits[2], (uint8_t)codeUnits[3]);
02119         range=gb18030Ranges[0];
02120         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
02121             if(range[2]<=linear && linear<=range[3]) {
02122                 UChar u[UTF_MAX_CHAR_LENGTH];
02123 
02124                 /* found the sequence, output the Unicode code point for it */
02125                 *pErrorCode=U_ZERO_ERROR;
02126 
02127                 /* add the linear difference between the input and start sequences to the start code point */
02128                 linear=range[0]+(linear-range[2]);
02129 
02130                 /* write the result as UChars and output */
02131                 i=0;
02132                 UTF_APPEND_CHAR_UNSAFE(u, i, linear);
02133                 ucnv_cbToUWriteUChars(pArgs, u, i, 0, pErrorCode);
02134                 return;
02135             }
02136         }
02137     }
02138 
02139     /* call the normal callback function */
02140     cnv->fromCharErrorBehaviour(context, pArgs, codeUnits, length, reason, pErrorCode);
02141 }

Generated at Tue Dec 5 10:48:04 2000 for ICU by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000