Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

ucnv_lmb.c

00001 /*  
00002 **********************************************************************
00003 *   Copyright (C) 2000, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  ucnv_lmb.cpp
00007 *   encoding:   US-ASCII
00008 *   tab size:   4 (not used)
00009 *   indentation:4
00010 *
00011 *   created on: 2000feb09
00012 *   created by: Brendan Murray
00013 *   extensively hacked up by: Jim Snyder-Grant
00014 *
00015 * Modification History:
00016 * 
00017 *   Date        Name             Description
00018 * 
00019 *   06/20/2000  helena           OS/400 port changes; mostly typecast.
00020 *   06/27/2000  Jim Snyder-Grant Deal with partial characters and small buffers.
00021 *                                Add comments to document LMBCS format and implementation
00022 *                                restructured order & breakdown of functions
00023 *   06/28/2000  helena           Major rewrite for the callback API changes.
00024 */
00025 
00026 #include "unicode/utypes.h"
00027 #include "cmemory.h"
00028 #include "ucmp16.h"
00029 #include "ucmp8.h"
00030 #include "unicode/ucnv_err.h"
00031 #include "ucnv_bld.h"
00032 #include "unicode/ucnv.h"
00033 #include "ucnv_cnv.h"
00034 
00035 /*
00036   LMBCS
00037 
00038   (Lotus Multi-Byte Character Set)
00039 
00040   LMBCS was invented in the late 1980's and is primarily used in Lotus Notes 
00041   databases and in Lotus 1-2-3 files. Programmers who work with the APIs 
00042   into these products will sometimes need to deal with strings in this format.
00043 
00044   The code in this file provides an implementation for an ICU converter of 
00045   LMBCS to and from Unicode. 
00046 
00047   Since the LMBCS character set is only sparsely documented in existing 
00048   printed or online material, we have added  extensive annotation to this 
00049   file to serve as a guide to understanding LMBCS. 
00050 
00051   LMBCS was originally designed with these four sometimes-competing design goals:
00052 
00053   -Provide encodings for the characters in 12 existing national standards
00054    (plus a few other characters)
00055   -Minimal memory footprint
00056   -Maximal speed of conversion into the existing national character sets
00057   -No need to track a changing state as you interpret a string.
00058 
00059 
00060   All of the national character sets LMBCS was trying to encode are 'ANSI'
00061   based, in that the bytes from 0x20 - 0x7F are almost exactly the 
00062   same common Latin unaccented characters and symbols in all character sets. 
00063 
00064   So, in order to help meet the speed & memory design goals, the common ANSI 
00065   bytes from 0x20-0x7F are represented by the same single-byte values in LMBCS. 
00066 
00067   The general LMBCS code unit is from 1-3 bytes. We can describe the 3 bytes as
00068   follows:
00069 
00070   [G] D1 [D2]
00071 
00072   That is, a sometimes-optional 'group' byte, followed by 1 and sometimes 2
00073   data bytes. The maximum size of a LMBCS character is 3 bytes:
00074 */
00075 #define ULMBCS_CHARSIZE_MAX      3
00076 /*
00077   The single-byte values from 0x20 to 0x7F are examples of single D1 bytes.
00078   We often have to figure out if byte values are below or above this, so we 
00079   use the ANSI nomenclature 'C0' and 'C1' to refer to the range of control 
00080   characters just above & below the common lower-ANSI  range */
00081 #define ULMBCS_C0END           0x1F   
00082 #define ULMBCS_C1START         0x80   
00083 /*
00084   Since LMBCS always deals in byte units, we create a local type here for 
00085   working with LMBCS code units:
00086 
00087 */  
00088 typedef uint8_t ulmbcs_byte_t;
00089 
00090 /* 
00091    Most of the values less than 0x20 are reserved in LMBCS to announce 
00092    which national  character standard is being used for the 'D' bytes. 
00093    In the comments we show the common name and the IBM character-set ID
00094    for these character-set announcers:
00095 */
00096 
00097 #define ULMBCS_GRP_L1         0x01   /* Latin-1    :ibm-850  */
00098 #define ULMBCS_GRP_GR         0x02   /* Greek      :ibm-851  */
00099 #define ULMBCS_GRP_HE         0x03   /* Hebrew     :ibm-1255 */
00100 #define ULMBCS_GRP_AR         0x04   /* Arabic     :ibm-1256 */
00101 #define ULMBCS_GRP_RU         0x05   /* Cyrillic   :ibm-1251 */
00102 #define ULMBCS_GRP_L2         0x06   /* Latin-2    :ibm-852  */
00103 #define ULMBCS_GRP_TR         0x08   /* Turkish    :ibm-1254 */
00104 #define ULMBCS_GRP_TH         0x0B   /* Thai       :ibm-874  */
00105 #define ULMBCS_GRP_JA         0x10   /* Japanese   :ibm-943  */
00106 #define ULMBCS_GRP_KO         0x11   /* Korean     :ibm-1261 */
00107 #define ULMBCS_GRP_CN         0x12   /* Chinese SC :ibm-950  */
00108 #define ULMBCS_GRP_TW         0x13   /* Chinese TC :ibm-1386 */
00109 
00110 /*
00111    So, the beginning of understanding LMBCS is that IF the first byte of a LMBCS 
00112    character is one of those 12 values, you can interpret the remaining bytes of 
00113    that character as coming from one of those character sets. Since the lower 
00114    ANSI bytes already are represented in single bytes, one of the character 
00115    set announcers is used to announce a character that starts with a byte of 
00116    0x80 or greater.
00117 
00118    The character sets are  arranged so that the single byte sets all appear 
00119    before the multi-byte character sets. When we need to tell whether a 
00120    group byte is for a single byte char set or not we use this define: */
00121 
00122 #define ULMBCS_DOUBLEOPTGROUP_START  0x10   
00123 
00124 /* 
00125 However, to fully understand LMBCS, you must also understand a series of 
00126 exceptions & optimizations made in service of the design goals. 
00127 
00128 First, those of you who are character set mavens may have noticed that
00129 the 'double-byte' character sets are actually multi-byte character sets 
00130 that can have 1 or two bytes, even in the upper-ascii range. To force
00131 each group byte to introduce a fixed-width encoding (to make it faster to 
00132 count characters), we use a convention of doubling up on the group byte 
00133 to introduce any single-byte character > 0x80 in an otherwise double-byte
00134 character set. So, for example, the LMBCS sequence x10 x10 xAE is the 
00135 same as '0xAE' in the Japanese code page 943.
00136 
00137 Next, you will notice that the list of group bytes has some gaps. 
00138 These are used in various ways.
00139 
00140 We reserve a few special single byte values for common control 
00141 characters. These are in the same place as their ANSI equivalents for speed.
00142 */
00143                      
00144 #define ULMBCS_HT    0x09   /* Fixed control char - Horizontal Tab */
00145 #define ULMBCS_LF    0x0A   /* Fixed control char - Line Feed */
00146 #define ULMBCS_CR    0x0D   /* Fixed control char - Carriage Return */
00147 
00148 /* Then, 1-2-3 reserved a special single-byte character to put at the 
00149 beginning of internal 'system' range names: */
00150 
00151 #define ULMBCS_123SYSTEMRANGE  0x19   
00152 
00153 /* Then we needed a place to put all the other ansi control characters 
00154 that must be moved to different values because LMBCS reserves those 
00155 values for other purposes. To represent the control characters, we start 
00156 with a first byte of 0xF & add the control character value as the 
00157 second byte */
00158 #define ULMBCS_GRP_CTRL       0x0F   
00159 
00160 /* For the C0 controls (less than 0x20), we add 0x20 to preserve the 
00161 useful doctrine that any byte less than 0x20 in a LMBCS char must be 
00162 the first byte of a character:*/
00163 #define ULMBCS_CTRLOFFSET      0x20   
00164 
00165 /* 
00166 Where to put the characters that aren't part of any of the 12 national 
00167 character sets? The first thing that was done, in the earlier years of 
00168 LMBCS, was to use up the spaces of the form
00169 
00170   [G] D1, 
00171   
00172  where  'G' was one of the single-byte character groups, and
00173  D1 was less than 0x80. These sequences are gathered together 
00174  into a Lotus-invented doublebyte character set to represent a 
00175  lot of stray values. Internally, in this implementation, we track this 
00176  as group '0', as a place to tuck this exceptions list.*/
00177 
00178 #define ULMBCS_GRP_EXCEPT     0x00    
00179 /*
00180  Finally, as the durability and usefulness of UNICODE became clear, 
00181  LOTUS added a new group 0x14 to hold Unicode values not otherwise 
00182  represented in LMBCS: */
00183 #define ULMBCS_GRP_UNICODE    0x14   
00184 /* The two bytes appearing after a 0x14 are interpreted as UTF-16 BE
00185 (Big-Endian) characters. The exception comes when the UTF16 
00186 representation would have a zero as the second byte. In that case,
00187 'F6' is used in its place, and the bytes are swapped. (This prevents 
00188 LMBCS from encoding any Unicode values of the form U+F6xx, but that's OK:
00189 0xF6xx is in the middle of the Private Use Area.)*/
00190 #define ULMBCS_UNICOMPATZERO   0xF6   
00191 
00192 /* It is also useful in our code to have a constant for the size of 
00193 a LMBCS char that holds a literal Unicode value */
00194 #define ULMBCS_UNICODE_SIZE      3    
00195 
00196 /* 
00197 To squish the LMBCS representations down even further, and to make 
00198 translations even faster, sometimes the optimization group byte can be dropped 
00199 from a LMBCS character. This is decided on a process-by-process basis. The 
00200 group byte that is dropped is called the 'optimization group'.
00201 
00202 For Notes, the optimization group is always 0x1.*/
00203 #define ULMBCS_DEFAULTOPTGROUP 0x1    
00204 /* For 1-2-3 files, the optimization group is stored in the header of the 1-2-3 
00205 file. 
00206 
00207  In any case, when using ICU, you pass in the 
00208 optimization group as part of the name of the converter (LMBCS-1, LMBCS-2, 
00209 etc.). Using plain 'LMBCS' as the name of the converter will give you 
00210 LMBCS-1.
00211 
00212 
00213 *** Implementation strategy ***
00214 
00215 
00216 Because of the extensive use of other character sets, the LMBCS converter
00217 keeps a mapping between optimization groups and IBM character sets, so that
00218 ICU converters can be created and used as needed. */
00219 
00220 static const char * OptGroupByteToCPName[ULMBCS_CTRLOFFSET] = { /* indexed by LMBCS group byte; _LMBCSOpenWorker opens one sub-converter for each non-NULL name */
00221    /* 0x0000 */ "lmb-excp", /* internal home for the LOTUS exceptions list */
00222    /* 0x0001 */ "ibm-850",
00223    /* 0x0002 */ "ibm-851",
00224    /* 0x0003 */ "ibm-1255",
00225    /* 0x0004 */ "ibm-1256",
00226    /* 0x0005 */ "ibm-1251",
00227    /* 0x0006 */ "ibm-852",
00228    /* 0x0007 */ NULL,      /* Unused */
00229    /* 0x0008 */ "ibm-1254",
00230    /* 0x0009 */ NULL,      /* Control char HT */
00231    /* 0x000A */ NULL,      /* Control char LF */
00232    /* 0x000B */ "ibm-874",
00233    /* 0x000C */ NULL,      /* Unused */
00234    /* 0x000D */ NULL,      /* Control char CR */
00235    /* 0x000E */ NULL,      /* Unused */
00236    /* 0x000F */ NULL,      /* Control chars: 0x0F20 + C0/C1 character: algorithmic */
00237    /* 0x0010 */ "ibm-943",
00238    /* 0x0011 */ "ibm-1363", /* NOTE(review): the comment on the ULMBCS_GRP_KO define names ibm-1261 -- confirm which code page is intended */
00239    /* 0x0012 */ "ibm-950",  /* NOTE(review): ibm-950 is normally Traditional Chinese (Big5), yet 0x12 is named ULMBCS_GRP_CN -- confirm */
00240    /* 0x0013 */ "ibm-1386"  /* NOTE(review): ibm-1386 is normally Simplified Chinese (GBK), yet 0x13 is named ULMBCS_GRP_TW -- confirm */
00241 
00242    /* The rest are null, including the 0x0014 Unicode compatibility region
00243    and 0x0019, the 1-2-3 system range control char. (They are null because
         the array is declared with ULMBCS_CTRLOFFSET elements and the
         initializer list stops at 0x0013.) */      
00244 };
00245 
00246 /* As you can see, even though any byte below 0x20 could be an optimization 
00247 byte, only those at 0x13 or below can map to an actual converter. To limit
00248 some loops and searches, we define a value for that last group converter:*/
00249 
00250 #define ULMBCS_GRP_LAST       0x13   /* last LMBCS group that has a converter */
00251 
00252 
00253 /* That's approximately all the data that's needed for translating 
00254   LMBCS to Unicode. 
00255 
00256 
00257 However, to translate Unicode to LMBCS, we need some more support.
00258 
00259 That's because there is often more than one possible mapping from a Unicode
00260 code point back into LMBCS. The first thing we do is look up into a table
00261 to figure out if there is more than one possible mapping. This table,
00262 arranged by Unicode values (including ranges) either lists which group 
00263 to use, or says that it could go into one or more of the SBCS sets, or
00264 into one or more of the DBCS sets.  (If the character exists in both DBCS & 
00265 SBCS, the table will place it in the SBCS sets, to make the LMBCS code point 
00266 length as small as possible.) Here are the two special markers we use to indicate
00267 ambiguous mappings: */
00268 
00269 #define ULMBCS_AMBIGUOUS_SBCS   0x80   /* could fit in more than one 
00270                                           LMBCS sbcs native encoding 
00271                                           (example: most accented latin) */
00272 #define ULMBCS_AMBIGUOUS_MBCS   0x81   /* could fit in more than one 
00273                                           LMBCS mbcs native encoding 
00274                                           (example: Unihan) */
00275 
00276 /* And here's a simple way to see if a group falls in an appropriate range */
00277 #define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \
00278                   ((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \
00279                   (xgroup) < ULMBCS_DOUBLEOPTGROUP_START) || \
00280                   (((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \
00281                   (xgroup) >= ULMBCS_DOUBLEOPTGROUP_START))
00282 
00283 
00284 /* The table & some code to use it: */
00285 
00286 
00287 struct _UniLMBCSGrpMap  /* one inclusive Unicode range and the LMBCS group (or ambiguity marker) to use for it */
00288 {
00289    UChar uniStartRange;     /* first code unit of the range (inclusive) */
00290    UChar uniEndRange;       /* last code unit of the range (inclusive) */
00291    ulmbcs_byte_t  GrpType;  /* a ULMBCS_GRP_* value, ULMBCS_AMBIGUOUS_SBCS or ULMBCS_AMBIGUOUS_MBCS */
00292 } UniLMBCSGrpMap[]          /* kept in ascending range order: FindLMBCSUniRange scans it linearly from the top */
00293 =
00294 {
00295 
00296    {0x0001, 0x001F,  ULMBCS_GRP_CTRL},
00297    {0x0080, 0x009F,  ULMBCS_GRP_CTRL},
00298    {0x00A0, 0x01CD,  ULMBCS_AMBIGUOUS_SBCS},
00299    {0x01CE, 0x01CE,  ULMBCS_GRP_TW }, 
00300    {0x01CF, 0x02B9,  ULMBCS_AMBIGUOUS_SBCS},
00301    {0x02BA, 0x02BA,  ULMBCS_GRP_CN},
00302    {0x02BC, 0x02C8,  ULMBCS_AMBIGUOUS_SBCS},
00303    {0x02C9, 0x02D0,  ULMBCS_AMBIGUOUS_MBCS},
00304    {0x02D8, 0x02DD,  ULMBCS_AMBIGUOUS_SBCS},
00305    {0x0384, 0x03CE,  ULMBCS_AMBIGUOUS_SBCS},
00306    {0x0400, 0x044E,  ULMBCS_GRP_RU},
00307    {0x044F, 0x044F,  ULMBCS_AMBIGUOUS_MBCS},
00308    {0x0450, 0x0491,  ULMBCS_GRP_RU},
00309    {0x05B0, 0x05F2,  ULMBCS_GRP_HE},
00310    {0x060C, 0x06AF,  ULMBCS_GRP_AR}, 
00311    {0x0E01, 0x0E5B,  ULMBCS_GRP_TH},
00312    {0x200C, 0x200F,  ULMBCS_AMBIGUOUS_SBCS},
00313    {0x2010, 0x2010,  ULMBCS_AMBIGUOUS_MBCS},
00314    {0x2013, 0x2015,  ULMBCS_AMBIGUOUS_SBCS},
00315    {0x2016, 0x2016,  ULMBCS_AMBIGUOUS_MBCS},
00316    {0x2017, 0x2024,  ULMBCS_AMBIGUOUS_SBCS},
00317    {0x2025, 0x2025,  ULMBCS_AMBIGUOUS_MBCS},
00318    {0x2026, 0x2026,  ULMBCS_AMBIGUOUS_SBCS},
00319    {0x2027, 0x2027,  ULMBCS_GRP_CN},
00320    {0x2030, 0x2033,  ULMBCS_AMBIGUOUS_SBCS},
00321    {0x2035, 0x2035,  ULMBCS_AMBIGUOUS_MBCS},
00322    {0x2039, 0x203A,  ULMBCS_AMBIGUOUS_SBCS},
00323    {0x203B, 0x203B,  ULMBCS_AMBIGUOUS_MBCS},
00324    {0x2074, 0x2074,  ULMBCS_GRP_KO},
00325    {0x207F, 0x207F,  ULMBCS_GRP_EXCEPT},
00326    {0x2081, 0x2084,  ULMBCS_GRP_KO},
00327    {0x20A4, 0x20AC,  ULMBCS_AMBIGUOUS_SBCS},
00328    {0x2103, 0x2109,  ULMBCS_AMBIGUOUS_MBCS},
00329    {0x2111, 0x2126,  ULMBCS_AMBIGUOUS_SBCS},
00330    {0x212B, 0x212B,  ULMBCS_AMBIGUOUS_MBCS},
00331    {0x2135, 0x2135,  ULMBCS_AMBIGUOUS_SBCS},
00332    {0x2153, 0x2154,  ULMBCS_GRP_KO},
00333    {0x215B, 0x215E,  ULMBCS_GRP_EXCEPT},
00334    {0x2160, 0x2179,  ULMBCS_AMBIGUOUS_MBCS},
00335    {0x2190, 0x2195,  ULMBCS_GRP_EXCEPT},
00336    {0x2196, 0x2199,  ULMBCS_AMBIGUOUS_MBCS},
00337    {0x21A8, 0x21A8,  ULMBCS_GRP_EXCEPT},
00338    {0x21B8, 0x21B9,  ULMBCS_GRP_CN},
00339    {0x21D0, 0x21D5,  ULMBCS_GRP_EXCEPT},
00340    {0x21E7, 0x21E7,  ULMBCS_GRP_CN},
00341    {0x2200, 0x220B,  ULMBCS_GRP_EXCEPT},
00342    {0x220F, 0x2215,  ULMBCS_AMBIGUOUS_MBCS},
00343    {0x2219, 0x2220,  ULMBCS_GRP_EXCEPT},
00344    {0x2223, 0x2228,  ULMBCS_AMBIGUOUS_MBCS},
00345    {0x2229, 0x222B,  ULMBCS_GRP_EXCEPT},
00346    {0x222C, 0x223D,  ULMBCS_AMBIGUOUS_MBCS},
00347    {0x2245, 0x2248,  ULMBCS_GRP_EXCEPT},
00348    {0x224C, 0x224C,  ULMBCS_GRP_TW},
00349    {0x2252, 0x2252,  ULMBCS_AMBIGUOUS_MBCS},
00350    {0x2260, 0x2265,  ULMBCS_GRP_EXCEPT},
00351    {0x2266, 0x226F,  ULMBCS_AMBIGUOUS_MBCS},
00352    {0x2282, 0x2297,  ULMBCS_GRP_EXCEPT},
00353    {0x2299, 0x22BF,  ULMBCS_AMBIGUOUS_MBCS},
00354    {0x22C0, 0x22C0,  ULMBCS_GRP_EXCEPT},
00355    {0x2310, 0x2310,  ULMBCS_GRP_EXCEPT},
00356    {0x2312, 0x2312,  ULMBCS_AMBIGUOUS_MBCS},
00357    {0x2318, 0x2321,  ULMBCS_GRP_EXCEPT},
00358    {0x2318, 0x2321,  ULMBCS_GRP_CN}, /* NOTE(review): same range as the previous entry; FindLMBCSUniRange stops at the first entry whose end is >= the character, so this one is never selected -- confirm the intended range */
00359    {0x2460, 0x24E9,  ULMBCS_AMBIGUOUS_MBCS},
00360    {0x2500, 0x2500,  ULMBCS_AMBIGUOUS_SBCS},
00361    {0x2501, 0x2501,  ULMBCS_AMBIGUOUS_MBCS},
00362    {0x2502, 0x2502,  ULMBCS_AMBIGUOUS_SBCS},
00363    {0x2503, 0x2503,  ULMBCS_AMBIGUOUS_MBCS},
00364    {0x2504, 0x2505,  ULMBCS_GRP_TW},
00365    {0x2506, 0x2665,  ULMBCS_AMBIGUOUS_MBCS},
00366    {0x2666, 0x2666,  ULMBCS_GRP_EXCEPT},
00367    {0x2667, 0xFFFE,  ULMBCS_AMBIGUOUS_MBCS},
00368    {0xFFFF, 0xFFFF,  ULMBCS_GRP_UNICODE} /* must stay last: the scan in FindLMBCSUniRange has no other stop condition */
00369 };
00370    
00371 ulmbcs_byte_t 
00372 FindLMBCSUniRange(UChar uniChar)
00373 {
00374    struct _UniLMBCSGrpMap * pTable = UniLMBCSGrpMap;
00375 
00376    while (uniChar > pTable->uniEndRange) 
00377    {
00378       pTable++;
00379    }
00380 
00381    if (uniChar >= pTable->uniStartRange) 
00382    {
00383       return pTable->GrpType;
00384    }
00385    return ULMBCS_GRP_UNICODE;
00386 }
00387 
00388 /* 
00389 We also ask the creator of a converter to send in a preferred locale 
00390 that we can use in resolving ambiguous mappings. They send the locale
00391 in as a string, and we map it, if possible, to one of the 
00392 LMBCS groups. We use this table, and the associated code, to 
00393 do the lookup: */
00394 
00395 /**************************************************
00396   This table maps locale ID's to LMBCS opt groups.
00397   The default return is group 0x01. Note that for
00398   performance reasons, the table is sorted in
00399   increasing alphabetic order, with the notable
00400   exception of zh_TW. This is to force the check
00401   for Traditional Chinese before dropping back to
00402   Simplified.
00403 
00404   Note too that the Latin-1 groups have been
00405   commented out because it's the default, and
00406   this shortens the table, allowing a serial
00407   search to go quickly.
00408  *************************************************/
00409 
00410 struct _LocaleLMBCSGrpMap /* one locale-ID prefix and the LMBCS group preferred for it */
00411 {
00412    const char    *LocaleID; /* prefix compared against the caller's locale string; NULL marks the end of the table */
00413    ulmbcs_byte_t OptGroup;  /* group FindLMBCSLocale returns when LocaleID matches */
00414 }  LocaleLMBCSGrpMap[] =
00415 {
00416    "ar", ULMBCS_GRP_AR,
00417    "be", ULMBCS_GRP_RU,
00418    "bg", ULMBCS_GRP_L2, /* NOTE(review): be, mk, ru, sr and uk map to ULMBCS_GRP_RU -- confirm L2 is intended for bg */
00419    /* "ca", ULMBCS_GRP_L1, */
00420    "cs", ULMBCS_GRP_L2,
00421    /* "da", ULMBCS_GRP_L1, */
00422    /* "de", ULMBCS_GRP_L1, */
00423    "el", ULMBCS_GRP_GR,
00424    /* "en", ULMBCS_GRP_L1, */
00425    /* "es", ULMBCS_GRP_L1, */
00426    /* "et", ULMBCS_GRP_L1, */
00427    /* "fi", ULMBCS_GRP_L1, */
00428    /* "fr", ULMBCS_GRP_L1, */
00429    "he", ULMBCS_GRP_HE,
00430    "hu", ULMBCS_GRP_L2,
00431    /* "is", ULMBCS_GRP_L1, */
00432    /* "it", ULMBCS_GRP_L1, */
00433    "iw", ULMBCS_GRP_HE,
00434    "ja", ULMBCS_GRP_JA,
00435    "ko", ULMBCS_GRP_KO,
00436    /* "lt", ULMBCS_GRP_L1, */
00437    /* "lv", ULMBCS_GRP_L1, */
00438    "mk", ULMBCS_GRP_RU,
00439    /* "nl", ULMBCS_GRP_L1, */
00440    /* "no", ULMBCS_GRP_L1, */
00441    "pl", ULMBCS_GRP_L2,
00442    /* "pt", ULMBCS_GRP_L1, */
00443    "ro", ULMBCS_GRP_L2,
00444    "ru", ULMBCS_GRP_RU,
00445    "sh", ULMBCS_GRP_L2,
00446    "sk", ULMBCS_GRP_L2,
00447    "sl", ULMBCS_GRP_L2,
00448    "sq", ULMBCS_GRP_L2,
00449    "sr", ULMBCS_GRP_RU,
00450    /* "sv", ULMBCS_GRP_L1, */
00451    "th", ULMBCS_GRP_TH,
00452    "tr", ULMBCS_GRP_TR,
00453    "uk", ULMBCS_GRP_RU,
00454    /* "vi", ULMBCS_GRP_L1, */
00455    "zh_TW", ULMBCS_GRP_TW, /* must precede "zh": FindLMBCSLocale compares only the entry's length, so "zh" would also match "zh_TW" */
00456    "zh", ULMBCS_GRP_CN,
00457    NULL, ULMBCS_GRP_L1 /* terminator: FindLMBCSLocale stops at the NULL LocaleID and returns ULMBCS_GRP_L1 on its own */
00458 };
00459 
00460 
00461 ulmbcs_byte_t 
00462 FindLMBCSLocale(const char *LocaleID)
00463 {
00464    struct _LocaleLMBCSGrpMap *pTable = LocaleLMBCSGrpMap;
00465 
00466    if ((!LocaleID) || (!*LocaleID)) 
00467    {
00468       return 0;
00469    }
00470 
00471    while (pTable->LocaleID)
00472    {
00473       if (*pTable->LocaleID == *LocaleID) /* Check only first char for speed */
00474       {
00475          /* First char matches - check whole name, for entry-length */
00476          if (strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0)
00477             return pTable->OptGroup;
00478       }
00479       else
00480       if (*pTable->LocaleID > *LocaleID) /* Sorted alphabetically - exit */
00481          break;
00482       pTable++;
00483    }
00484    return ULMBCS_GRP_L1;
00485 }
00486 
00487 
00488 /* 
00489   Before we get to the main body of code, here's how we hook up to the rest 
00490   of ICU. ICU converters are required to define a structure that includes 
00491   some function pointers, and some common data, in the style of a C++
00492   vtable. There is also room in there for converter-specific data. LMBCS
00493   uses that converter-specific data to keep track of the 12 subconverters
00494   we use, the optimization group, and the group (if any) that matches the 
00495   locale. We have one structure instantiated for each of the 12 possible
00496   optimization groups. To avoid typos & to avoid boring the reader, we 
00497   put the declarations of these structures and functions into macros. To see 
00498   the definitions of these structures, see unicode\ucnv_bld.h
00499 */
00500 
00501 
00502 
00503 #define DECLARE_LMBCS_DATA(n) \
00504  static const UConverterImpl _LMBCSImpl##n={\
00505     UCNV_LMBCS_##n,\
00506     NULL,NULL,\
00507     _LMBCSOpen##n,\
00508     _LMBCSClose,\
00509     NULL,\
00510     _LMBCSToUnicodeWithOffsets,\
00511     _LMBCSToUnicodeWithOffsets,\
00512     _LMBCSFromUnicode,\
00513     _LMBCSFromUnicode,\
00514     _LMBCSGetNextUChar,\
00515     NULL\
00516 };\
00517 const UConverterStaticData _LMBCSStaticData##n={\
00518   sizeof(UConverterStaticData),\
00519  "LMBCS-"  #n,\
00520     0, UCNV_IBM, UCNV_LMBCS_##n, 1, 1,\
00521     { 0x3f, 0, 0, 0 },1,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
00522 };\
00523 const UConverterSharedData _LMBCSData##n={\
00524     sizeof(UConverterSharedData), ~((uint32_t) 0),\
00525     NULL, NULL, &_LMBCSStaticData##n, FALSE, &_LMBCSImpl##n, \
00526     0 \
00527 };
00528 
00529  /* The only function we needed to duplicate 12 times was the 'open'
00530 function, which will do basically the same thing except set a  different
00531 optimization group. So, we put the common stuff into a worker function, 
00532 and set up another macro to stamp out the 12 open functions:*/
00533 #define DEFINE_LMBCS_OPEN(n) \
00534 static void \
00535    _LMBCSOpen##n(UConverter*  _this,const char* name,const char* locale,uint32_t options,UErrorCode*  err) \
00536 { _LMBCSOpenWorker(_this, name,locale,options, err, n);} 
00537 
00538 
00539 
00540 /* Here's the prototypes for the functions we will put into the ICU structures:
00541 */
00542 
00543 void 
00544 _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *args,
00545                            UErrorCode*    err);         /* Std ICU err code */
00546 
00547 
00548 void 
00549 _LMBCSFromUnicode(UConverterFromUnicodeArgs *args,
00550                   UErrorCode*     err);
00551 
00552 UChar32 
00553 _LMBCSGetNextUChar(UConverterToUnicodeArgs *args,
00554                    UErrorCode*   err);
00555 
00556 
00557 /* Here's the open worker & the common close function */
00558 static void 
00559 _LMBCSOpenWorker(UConverter*  _this, 
00560                        const char*  name, 
00561                        const char*  locale,
00562                        uint32_t options,
00563                        UErrorCode*  err,
00564                        ulmbcs_byte_t OptGroup
00565                        )
00566 {
00567    UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
00568    if(extraInfo != NULL)
00569     {
00570        ulmbcs_byte_t i;
00571        ulmbcs_byte_t imax;
00572        imax = sizeof(extraInfo->OptGrpConverter)/sizeof(extraInfo->OptGrpConverter[0]);
00573 
00574        for (i=0; i < imax; i++)         
00575        {
00576             extraInfo->OptGrpConverter[i] =
00577                (OptGroupByteToCPName[i] != NULL) ? 
00578                ucnv_open(OptGroupByteToCPName[i], err) : NULL;
00579        }
00580        extraInfo->OptGroup = OptGroup;
00581        extraInfo->localeConverterIndex = FindLMBCSLocale(locale);
00582    } 
00583    else
00584    {
00585        *err = U_MEMORY_ALLOCATION_ERROR;
00586    }
00587    _this->extraInfo = extraInfo;
00588 }
00589 
00590 static void 
00591 _LMBCSClose(UConverter *   _this) 
00592 {
00593     if (_this->extraInfo != NULL)
00594     {
00595         ulmbcs_byte_t Ix;
00596         UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
00597 
00598         for (Ix=0; Ix < ULMBCS_GRP_UNICODE; Ix++)
00599         {
00600            if (extraInfo->OptGrpConverter[Ix] != NULL)
00601               ucnv_close (extraInfo->OptGrpConverter[Ix]);
00602         }
00603         uprv_free (_this->extraInfo);
00604     }
00605 }
00606 
00607 /* And now, the macroized declarations of data & functions: */
00608 DEFINE_LMBCS_OPEN(1)
00609 DEFINE_LMBCS_OPEN(2)
00610 DEFINE_LMBCS_OPEN(3)
00611 DEFINE_LMBCS_OPEN(4)
00612 DEFINE_LMBCS_OPEN(5)
00613 DEFINE_LMBCS_OPEN(6)
00614 DEFINE_LMBCS_OPEN(8)
00615 DEFINE_LMBCS_OPEN(11)
00616 DEFINE_LMBCS_OPEN(16)
00617 DEFINE_LMBCS_OPEN(17)
00618 DEFINE_LMBCS_OPEN(18)
00619 DEFINE_LMBCS_OPEN(19)
00620 
00621 
00622 DECLARE_LMBCS_DATA(1)
00623 DECLARE_LMBCS_DATA(2)
00624 DECLARE_LMBCS_DATA(3)
00625 DECLARE_LMBCS_DATA(4)
00626 DECLARE_LMBCS_DATA(5)
00627 DECLARE_LMBCS_DATA(6)
00628 DECLARE_LMBCS_DATA(8)
00629 DECLARE_LMBCS_DATA(11)
00630 DECLARE_LMBCS_DATA(16)
00631 DECLARE_LMBCS_DATA(17)
00632 DECLARE_LMBCS_DATA(18)
00633 DECLARE_LMBCS_DATA(19)
00634 
00635 /* 
00636 Here's an all-crash stop for debugging, since ICU does not have asserts.
00637 Turn this on by defining LMBCS_DEBUG, or by changing it to 
00638 #if 1 
00639 */
00640 #if LMBCS_DEBUG
00641 #define MyAssert(b) {if (!(b)) {*(char *)0 = 1;}}
00642 #else
00643 #define MyAssert(b) 
00644 #endif
00645 
00646 /* 
00647    Here's the basic helper function that we use when converting from
00648    Unicode to LMBCS, and we suspect that a Unicode character will fit into 
00649    one of the 12 groups. The return value is the number of bytes written 
00650    starting at pStartLMBCS (if any).
00651 */
00652 
00653 size_t
00654 LMBCSConversionWorker (
00655    UConverterDataLMBCS * extraInfo,    /* subconverters, opt & locale groups */
00656    ulmbcs_byte_t group,                /* The group to try */
00657    ulmbcs_byte_t  * pStartLMBCS,              /* where to put the results */
00658    UChar * pUniChar,                   /* The input unicode character */
00659    ulmbcs_byte_t * lastConverterIndex, /* output: track last successful group used */
00660    UBool * groups_tried                /* output: track any unsuccessful groups */
00661 )   
00662 {
00663    ulmbcs_byte_t  * pLMBCS = pStartLMBCS;
00664    UConverter * xcnv = extraInfo->OptGrpConverter[group];
00665 
00666    ulmbcs_byte_t  mbChar [ULMBCS_CHARSIZE_MAX];
00667    ulmbcs_byte_t  * pmbChar = mbChar;
00668    UBool isDoubleByteGroup = (UBool)((group >= ULMBCS_DOUBLEOPTGROUP_START) ? TRUE : FALSE);
00669    UErrorCode localErr = U_ZERO_ERROR;
00670    int bytesConverted =0;
00671 
00672    MyAssert(xcnv);
00673    MyAssert(group<ULMBCS_GRP_UNICODE);
00674 
00675    ucnv_fromUnicode(
00676       xcnv, 
00677       (char **)&pmbChar,(char *)mbChar+sizeof(mbChar),
00678       (const UChar **)&pUniChar,pUniChar+1,
00679       NULL,TRUE,&localErr);
00680    bytesConverted = pmbChar - mbChar;
00681    pmbChar = mbChar;
00682 
00683    /* most common failure mode is the sub-converter using the substitution char (0x7f for our converters)
00684    */
00685    if (*mbChar == xcnv->subChar[0] || U_FAILURE(localErr) || !bytesConverted )
00686    {
00687       groups_tried[group] = TRUE;
00688       return 0;
00689    }
00690    *lastConverterIndex = group;
00691 
00692    /* All initial byte values in lower ascii range should have been caught by now,
00693       except with the exception group.
00694     */
00695    MyAssert((*pmbChar <= ULMBCS_C0END) || (*pmbChar >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
00696    
00697    /* use converted data: first write 0, 1 or two group bytes */
00698    if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group)
00699    {
00700       *pLMBCS++ = group;
00701       if (bytesConverted == 1 && isDoubleByteGroup)
00702       {
00703          *pLMBCS++ = group;
00704       }
00705    }
00706    /* then move over the converted data */
00707    do 
00708    {
00709       *pLMBCS++ = *pmbChar++;
00710    } 
00711    while(--bytesConverted);   
00712       
00713    return (pLMBCS - pStartLMBCS);
00714 }
00715 
00716 
00717 /* This is a much simpler version of above, when we 
00718 know we are writing LMBCS using the Unicode group
00719 */
00720 size_t 
00721 LMBCSConvertUni(ulmbcs_byte_t * pLMBCS, UChar uniChar)  
00722 {
00723      /* encode into LMBCS Unicode range */
00724    uint8_t LowCh =   (uint8_t)(uniChar & 0x00FF);
00725    uint8_t HighCh  = (uint8_t)(uniChar >> 8);
00726 
00727    *pLMBCS++ = ULMBCS_GRP_UNICODE;
00728 
00729    if (LowCh == 0)
00730    {
00731       *pLMBCS++ = ULMBCS_UNICOMPATZERO;
00732       *pLMBCS++ = HighCh;
00733    }
00734    else
00735    {
00736       *pLMBCS++ = HighCh;
00737       *pLMBCS++ = LowCh;
00738    }
00739    return ULMBCS_UNICODE_SIZE;
00740 }
00741 
00742 
00743 
00744 /* The main Unicode to LMBCS conversion function */
00745 void 
00746 _LMBCSFromUnicode(UConverterFromUnicodeArgs*     args,
00747                   UErrorCode*     err)
00748 {
00749    ulmbcs_byte_t lastConverterIndex = 0;
00750    UChar uniChar;
00751    ulmbcs_byte_t  LMBCS[ULMBCS_CHARSIZE_MAX];
00752    ulmbcs_byte_t  * pLMBCS;
00753    int bytes_written;
00754    UBool groups_tried[ULMBCS_GRP_LAST];
00755    UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
00756    int sourceIndex = 0; 
00757 
00758 
00759    /* Basic strategy: attempt to fill in local LMBCS 1-char buffer.(LMBCS)
00760       If that succeeds, see if it will all fit into the target & copy it over 
00761       if it does.
00762 
00763       We try conversions in the following order:
00764 
00765       1. Single-byte ascii & special fixed control chars (&null)
00766       2. Look up group in table & try that (could be 
00767             A) Unicode group
00768             B) control group,
00769             C) national encoding, 
00770                or ambiguous SBCS or MBCS group (on to step 4...)
00771         
00772       3. If its ambiguous, try this order:
00773          A) The optimization group
00774          B) The locale group
00775          C) The last group that succeeded with this string.
00776          D) every other group that's relevent (single or double)
00777          E) If its single-byte ambiguous, try the exceptions group
00778 
00779       4. And as a grand fallback: Unicode
00780    */
00781 
00782    while (args->source < args->sourceLimit && !U_FAILURE(*err))
00783    {
00784       if (args->target >= args->targetLimit)
00785       {
00786          *err = U_BUFFER_OVERFLOW_ERROR;
00787          break;
00788       }
00789       uniChar = *(args->source);
00790       bytes_written = 0;
00791       pLMBCS = LMBCS;
00792 
00793       /* check cases in rough order of how common they are, for speed */
00794 
00795       /* single byte matches: strategy 1 */
00796 
00797       if (((uniChar > ULMBCS_C0END) && (uniChar < ULMBCS_C1START)) ||
00798           uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR || 
00799           uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE 
00800           )
00801       {
00802          *pLMBCS++ = (ulmbcs_byte_t ) uniChar;
00803          bytes_written = 1;
00804       }
00805 
00806 
00807       if (!bytes_written) 
00808       {
00809          /* Check by UNICODE range (Strategy 2) */
00810          ulmbcs_byte_t group = FindLMBCSUniRange(uniChar);
00811          
00812          if (group == ULMBCS_GRP_UNICODE)  /* (Strategy 2A) */
00813          {
00814             pLMBCS += LMBCSConvertUni(pLMBCS,uniChar);
00815             
00816             bytes_written = pLMBCS - LMBCS;
00817          }
00818          else if (group == ULMBCS_GRP_CTRL)  /* (Strategy 2B) */
00819          {
00820             /* Handle control characters here */
00821             if (uniChar <= ULMBCS_C0END)
00822             {
00823                *pLMBCS++ = ULMBCS_GRP_CTRL;
00824                *pLMBCS++ = (ulmbcs_byte_t)(ULMBCS_CTRLOFFSET + uniChar);
00825             }
00826             else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET)
00827             {
00828                *pLMBCS++ = ULMBCS_GRP_CTRL;
00829                *pLMBCS++ = (ulmbcs_byte_t ) (uniChar & 0x00FF);
00830             }
00831             bytes_written = pLMBCS - LMBCS;
00832          }
00833          else if (group < ULMBCS_GRP_UNICODE)  /* (Strategy 2C) */
00834          {
00835             /* a specific converter has been identified - use it */
00836             bytes_written = LMBCSConversionWorker (
00837                               extraInfo, group, pLMBCS, &uniChar, 
00838                               &lastConverterIndex, groups_tried);
00839          }
00840          if (!bytes_written)    /* the ambiguous group cases  (Strategy 3) */
00841          {
00842             memset(groups_tried, 0, sizeof(groups_tried));
00843 
00844          /* check for non-default optimization group (Strategy 3A )*/
00845             if (extraInfo->OptGroup != 1 
00846                   && ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup)) 
00847             {
00848                bytes_written = LMBCSConversionWorker (extraInfo, 
00849                   extraInfo->OptGroup, pLMBCS, &uniChar, 
00850                   &lastConverterIndex, groups_tried);
00851             }
00852             /* check for locale optimization group (Strategy 3B) */
00853             if (!bytes_written 
00854                && (extraInfo->localeConverterIndex) 
00855                && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex)))
00856                {
00857                   bytes_written = LMBCSConversionWorker (extraInfo, 
00858                      extraInfo->localeConverterIndex, pLMBCS, &uniChar, 
00859                      &lastConverterIndex, groups_tried);
00860                }
00861             /* check for last optimization group used for this string (Strategy 3C) */
00862             if (!bytes_written 
00863                 && (lastConverterIndex) 
00864                && (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex)))
00865                {
00866                   bytes_written = LMBCSConversionWorker (extraInfo, 
00867                      lastConverterIndex, pLMBCS, &uniChar, 
00868                      &lastConverterIndex, groups_tried);
00869            
00870                }
00871             if (!bytes_written)
00872             {
00873                /* just check every possible matching converter (Strategy 3D) */ 
00874                ulmbcs_byte_t grp_start;
00875                ulmbcs_byte_t grp_end;  
00876                ulmbcs_byte_t grp_ix;
00877                grp_start = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 
00878                         ? ULMBCS_DOUBLEOPTGROUP_START 
00879                         :  ULMBCS_GRP_L1);
00880                grp_end = (ulmbcs_byte_t)((group == ULMBCS_AMBIGUOUS_MBCS) 
00881                         ? ULMBCS_GRP_LAST 
00882                         :  ULMBCS_GRP_TH);
00883                for (grp_ix = grp_start;
00884                    grp_ix <= grp_end && !bytes_written; 
00885                     grp_ix++)
00886                {
00887                   if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix])
00888                   {
00889                      bytes_written = LMBCSConversionWorker (extraInfo, 
00890                        grp_ix, pLMBCS, &uniChar, 
00891                        &lastConverterIndex, groups_tried);
00892                   }
00893                }
00894                 /* a final conversion fallback to the exceptions group if its likely 
00895                      to be single byte  (Strategy 3E) */
00896                if (!bytes_written && grp_start == ULMBCS_GRP_L1)
00897                {
00898                   bytes_written = LMBCSConversionWorker (extraInfo, 
00899                      ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar, 
00900                      &lastConverterIndex, groups_tried);
00901                }
00902             }
00903             /* all of our other strategies failed. Fallback to Unicode. (Strategy 4)*/
00904             if (!bytes_written)
00905             {
00906 
00907                pLMBCS += LMBCSConvertUni(pLMBCS, uniChar);
00908                bytes_written = pLMBCS - LMBCS;
00909             }
00910          }
00911       }
00912   
00913       /* we have a translation. increment source and write as much as posible to target */
00914       args->source++;
00915       pLMBCS = LMBCS;
00916       while (args->target < args->targetLimit && bytes_written--)
00917       {
00918          *(args->target)++ = *pLMBCS++;
00919          if (args->offsets)
00920          {
00921             *(args->offsets)++ = sourceIndex;
00922          }
00923       }
00924       sourceIndex++;
00925       if (bytes_written > 0)
00926       {
00927          /* write any bytes that didn't fit in target to the error buffer,
00928             common code will move this to target if we get called back with
00929             enough target room
00930          */
00931          uint8_t * pErrorBuffer = args->converter->charErrorBuffer;
00932          *err = U_BUFFER_OVERFLOW_ERROR;
00933          args->converter->charErrorBufferLength = (int8_t)bytes_written;
00934          while (bytes_written--)
00935          {
00936             *pErrorBuffer++ = *pLMBCS++;
00937          }
00938       }
00939    }     
00940 }
00941 
00942 
00943 /* Now, the Unicode from LMBCS section */
00944 
00945 
00946 /* A function to call when we are looking at the Unicode group byte in LMBCS */
00947 UChar
00948 GetUniFromLMBCSUni(char const ** ppLMBCSin)  /* Called with LMBCS-style Unicode byte stream */
00949 {
00950    uint8_t  HighCh = *(*ppLMBCSin)++;  /* Big-endian Unicode in LMBCS compatibility group*/
00951    uint8_t  LowCh  = *(*ppLMBCSin)++;
00952 
00953    if (HighCh == ULMBCS_UNICOMPATZERO ) 
00954    {
00955       HighCh = LowCh;
00956       LowCh = 0; /* zero-byte in LSB special character */
00957    }
00958    return (UChar)((HighCh << 8) | LowCh);
00959 }
00960 
00961 
00962 
/* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least 'index' 
   bytes left in source up to sourceLimit. If there are not, it sets *err to 
   U_TRUNCATED_CHAR_FOUND, rewinds args->source to saveSource and returns 
   0xffff from the enclosing function.

   Relies on 'args', 'err' and 'saveSource' being in scope at the point of use.
   Wrapped in do { } while (0) so that it behaves as a single statement 
   (safe inside an unbraced if/else); use it followed by a semicolon.
*/

#define CHECK_SOURCE_LIMIT(index) \
     do {\
         if (args->source+(index) > args->sourceLimit){\
             *err = U_TRUNCATED_CHAR_FOUND;\
             args->source = saveSource;\
             return 0xffff;}\
     } while (0)
00972 
00973 
00974 /* Return the Unicode representation for the current LMBCS character
00975 
00976    This worker function is used by both ucnv_getNextUChar() and ucnv_ToUnicode().  
00977    The last parameter says whether the return value should be treated as UTF-16 or
00978    UTF-32. The only difference is in surrogate handling
00979 */
00980 
00981 UChar32 
00982 _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
00983                          UErrorCode*   err,
00984                          UBool         returnUTF32)
00985 {
00986    ulmbcs_byte_t   CurByte; /* A byte from the input stream */
00987    UChar32 uniChar;    /* an output UNICODE char */
00988    const char * saveSource;
00989   
00990    /* error check */
00991    if (args->source >= args->sourceLimit)
00992    {
00993       *err = U_ILLEGAL_ARGUMENT_ERROR;
00994       return 0xffff;
00995    }
00996    /* Grab first byte & save address for error recovery */
00997    CurByte = *((ulmbcs_byte_t  *) (saveSource = args->source++));
00998    
00999    /*
01000     * at entry of each if clause:
01001     * 1. 'CurByte' points at the first byte of a LMBCS character
01002     * 2. '*source'points to the next byte of the source stream after 'CurByte' 
01003     *
01004     * the job of each if clause is:
01005     * 1. set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte)
01006     * 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately
01007     */
01008    
01009    /* First lets check the simple fixed values. */
01010 
01011    if(((CurByte > ULMBCS_C0END) && (CurByte < ULMBCS_C1START)) /* ascii range */
01012       ||  (CurByte == 0) 
01013       ||  CurByte == ULMBCS_HT || CurByte == ULMBCS_CR 
01014       ||  CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE)
01015    {
01016       uniChar = CurByte;
01017    }
01018    else  
01019    {
01020       UConverterDataLMBCS * extraInfo;
01021       ulmbcs_byte_t group; 
01022       UConverter* cnv; 
01023             
01024       if (CurByte == ULMBCS_GRP_CTRL)  /* Control character group - no opt group update */
01025       {
01026          ulmbcs_byte_t  C0C1byte;
01027          CHECK_SOURCE_LIMIT(1);
01028          C0C1byte = *(args->source)++;
01029          uniChar = (C0C1byte < ULMBCS_C1START) ? C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte;
01030       }
01031       else 
01032       if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
01033       {
01034          UChar second;
01035          CHECK_SOURCE_LIMIT(2);
01036          
01037          uniChar = GetUniFromLMBCSUni(&(args->source));
01038             
01039          /* at this point we are usually done, but we need to make sure we are not in 
01040          a situation where we can successfully put together a surrogate pair */
01041 
01042          if(returnUTF32 && UTF_IS_FIRST_SURROGATE(uniChar) && (args->source+3 <= args->sourceLimit)
01043             && *(args->source)++ == ULMBCS_GRP_UNICODE
01044             && UTF_IS_SECOND_SURROGATE(second = GetUniFromLMBCSUni(&(args->source))))
01045          {
01046                uniChar = UTF16_GET_PAIR_VALUE(uniChar, second);
01047          }
01048       }
01049       else if (CurByte <= ULMBCS_CTRLOFFSET)  
01050       {
01051          group = CurByte;                   /* group byte is in the source */
01052          extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
01053          cnv = extraInfo->OptGrpConverter[group];
01054       
01055          if (!cnv)
01056          {
01057             /* this is not a valid group byte - no converter*/
01058             *err = U_INVALID_CHAR_FOUND;
01059          }
01060       
01061          else if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
01062          {
01063 
01064             CHECK_SOURCE_LIMIT(2);
01065 
01066             /* check for LMBCS doubled-group-byte case */
01067             if (*args->source == group) {
01068                /* single byte */
01069                ++args->source;
01070                uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1, FALSE);
01071             } else {
01072                /* double byte */
01073                const char *newLimit = args->source + 2;
01074                uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, newLimit, FALSE);
01075                args->source = newLimit; /* set the correct limit even in case of an error */
01076             }
01077          }
01078          else {                                  /* single byte conversion */
01079             CHECK_SOURCE_LIMIT(1);
01080             CurByte = *(args->source)++;
01081             
01082             if (CurByte >= ULMBCS_C1START)
01083             {
01084                uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte];
01085             }
01086             else
01087             {
01088             /* The non-optimizable oddballs where there is an explicit byte 
01089              * AND the second byte is not in the upper ascii range
01090             */
01091                const char *s;
01092                char bytes[2];
01093 
01094                extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
01095                cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT];  
01096             
01097             /* Lookup value must include opt group */
01098                bytes[0] = group;
01099                bytes[1] = CurByte;
01100                s = bytes;
01101                uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &s, bytes + 2, FALSE);
01102             }
01103          }
01104       }
01105       else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */
01106       {
01107          extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
01108          group = extraInfo->OptGroup;
01109          cnv = extraInfo->OptGrpConverter[group];
01110          if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
01111          {
01112             if (!_MBCSIsLeadByte(cnv->sharedData, CurByte))
01113             {
01114                CHECK_SOURCE_LIMIT(0);
01115 
01116                /* let the MBCS conversion consume CurByte again */
01117                --args->source;
01118                uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1, FALSE);
01119             }
01120             else
01121             {
01122                CHECK_SOURCE_LIMIT(1);
01123 
01124                /* let the MBCS conversion consume CurByte again */
01125                --args->source;
01126 
01127                /* since we know that we start at a lead byte, args->source _will_ be incremented by 2 */
01128                uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 2, FALSE);
01129             }
01130          }
01131          else                                   /* single byte conversion */
01132          {
01133             uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte];
01134          }
01135       }
01136    }
01137    if (((uint32_t)uniChar - 0xfffe) <= 1) /* 0xfffe<=uniChar<=0xffff */
01138    {
01139        /*It is very likely that the ErrorFunctor will write to the
01140        *internal buffers */
01141 
01142       /* This code needs updating when new error callbacks are installed */
01143       UConverterToUnicodeArgs cbArgs = *args;
01144       UChar * pUniChar = (UChar *)&uniChar;
01145       UConverterCallbackReason reason;
01146 
01147       if (uniChar == 0xfffe)
01148       {
01149         reason = UCNV_UNASSIGNED;
01150         *err = U_INVALID_CHAR_FOUND;
01151       }
01152       else
01153       {
01154         reason = UCNV_ILLEGAL;
01155         *err = U_ILLEGAL_CHAR_FOUND;
01156       }
01157 
01158       cbArgs.target = pUniChar;
01159       cbArgs.targetLimit = pUniChar + 1;
01160       cbArgs.converter->fromCharErrorBehaviour(cbArgs.converter->toUContext,
01161                                     &cbArgs,
01162                                     saveSource,
01163                                     args->sourceLimit - saveSource,
01164                                     reason,
01165                                     err);
01166    }
01167    return uniChar;
01168 }
01169 
01170 
01171 /* The exported function that gets one UTF32 character from a LMBCS stream
01172 */
01173 UChar32 
01174 _LMBCSGetNextUChar(UConverterToUnicodeArgs*   args,
01175                    UErrorCode*   err)
01176 {
01177    return _LMBCSGetNextUCharWorker(args, err, TRUE);
01178 }
01179 
01180 /* The exported function that converts lmbcs to one or more
01181    UChars - currently UTF-16
01182 */
01183 void 
01184 _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs*    args,
01185                      UErrorCode*    err)
01186 {
01187    UChar uniChar;    /* one output UNICODE char */
01188    const char * saveSource;
01189    const char * pStartLMBCS = args->source;  /* beginning of whole string */
01190 
01191    if (args->targetLimit == args->target)         /* error check may belong in common code */
01192    {
01193       *err = U_BUFFER_OVERFLOW_ERROR;
01194       return;
01195    }
01196    
01197    /* Process from source to limit, or until error */
01198    while (!*err && args->sourceLimit > args->source && args->targetLimit > args->target)
01199    {
01200       saveSource = args->source; /* beginning of current code point */
01201 
01202       if (args->converter->invalidCharLength) /* reassemble char from previous call */
01203       {
01204          char LMBCS [ULMBCS_CHARSIZE_MAX];
01205          char *pLMBCS = LMBCS, *saveSource, *saveSourceLimit; 
01206          size_t size_old = args->converter->invalidCharLength;
01207 
01208          /* limit from source is either reminder of temp buffer, or user limit on source */
01209          size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
01210          size_t size_new_maybe_2 = args->sourceLimit - args->source;
01211          size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
01212          
01213       
01214          uprv_memcpy(LMBCS, args->converter->invalidCharBuffer, size_old);
01215          uprv_memcpy(LMBCS + size_old, args->source, size_new);
01216          saveSource = (char*)args->source;
01217          saveSourceLimit = (char*)args->sourceLimit;
01218          args->source = pLMBCS;
01219          args->sourceLimit = pLMBCS+size_old+size_new;
01220          uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
01221          pLMBCS = (char*)args->source;
01222          args->source =saveSource;
01223          args->sourceLimit = saveSourceLimit;
01224          args->source += (pLMBCS - LMBCS - size_old);
01225 
01226          if (*err == U_TRUNCATED_CHAR_FOUND && !args->flush)
01227          {
01228             /* evil special case: source buffers so small a char spans more than 2 buffers */
01229             int8_t savebytes = (int8_t)(size_old+size_new);
01230             args->converter->invalidCharLength = savebytes;
01231             uprv_memcpy(args->converter->invalidCharBuffer, LMBCS, savebytes);
01232             args->source = args->sourceLimit;
01233             *err = U_ZERO_ERROR;
01234             return;
01235          }
01236          else
01237          {
01238             /* clear the partial-char marker */
01239             args->converter->invalidCharLength = 0;
01240          }
01241       }
01242       else
01243       {
01244          uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
01245       }
01246       if (U_SUCCESS(*err))
01247       {
01248          if (uniChar < 0xfffe)
01249          {
01250             *(args->target)++ = uniChar;
01251             if(args->offsets)
01252             {
01253                *(args->offsets)++ = saveSource - pStartLMBCS;
01254             }
01255          }
01256          else if (uniChar == 0xfffe)
01257          {
01258             *err = U_INVALID_CHAR_FOUND;
01259          }
01260          else /* if (uniChar == 0xffff) */
01261          {
01262             *err = U_ILLEGAL_CHAR_FOUND;
01263          }
01264       }
01265    }
01266    /* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
01267    if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
01268    {
01269       *err = U_BUFFER_OVERFLOW_ERROR;
01270    }
01271 
01272    /* If character incomplete, store away partial char if more to come */
01273    if ((*err == U_TRUNCATED_CHAR_FOUND) && !args->flush )
01274          {
01275       int8_t savebytes = (int8_t)(args->sourceLimit - saveSource);
01276       args->converter->invalidCharLength = (int8_t)savebytes;
01277       uprv_memcpy(args->converter->invalidCharBuffer, saveSource, savebytes);
01278       args->source = args->sourceLimit;
01279       *err = U_ZERO_ERROR;
01280    }
01281 }
01282 
01283 
01284 
01285 

Generated at Tue Dec 5 10:48:02 2000 for ICU by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000