Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

ustring.c

00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 1998-1999, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *
00009 * File ustring.c
00010 *
00011 * Modification History:
00012 *
00013 *   Date        Name        Description
00014 *   12/07/98    bertrand    Creation.
00015 *******************************************************************************
00016 */
00017 
00018 #include "unicode/ustring.h"
00019 #include "unicode/utypes.h"
00020 #include "cstring.h"
00021 #include "umutex.h"
00022 #include "unicode/ucnv.h"
00023 
00024 /* forward declaractions of definitions for the shared default converter */
00025 
00026 static UConverter *fgDefaultConverter = NULL;
00027 
00028 static UConverter*
00029 getDefaultConverter(void);
00030 
00031 static void
00032 releaseDefaultConverter(UConverter *converter);
00033 
00034 /* ANSI string.h - style functions ------------------------------------------ */
00035 
00036 #define MAX_STRLEN 0x0FFFFFFF
00037 
00038 UChar*
00039 u_strcat(UChar     *dst, 
00040     const UChar     *src)
00041 {
00042   UChar *anchor = dst;            /* save a pointer to start of dst */
00043 
00044   while(*dst != 0) {              /* To end of first string          */
00045     ++dst;
00046   }
00047   while((*dst = *src) != 0) {     /* copy string 2 over              */
00048     ++dst;
00049     ++src;
00050   }
00051 
00052   return anchor;
00053 }
00054 
00055 UChar* 
00056 u_strncat(UChar     *dst, 
00057      const UChar     *src, 
00058      int32_t     n ) 
00059 {
00060   if(n > 0) {
00061     UChar *anchor = dst;            /* save a pointer to start of dst */
00062 
00063     while(*dst != 0) {              /* To end of first string          */
00064       ++dst;
00065     }
00066     while((*dst = *src) != 0) {     /* copy string 2 over              */
00067       ++dst;
00068       if(--n == 0) {
00069         *dst = 0;
00070         break;
00071       }
00072       ++src;
00073     }
00074   
00075     return anchor;
00076   } else {
00077     return dst;
00078   }
00079 }
00080 
00081 UChar*
00082 u_strchr(const UChar *s, UChar c) 
00083 {
00084   while (*s && *s != c) {
00085     ++s;
00086   }
00087   if (*s == c)
00088     return (UChar *)s;
00089   return NULL;
00090 }
00091 
00092 /* A Boyer-Moore algorithm would be better, but that would require a hashtable
00093    because UChar is so big. This algorithm doesn't use a lot of extra memory.
00094  */
00095 U_CAPI UChar * U_EXPORT2
00096 u_strstr(const UChar *s, const UChar *substring) {
00097 
00098   UChar *strItr, *subItr;
00099 
00100   if (*substring == 0) {
00101     return (UChar *)s;
00102   }
00103 
00104   do {
00105     strItr = (UChar *)s;
00106     subItr = (UChar *)substring;
00107 
00108     /* Only one string iterator needs checking for null terminator */
00109     while ((*strItr != 0) && (*strItr == *subItr)) {
00110       strItr++;
00111       subItr++;
00112     }
00113 
00114     if (*subItr == 0) {             /* Was the end of the substring reached? */
00115       return (UChar *)s;
00116     }
00117 
00118     s++;
00119   } while (*strItr != 0);           /* Was the end of the string reached? */
00120 
00121   return NULL;                      /* No match */
00122 }
00123 
00124 U_CAPI UChar * U_EXPORT2
00125 u_strchr32(const UChar *s, UChar32 c) {
00126   if(!UTF_NEED_MULTIPLE_UCHAR(c)) {
00127     return u_strchr(s, (UChar)c);
00128   } else {
00129     UChar buffer[UTF_MAX_CHAR_LENGTH + 1];
00130     UTextOffset i = 0;
00131     UTF_APPEND_CHAR_UNSAFE(buffer, i, c);
00132     buffer[i] = 0;
00133     return u_strstr(s, buffer);
00134   }
00135 }
00136 
00137 int32_t  
00138 u_strcmp(const UChar *s1, 
00139     const UChar *s2) 
00140 {
00141   int32_t rc;
00142   for(;;) {
00143     rc = (int32_t)*s1 - (int32_t)*s2;
00144     if(rc != 0 || *s1 == 0) {
00145       return rc;
00146     }
00147     ++s1;
00148     ++s2;
00149   }
00150 }
00151 
00152 int32_t  
00153 u_strncmp(const UChar     *s1, 
00154      const UChar     *s2, 
00155      int32_t     n) 
00156 {
00157   if(n > 0) {
00158     int32_t rc;
00159     for(;;) {
00160       rc = (int32_t)*s1 - (int32_t)*s2;
00161       if(rc != 0 || *s1 == 0 || --n == 0) {
00162         return rc;
00163       }
00164       ++s1;
00165       ++s2;
00166     }
00167   } else {
00168     return 0;
00169   }
00170 }
00171 
00172 UChar*
00173 u_strcpy(UChar     *dst, 
00174     const UChar     *src) 
00175 {
00176   UChar *anchor = dst;            /* save a pointer to start of dst */
00177 
00178   while((*dst = *src) != 0) {     /* copy string 2 over              */
00179     ++dst;
00180     ++src;
00181   }
00182 
00183   return anchor;
00184 }
00185 
00186 UChar* 
00187 u_strncpy(UChar     *dst, 
00188      const UChar     *src, 
00189      int32_t     n) 
00190 {
00191   UChar *anchor = dst;            /* save a pointer to start of dst */
00192 
00193   if(n > 0) {
00194     while((*dst = *src) != 0) {   /* copy string 2 over              */
00195       ++dst;
00196       if(--n == 0) {
00197         *dst = 0;
00198         break;
00199       }
00200       ++src;
00201     }
00202   } else {
00203     *dst = 0;
00204   }
00205 
00206   return anchor;
00207 }
00208 
00209 int32_t  
00210 u_strlen(const UChar *s) 
00211 {
00212 # if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
00213     return uprv_wcslen(s);
00214 # else
00215     const UChar *t = s;
00216     while(*t != 0) {
00217       ++t;
00218     }
00219     return t - s;
00220 #endif
00221 }
00222 
00223 /* conversions between char* and UChar* ------------------------------------- */
00224 
00225 UChar* u_uastrcpy(UChar *ucs1,
00226           const char *s2 )
00227 {
00228   UConverter *cnv = getDefaultConverter();
00229   if(cnv != NULL) {
00230     UErrorCode err = U_ZERO_ERROR;
00231     ucnv_toUChars(cnv,
00232                     ucs1,
00233                     MAX_STRLEN,
00234                     s2,
00235                     uprv_strlen(s2),
00236                     &err);
00237     releaseDefaultConverter(cnv);
00238     if(U_FAILURE(err)) {
00239       *ucs1 = 0;
00240     }
00241   } else {
00242     *ucs1 = 0;
00243   }
00244   return ucs1;
00245 }
00246 
/*
 * Returns the minimum of (the length of the zero-terminated string ucs1)
 * and n. A NULL ucs1 yields 0.
 *
 * The remaining budget is tested BEFORE the next char is read, so for
 * n >= 0 no char at index n or beyond is ever examined; the input only
 * needs to be terminated if it is shorter than n.
 * A negative n never reaches 0 in practice, so the full string length is
 * returned in that case.
 */
static int32_t u_astrnlen(const char *ucs1, int32_t n)
{
    int32_t len = 0;

    if (ucs1 != NULL)
    {
        while (n != 0 && ucs1[len] != 0)
        {
            --n;
            ++len;
        }
    }
    return len;
}
00263 
00264 UChar* u_uastrncpy(UChar *ucs1,
00265            const char *s2 ,
00266            int32_t n)
00267 {
00268   UChar *target = ucs1;
00269   UConverter *cnv = getDefaultConverter();
00270   if(cnv != NULL) {
00271     UErrorCode err = U_ZERO_ERROR;
00272     ucnv_reset(cnv);
00273     ucnv_toUnicode(cnv,
00274                    &target,
00275                    ucs1+n,
00276                    &s2,
00277                    s2+u_astrnlen(s2, n),
00278                    NULL,
00279                    TRUE,
00280                    &err);
00281     ucnv_reset(cnv); /* be good citizens */
00282     releaseDefaultConverter(cnv);
00283     if(U_FAILURE(err) && (err != U_BUFFER_OVERFLOW_ERROR) ) {
00284       *ucs1 = 0; /* failure */
00285     }
00286     if(target < (ucs1+n)) { /* U_BUFFER_OVERFLOW_ERROR isn't an err, just means no termination will happen. */
00287       *target = 0;  /* terminate */
00288     }
00289   } else {
00290     *ucs1 = 0;
00291   }
00292   return ucs1;
00293 }
00294 
00295 char* u_austrcpy(char *s1,
00296          const UChar *ucs2 )
00297 {
00298   UConverter *cnv = getDefaultConverter();
00299   if(cnv != NULL) {
00300     UErrorCode err = U_ZERO_ERROR;
00301     int32_t len = ucnv_fromUChars(cnv,
00302                   s1,
00303                   MAX_STRLEN,
00304                   ucs2,
00305                   -1,
00306                   &err);
00307     releaseDefaultConverter(cnv);
00308     s1[len] = 0;
00309   } else {
00310     *s1 = 0;
00311   }
00312   return s1;
00313 }
00314 
00315 /* mutexed access to a shared default converter ----------------------------- */
00316 
00317 /* this is the same implementation as in unistr.cpp */
00318 
00319 static UConverter*
00320 getDefaultConverter()
00321 {
00322   UConverter *converter = NULL;
00323 
00324   if(fgDefaultConverter != NULL) {
00325     umtx_lock(NULL);
00326 
00327     /* need to check to make sure it wasn't taken out from under us */
00328     if(fgDefaultConverter != NULL) {
00329       converter = fgDefaultConverter;
00330       fgDefaultConverter = NULL;
00331     }
00332     umtx_unlock(NULL);
00333   }
00334 
00335   /* if the cache was empty, create a converter */
00336   if(converter == NULL) {
00337     UErrorCode status = U_ZERO_ERROR;
00338     converter = ucnv_open(NULL, &status);
00339     if(U_FAILURE(status)) {
00340       return NULL;
00341     }
00342   }
00343 
00344   return converter;
00345 }
00346 
00347 static void
00348 releaseDefaultConverter(UConverter *converter)
00349 {
00350   if(fgDefaultConverter == NULL) {
00351     umtx_lock(NULL);
00352 
00353     if(fgDefaultConverter == NULL) {
00354       fgDefaultConverter = converter;
00355       converter = NULL;
00356     }
00357     umtx_unlock(NULL);
00358   }
00359 
00360   if(converter != NULL) {
00361     ucnv_close(converter);
00362   }
00363 }
00364 
00365 /* u_unescape & support fns ------------------------------------------------- */
00366 
00367 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
00368 static const UChar UNESCAPE_MAP[] = {
00369     /*"   0x22, 0x22 */
00370     /*'   0x27, 0x27 */
00371     /*?   0x3F, 0x3F */
00372     /*\   0x5C, 0x5C */
00373     /*a*/ 0x61, 0x07,
00374     /*b*/ 0x62, 0x08,
00375     /*f*/ 0x66, 0x0c,
00376     /*n*/ 0x6E, 0x0a,
00377     /*r*/ 0x72, 0x0d,
00378     /*t*/ 0x74, 0x09,
00379     /*v*/ 0x76, 0x0b
00380 };
00381 enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) };
00382 
00383 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
00384 static int8_t _digit8(UChar c) {
00385     if (c >= 0x0030 && c <= 0x0037) {
00386         return (int8_t)(c - 0x0030);
00387     }
00388     return -1;
00389 }
00390 
00391 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
00392 static int8_t _digit16(UChar c) {
00393     if (c >= 0x0030 && c <= 0x0039) {
00394         return (int8_t)(c - 0x0030);
00395     }
00396     if (c >= 0x0041 && c <= 0x0046) {
00397         return (int8_t)(c - (0x0041 - 10));
00398     }
00399     if (c >= 0x0061 && c <= 0x0066) {
00400         return (int8_t)(c - (0x0061 - 10));
00401     }
00402     return -1;
00403 }
00404 
00405 /* Parse a single escape sequence.  Although this method deals in
00406  * UChars, it does not use C++ or UnicodeString.  This allows it to
00407  * be used from C contexts. */
00408 U_CAPI UChar32 U_EXPORT2
00409 u_unescapeAt(UNESCAPE_CHAR_AT charAt,
00410              int32_t *offset,
00411              int32_t length,
00412              void *context) {
00413 
00414     int32_t start = *offset;
00415     UChar c;
00416     UChar32 result = 0;
00417     int8_t n = 0;
00418     int8_t minDig = 0;
00419     int8_t maxDig = 0;
00420     int8_t bitsPerDigit = 4; 
00421     int8_t dig;
00422     int32_t i;
00423 
00424     /* Check that offset is in range */
00425     if (*offset < 0 || *offset >= length) {
00426         goto err;
00427     }
00428 
00429     /* Fetch first UChar after '\\' */
00430     c = charAt((*offset)++, context);
00431 
00432     /* Convert hexadecimal and octal escapes */
00433     switch (c) {
00434     case 0x0075 /*'u'*/:
00435         minDig = maxDig = 4;
00436         break;
00437     case 0x0055 /*'U'*/:
00438         minDig = maxDig = 8;
00439         break;
00440     case 0x0078 /*'x'*/:
00441         minDig = 1;
00442         maxDig = 2;
00443         break;
00444     default:
00445         dig = _digit8(c);
00446         if (dig >= 0) {
00447             minDig = 1;
00448             maxDig = 3;
00449             n = 1; /* Already have first octal digit */
00450             bitsPerDigit = 3;
00451             result = dig;
00452         }
00453         break;
00454     }
00455     if (minDig != 0) {
00456         while (*offset < length && n < maxDig) {
00457             c = charAt(*offset, context);
00458             dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
00459             if (dig < 0) {
00460                 break;
00461             }
00462             result = (result << bitsPerDigit) | dig;
00463             ++(*offset);
00464             ++n;
00465         }
00466         if (n < minDig) {
00467             goto err;
00468         }
00469         return result;
00470     }
00471 
00472     /* Convert C-style escapes in table */
00473     for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
00474         if (c == UNESCAPE_MAP[i]) {
00475             return UNESCAPE_MAP[i+1];
00476         } else if (c < UNESCAPE_MAP[i]) {
00477             break;
00478         }
00479     }
00480 
00481     /* If no special forms are recognized, then consider
00482      * the backslash to generically escape the next character.
00483      * Deal with surrogate pairs. */
00484     if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
00485         UChar c2 = charAt(*offset, context);
00486         if (UTF_IS_SECOND_SURROGATE(c2)) {
00487             ++(*offset);
00488             return UTF16_GET_PAIR_VALUE(c, c2);
00489         }
00490     }
00491     return c;
00492 
00493  err:
00494     /* Invalid escape sequence */
00495     *offset = start; /* Reset to initial value */
00496     return (UChar32)0xFFFFFFFF;
00497 }
00498 
00499 /* u_unescapeAt() callback to return a UChar from a char* */
00500 static UChar _charPtr_charAt(int32_t offset, void *context) {
00501     UChar c16;
00502     /* It would be more efficient to access the invariant tables
00503      * directly but there is no API for that. */
00504     u_charsToUChars(((char*) context) + offset, &c16, 1);
00505     return c16;
00506 }
00507 
00508 /* Append an escape-free segment of the text; used by u_unescape() */
00509 static void _appendUChars(UChar *dest, int32_t destCapacity,
00510                           const char *src, int32_t srcLen) {
00511     if (destCapacity < 0) {
00512         destCapacity = 0;
00513     }
00514     if (srcLen > destCapacity) {
00515         srcLen = destCapacity;
00516     }
00517     u_charsToUChars(src, dest, srcLen);
00518 }
00519 
00520 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
00521 U_CAPI int32_t U_EXPORT2
00522 u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
00523     const char *segment = src;
00524     int32_t i = 0;
00525     char c;
00526 
00527     while ((c=*src) != 0) {
00528         /* '\\' intentionally written as compiler-specific
00529          * character constant to correspond to compiler-specific
00530          * char* constants. */
00531         if (c == '\\') {
00532             int32_t lenParsed = 0;
00533             UChar32 c32;
00534             if (src != segment) {
00535                 if (dest != NULL) {
00536                     _appendUChars(dest + i, destCapacity - i,
00537                                   segment, src - segment);
00538                 }
00539                 i += src - segment;
00540             }
00541             ++src; /* advance past '\\' */
00542             c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, uprv_strlen(src), (void*)src);
00543             if (lenParsed == 0) {
00544                 goto err;
00545             }
00546             src += lenParsed; /* advance past escape seq. */
00547             if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) {
00548                 UTF_APPEND_CHAR_UNSAFE(dest, i, c32);
00549             } else {
00550                 i += UTF_CHAR_LENGTH(c32);
00551             }
00552             segment = src;
00553         } else {
00554             ++src;
00555         }
00556     }
00557     if (src != segment) {
00558         if (dest != NULL) {
00559             _appendUChars(dest + i, destCapacity - i,
00560                           segment, src - segment);
00561         }
00562         i += src - segment;
00563     }
00564     if (dest != NULL && i < destCapacity) {
00565         dest[i] = 0;
00566     }
00567     return i + 1; /* add 1 for zero term */
00568 
00569  err:
00570     if (dest != NULL && destCapacity > 0) {
00571         *dest = 0;
00572     }
00573     return 0;
00574 }

Generated at Tue Dec 5 10:48:12 2000 for ICU by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000