Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

utf_impl.c

00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 1999-2000, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  utf_impl.c
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 1999sep13
00014 *   created by: Markus W. Scherer
00015 *
00016 *   This file provides implementation functions for macros in the utfXX.h
00017 *   that would otherwise be too long as macros.
00018 */
00019 
00020 /* set import/export definitions */
00021 #ifndef U_UTF8_IMPL
00022 #   define U_UTF8_IMPL
00023 #endif
00024 
00025 #include "unicode/umachine.h"
00026 #include "unicode/utf.h"
00027 
00028 /*
00029  * This table could be replaced on many machines by
00030  * a few lines of assembler code using an
00031  * "index of first 0-bit from msb" instruction and
00032  * one or two more integer instructions.
00033  *
00034  * For example, on an i386, do something like
00035  * - MOV AL, leadByte
00036  * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
00037  * - MOV AH, 0
00038  * - BSR BX, AX     (16-bit)
00039  * - MOV AX, 6      (result)
00040  * - JZ finish      (ZF==1 if leadByte==0xff)
00041  * - SUB AX, BX (result)
00042  * -finish:
00043  * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
00044  */
00045 U_EXPORT uint8_t U_EXPORT2
00046 utf8_countTrailBytes[256]={
00047     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00048     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00049     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00050     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00051 
00052     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00053     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00054     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00055     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00056 
00057     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00058     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00059     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00060     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00061 
00062     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00063     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00064 
00065     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00066     3, 3, 3, 3, 3, 3, 3, 3,
00067     4, 4, 4, 4,
00068     5, 5,
00069     0, 0    /* illegal bytes 0xfe and 0xff */
00070 };
00071 
00072 static UChar32
00073 utf8_minRegular[4]={ 0, 0x80, 0x800, 0x10000 };
00074 
00075 static UChar32
00076 utf8_errorValue[6]={
00077     UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
00078     0x3ffffff, 0x7fffffff
00079 };
00080 
00081 U_CAPI UChar32 U_EXPORT2
00082 utf8_nextCharSafeBody(const uint8_t *s, UTextOffset *pi, UTextOffset length, UChar32 c, UBool strict) {
00083     UTextOffset i=*pi;
00084     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
00085     if((i)+count<=(length)) {
00086         uint8_t trail, illegal=0;
00087 
00088         UTF8_MASK_LEAD_BYTE((c), count);
00089         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
00090         switch(count) {
00091         /* each branch falls through to the next one */
00092         case 5:
00093             trail=s[(i)++];
00094             (c)=((c)<<6)|(trail&0x3f);
00095             illegal|=(trail&0xc0)^0x80;
00096         case 4:
00097             trail=s[(i)++];
00098             (c)=((c)<<6)|(trail&0x3f);
00099             illegal|=(trail&0xc0)^0x80;
00100         case 3:
00101             trail=s[(i)++];
00102             (c)=((c)<<6)|(trail&0x3f);
00103             if(c<0x110) {
00104                 illegal|=(trail&0xc0)^0x80;
00105             } else {
00106                 /* code point>0x10ffff, outside Unicode */
00107                 i+=2;
00108                 illegal=1;
00109                 break;
00110             }
00111         case 2:
00112             trail=s[(i)++];
00113             (c)=((c)<<6)|(trail&0x3f);
00114             illegal|=(trail&0xc0)^0x80;
00115         case 1:
00116             trail=s[(i)++];
00117             (c)=((c)<<6)|(trail&0x3f);
00118             illegal|=(trail&0xc0)^0x80;
00119             break;
00120         case 0:
00121             illegal=1;
00122         /* no default branch to optimize switch()  - all values are covered */
00123             break;
00124         }
00125 
00126         /*
00127          * All the error handling should return a value
00128          * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
00129          */
00130 
00131         /* correct sequence - all trail bytes have (b7..b6)==(10)? */
00132         if(illegal) {
00133             /* error handling */
00134             uint8_t errorCount=count;
00135             /* don't go beyond this sequence */
00136             (i)-=count;
00137             while(count>0 && UTF8_IS_TRAIL(s[i])) {
00138                 ++(i);
00139                 --count;
00140             }
00141             c=utf8_errorValue[errorCount-count];
00142         } else if((strict) &&
00143                   (UTF_IS_SURROGATE(c) ||
00144                    count>=4 || (c)<utf8_minRegular[count] ||
00145                    ((c)&0xfffe)==0xfffe)
00146         ) {
00147             /* irregular sequence */
00148             c=utf8_errorValue[count];
00149         }
00150     } else /* too few bytes left */ {
00151         /* error handling */
00152         UTextOffset i0=i;
00153         /* don't just set (i)=(length) in case there is an illegal sequence */
00154         while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
00155             ++(i);
00156         }
00157         c=utf8_errorValue[i-i0];
00158     }
00159     *pi=i;
00160     return c;
00161 }
00162 
00163 U_CAPI UTextOffset U_EXPORT2
00164 utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c) {
00165     if((c)<=0x7ff) {
00166         if((i)+1<(length)) {
00167             (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
00168             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
00169             return i;
00170         }
00171     } else if((uint32_t)(c)<=0xffff) {
00172         if((i)+2<(length)) {
00173             (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
00174             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
00175             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
00176             return i;
00177         }
00178     } else if((uint32_t)(c)<=0x10ffff) {
00179         if((i)+3<(length)) {
00180             (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
00181             (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
00182             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
00183             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
00184             return i;
00185         }
00186     }
00187     /* c>0x10ffff or not enough space, write an error value */
00188     length-=i;
00189     if(length>0) {
00190         UTextOffset offset;
00191         if(length>3) {
00192             length=3;
00193         }
00194         s+=i;
00195         offset=0;
00196         c=utf8_errorValue[length-1];
00197         UTF8_APPEND_CHAR_SAFE(s, offset, length, c);
00198         i=i+offset;
00199      }
00200     return i;
00201 }
00202 
00203 U_CAPI UChar32 U_EXPORT2
00204 utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, UBool strict) {
00205     UTextOffset i=*pi;
00206     uint8_t b, count=1, shift=6;
00207 
00208     /* extract value bits from the last trail byte */
00209     c&=0x3f;
00210 
00211     for(;;) {
00212         if(i<=start) {
00213             /* no lead byte at all */
00214             c=UTF8_ERROR_VALUE_1;
00215             break;
00216         }
00217 
00218         /* read another previous byte */
00219         b=s[--i];
00220         if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
00221             if(b&0x40) {
00222                 /* lead byte, this will always end the loop */
00223                 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
00224 
00225                 if(count==shouldCount) {
00226                     /* set the new position */
00227                     *pi=i;
00228                     UTF8_MASK_LEAD_BYTE(b, count);
00229                     c|=(UChar32)b<<shift;
00230                     if( c>0x10ffff ||
00231                         (strict &&
00232                             (UTF_IS_SURROGATE(c) ||
00233                              count>=4 || c<utf8_minRegular[count] || (c&0xfffe)==0xfffe))
00234                     ) {
00235                         /* irregular sequence */
00236                         c=utf8_errorValue[count];
00237                     } else {
00238                         /* exit with correct c */
00239                     }
00240                 } else {
00241                     /* the lead byte does not match the number of trail bytes */
00242                     /* only set the position to the lead byte if it would
00243                        include the trail byte that we started with */
00244                     if(count<shouldCount) {
00245                         *pi=i;
00246                         c=utf8_errorValue[count];
00247                     } else {
00248                         c=UTF8_ERROR_VALUE_1;
00249                     }
00250                 }
00251                 break;
00252             } else if(count<5) {
00253                 /* trail byte */
00254                 c|=(UChar32)(b&0x3f)<<shift;
00255                 ++count;
00256                 shift+=6;
00257             } else {
00258                 /* more than 5 trail bytes is illegal */
00259                 c=UTF8_ERROR_VALUE_1;
00260                 break;
00261             }
00262         } else {
00263             /* single-byte character precedes trailing bytes */
00264             c=UTF8_ERROR_VALUE_1;
00265             break;
00266         }
00267     }
00268     return c;
00269 }
00270 
00271 U_CAPI UTextOffset U_EXPORT2
00272 utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i) {
00273     /* i had been decremented once before the function call */
00274     UTextOffset I=i, Z;
00275     uint8_t b;
00276 
00277     /* read at most the 6 bytes s[Z] to s[i], inclusively */
00278     if(I-5>start) {
00279         Z=I-5;
00280     } else {
00281         Z=start;
00282     }
00283 
00284     /* return I if the sequence starting there is long enough to include i */
00285     for(;;) {
00286         b=s[I];
00287         if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
00288             break;
00289         } else if(b>=0xc0) {
00290             if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
00291                 return I;
00292             } else {
00293                 break;
00294             }
00295         } else if(Z<I) {
00296             --I;
00297         } else {
00298             break;
00299         }
00300     }
00301 
00302     /* return i itself to be consistent with the FWD_1 macro */
00303     return i;
00304 }

Generated at Tue Dec 5 10:48:13 2000 for ICU by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000