00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #ifndef U_UTF8_IMPL
00022 # define U_UTF8_IMPL
00023 #endif
00024
00025 #include "unicode/umachine.h"
00026 #include "unicode/utf.h"
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045 U_EXPORT uint8_t U_EXPORT2
00046 utf8_countTrailBytes[256]={
00047 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00048 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00049 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00050 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00051
00052 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00053 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00054 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00055 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00056
00057 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00058 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00059 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00060 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00061
00062 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00063 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00064
00065 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00066 3, 3, 3, 3, 3, 3, 3, 3,
00067 4, 4, 4, 4,
00068 5, 5,
00069 0, 0
00070 };
00071
00072 static UChar32
00073 utf8_minRegular[4]={ 0, 0x80, 0x800, 0x10000 };
00074
00075 static UChar32
00076 utf8_errorValue[6]={
00077 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
00078 0x3ffffff, 0x7fffffff
00079 };
00080
00081 U_CAPI UChar32 U_EXPORT2
00082 utf8_nextCharSafeBody(const uint8_t *s, UTextOffset *pi, UTextOffset length, UChar32 c, UBool strict) {
00083 UTextOffset i=*pi;
00084 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
00085 if((i)+count<=(length)) {
00086 uint8_t trail, illegal=0;
00087
00088 UTF8_MASK_LEAD_BYTE((c), count);
00089
00090 switch(count) {
00091
00092 case 5:
00093 trail=s[(i)++];
00094 (c)=((c)<<6)|(trail&0x3f);
00095 illegal|=(trail&0xc0)^0x80;
00096 case 4:
00097 trail=s[(i)++];
00098 (c)=((c)<<6)|(trail&0x3f);
00099 illegal|=(trail&0xc0)^0x80;
00100 case 3:
00101 trail=s[(i)++];
00102 (c)=((c)<<6)|(trail&0x3f);
00103 if(c<0x110) {
00104 illegal|=(trail&0xc0)^0x80;
00105 } else {
00106
00107 i+=2;
00108 illegal=1;
00109 break;
00110 }
00111 case 2:
00112 trail=s[(i)++];
00113 (c)=((c)<<6)|(trail&0x3f);
00114 illegal|=(trail&0xc0)^0x80;
00115 case 1:
00116 trail=s[(i)++];
00117 (c)=((c)<<6)|(trail&0x3f);
00118 illegal|=(trail&0xc0)^0x80;
00119 break;
00120 case 0:
00121 illegal=1;
00122
00123 break;
00124 }
00125
00126
00127
00128
00129
00130
00131
00132 if(illegal) {
00133
00134 uint8_t errorCount=count;
00135
00136 (i)-=count;
00137 while(count>0 && UTF8_IS_TRAIL(s[i])) {
00138 ++(i);
00139 --count;
00140 }
00141 c=utf8_errorValue[errorCount-count];
00142 } else if((strict) &&
00143 (UTF_IS_SURROGATE(c) ||
00144 count>=4 || (c)<utf8_minRegular[count] ||
00145 ((c)&0xfffe)==0xfffe)
00146 ) {
00147
00148 c=utf8_errorValue[count];
00149 }
00150 } else {
00151
00152 UTextOffset i0=i;
00153
00154 while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
00155 ++(i);
00156 }
00157 c=utf8_errorValue[i-i0];
00158 }
00159 *pi=i;
00160 return c;
00161 }
00162
00163 U_CAPI UTextOffset U_EXPORT2
00164 utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c) {
00165 if((c)<=0x7ff) {
00166 if((i)+1<(length)) {
00167 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
00168 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
00169 return i;
00170 }
00171 } else if((uint32_t)(c)<=0xffff) {
00172 if((i)+2<(length)) {
00173 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
00174 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
00175 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
00176 return i;
00177 }
00178 } else if((uint32_t)(c)<=0x10ffff) {
00179 if((i)+3<(length)) {
00180 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
00181 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
00182 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
00183 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
00184 return i;
00185 }
00186 }
00187
00188 length-=i;
00189 if(length>0) {
00190 UTextOffset offset;
00191 if(length>3) {
00192 length=3;
00193 }
00194 s+=i;
00195 offset=0;
00196 c=utf8_errorValue[length-1];
00197 UTF8_APPEND_CHAR_SAFE(s, offset, length, c);
00198 i=i+offset;
00199 }
00200 return i;
00201 }
00202
00203 U_CAPI UChar32 U_EXPORT2
00204 utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, UBool strict) {
00205 UTextOffset i=*pi;
00206 uint8_t b, count=1, shift=6;
00207
00208
00209 c&=0x3f;
00210
00211 for(;;) {
00212 if(i<=start) {
00213
00214 c=UTF8_ERROR_VALUE_1;
00215 break;
00216 }
00217
00218
00219 b=s[--i];
00220 if((uint8_t)(b-0x80)<0x7e) {
00221 if(b&0x40) {
00222
00223 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);
00224
00225 if(count==shouldCount) {
00226
00227 *pi=i;
00228 UTF8_MASK_LEAD_BYTE(b, count);
00229 c|=(UChar32)b<<shift;
00230 if( c>0x10ffff ||
00231 (strict &&
00232 (UTF_IS_SURROGATE(c) ||
00233 count>=4 || c<utf8_minRegular[count] || (c&0xfffe)==0xfffe))
00234 ) {
00235
00236 c=utf8_errorValue[count];
00237 } else {
00238
00239 }
00240 } else {
00241
00242
00243
00244 if(count<shouldCount) {
00245 *pi=i;
00246 c=utf8_errorValue[count];
00247 } else {
00248 c=UTF8_ERROR_VALUE_1;
00249 }
00250 }
00251 break;
00252 } else if(count<5) {
00253
00254 c|=(UChar32)(b&0x3f)<<shift;
00255 ++count;
00256 shift+=6;
00257 } else {
00258
00259 c=UTF8_ERROR_VALUE_1;
00260 break;
00261 }
00262 } else {
00263
00264 c=UTF8_ERROR_VALUE_1;
00265 break;
00266 }
00267 }
00268 return c;
00269 }
00270
00271 U_CAPI UTextOffset U_EXPORT2
00272 utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i) {
00273
00274 UTextOffset I=i, Z;
00275 uint8_t b;
00276
00277
00278 if(I-5>start) {
00279 Z=I-5;
00280 } else {
00281 Z=start;
00282 }
00283
00284
00285 for(;;) {
00286 b=s[I];
00287 if((uint8_t)(b-0x80)>=0x7e) {
00288 break;
00289 } else if(b>=0xc0) {
00290 if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
00291 return I;
00292 } else {
00293 break;
00294 }
00295 } else if(Z<I) {
00296 --I;
00297 } else {
00298 break;
00299 }
00300 }
00301
00302
00303 return i;
00304 }