Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

ucmp8.c

00001 /*
00002 ********************************************************************
00003 * COPYRIGHT: 
00004 * Copyright (c) 1997-1999, International Business Machines Corporation and
00005 * others. All Rights Reserved.
00006 ********************************************************************
00007 */
00008 
00009 #ifndef _STDLIB_H
00010 #include <stdlib.h>
00011 #endif
00012 
00013 #ifndef _STDIO_H
00014 #include <stdio.h>
00015 #endif
00016 
00017 
00018 #include "ucmp8.h"
00019 #include "cmemory.h"
00020 
00021 static int32_t findOverlappingPosition(CompactByteArray* this_obj,
00022                        uint32_t start, 
00023                        const UChar *tempIndex, 
00024                        int32_t tempIndexCount, 
00025                        uint32_t cycle);
00026 
00027 /* internal constants*/
00028 
00029 
00030 int32_t ucmp8_getkUnicodeCount() { return UCMP8_kUnicodeCount;}
00031 int32_t ucmp8_getkBlockCount() { return UCMP8_kBlockCount;}
00032 void  ucmp8_initBogus(CompactByteArray* array)
00033 {
00034   CompactByteArray* this_obj = array;
00035 
00036   if (this_obj == NULL) return;
00037 
00038   this_obj->fStructSize = sizeof(CompactByteArray);
00039   this_obj->fArray = NULL; 
00040   this_obj->fIndex = NULL;
00041   this_obj->fCount = UCMP8_kUnicodeCount;
00042   this_obj->fCompact = FALSE; 
00043   this_obj->fBogus = TRUE;
00044   this_obj->fAlias = FALSE;
00045   this_obj->fIAmOwned = TRUE;
00046 }
00047 
00048 /* debug flags*/
00049 /*=======================================================*/
00050 void  ucmp8_init(CompactByteArray* array, int8_t defaultValue)
00051 {
00052 /* set up the index array and the data array.
00053  * the index array always points into particular parts of the data array
00054  * it is initially set up to point at regular block boundaries
00055  * The following example uses blocks of 4 for simplicity
00056  * Example: Expanded
00057  * INDEX# 0   1   2   3   4
00058  * INDEX  0   4   8   12  16 ...
00059  * ARRAY  abcdeababcedzyabcdea...
00060  *        |   |   |   |   |   |...
00061  * whenever you set an element in the array, it unpacks to this state
00062  * After compression, the index will point to various places in the data array
00063  * wherever there is a runs of the same elements as in the original
00064  * Example: Compressed
00065  * INDEX# 0   1   2   3   4
00066  * INDEX  0   4   1   8   2 ...
00067  * ARRAY  abcdeabazyabc...
00068  * If you look at the example, index# 2 in the expanded version points
00069  * to data position number 8, which has elements "bced". In the compressed
00070  * version, index# 2 points to data position 1, which also has "bced"
00071  */
00072   CompactByteArray* this_obj = array;
00073   int32_t i;
00074   
00075   if (this_obj == NULL) return;
00076 
00077   this_obj->fStructSize = sizeof(CompactByteArray);
00078   this_obj->fArray = NULL; 
00079   this_obj->fIndex = NULL;
00080   this_obj->fCount = UCMP8_kUnicodeCount;
00081   this_obj->fCompact = FALSE; 
00082   this_obj->fBogus = FALSE;
00083   this_obj->fAlias = FALSE;
00084   this_obj->fIAmOwned = TRUE;
00085 
00086 
00087   this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
00088   if (!this_obj->fArray) 
00089     {
00090       this_obj->fBogus = TRUE;
00091       return;
00092     }
00093   this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
00094   if (!this_obj->fIndex) 
00095     {
00096       uprv_free(this_obj->fArray);
00097       this_obj->fArray = NULL;
00098       this_obj->fBogus = TRUE;
00099       return;
00100     }
00101   for (i = 0; i < UCMP8_kUnicodeCount; ++i) 
00102     {
00103       this_obj->fArray[i] = defaultValue;
00104     }
00105   for (i = 0; i < UCMP8_kIndexCount; ++i) 
00106     {
00107       this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
00108     }
00109 }
00110 
00111 CompactByteArray* ucmp8_open(int8_t defaultValue)
00112 {
00113 /* set up the index array and the data array.
00114  * the index array always points into particular parts of the data array
00115  * it is initially set up to point at regular block boundaries
00116  * The following example uses blocks of 4 for simplicity
00117  * Example: Expanded
00118  * INDEX# 0   1   2   3   4
00119  * INDEX  0   4   8   12  16 ...
00120  * ARRAY  abcdeababcedzyabcdea...
00121  *        |   |   |   |   |   |...
00122  * whenever you set an element in the array, it unpacks to this state
00123  * After compression, the index will point to various places in the data array
00124  * wherever there is a runs of the same elements as in the original
00125  * Example: Compressed
00126  * INDEX# 0   1   2   3   4
00127  * INDEX  0   4   1   8   2 ...
00128  * ARRAY  abcdeabazyabc...
00129  * If you look at the example, index# 2 in the expanded version points
00130  * to data position number 8, which has elements "bced". In the compressed
00131  * version, index# 2 points to data position 1, which also has "bced"
00132  */
00133   CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
00134   int32_t i;
00135 
00136   if (this_obj == NULL) return NULL;
00137 
00138   this_obj->fStructSize = sizeof(CompactByteArray);
00139   this_obj->fArray = NULL; 
00140   this_obj->fIndex = NULL;
00141   this_obj->fCount = UCMP8_kUnicodeCount;
00142   this_obj->fCompact = FALSE; 
00143   this_obj->fBogus = FALSE;
00144   this_obj->fAlias = FALSE;
00145   this_obj->fIAmOwned = FALSE;
00146 
00147 
00148 
00149   this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
00150   if (!this_obj->fArray) 
00151     {
00152       this_obj->fBogus = TRUE;
00153       return NULL;
00154     }
00155   this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
00156   if (!this_obj->fIndex) 
00157     {
00158       uprv_free(this_obj->fArray);
00159       this_obj->fArray = NULL;
00160       this_obj->fBogus = TRUE;
00161       return NULL;
00162     }
00163   for (i = 0; i < UCMP8_kUnicodeCount; ++i) 
00164     {
00165       this_obj->fArray[i] = defaultValue;
00166     }
00167   for (i = 0; i < UCMP8_kIndexCount; ++i) 
00168     {
00169       this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
00170     }
00171 
00172   return this_obj;
00173 }
00174 
00175 CompactByteArray* ucmp8_openAdopt(uint16_t *indexArray,
00176                   int8_t *newValues,
00177                   int32_t count)
00178 {
00179   CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
00180 
00181     ucmp8_initAdopt(this_obj, indexArray, newValues, count);
00182     this_obj->fIAmOwned = FALSE;
00183     return this_obj;
00184 }
00185 
00186 CompactByteArray* ucmp8_openAlias(uint16_t *indexArray,
00187                   int8_t *newValues,
00188                   int32_t count)
00189 {
00190   CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
00191 
00192   ucmp8_initAlias(this_obj, indexArray, newValues, count);
00193   this_obj->fIAmOwned = FALSE;
00194   return this_obj;
00195 }
00196 
00197 /*=======================================================*/
00198 
00199 CompactByteArray* ucmp8_initAdopt(CompactByteArray *this_obj,
00200                   uint16_t *indexArray,
00201                   int8_t *newValues,
00202                   int32_t count)
00203 {
00204   if (this_obj) {
00205     this_obj->fCount = count;
00206     this_obj->fBogus = FALSE;
00207     this_obj->fStructSize = sizeof(CompactByteArray);
00208 
00209     this_obj->fArray = newValues;
00210     this_obj->fIndex = indexArray;
00211     this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
00212     this_obj->fAlias = FALSE;
00213     this_obj->fIAmOwned = TRUE;
00214   }
00215 
00216   return this_obj;
00217 }
00218 
00219 CompactByteArray* ucmp8_initAlias(CompactByteArray *this_obj,
00220                   uint16_t *indexArray,
00221                   int8_t *newValues,
00222                   int32_t count)
00223 {
00224   if (this_obj) {
00225     this_obj->fArray = NULL;
00226     this_obj->fIndex = NULL; 
00227     this_obj->fCount = count;
00228     this_obj->fBogus = FALSE;
00229     this_obj->fStructSize = sizeof(CompactByteArray);
00230 
00231     this_obj->fArray = newValues;
00232     this_obj->fIndex = indexArray;
00233     this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
00234     this_obj->fAlias = TRUE;
00235     this_obj->fIAmOwned = TRUE;
00236   }
00237 
00238   return this_obj;
00239 }
00240 
00241 /*=======================================================*/
00242 
00243 void ucmp8_close(CompactByteArray* this_obj) 
00244 {
00245   if(this_obj != NULL) {
00246     if(!this_obj->fAlias) {
00247       if(this_obj->fArray != NULL) {
00248         uprv_free(this_obj->fArray);
00249       }
00250       if(this_obj->fIndex != NULL) {
00251         uprv_free(this_obj->fIndex);
00252       }
00253     }
00254     if(!this_obj->fIAmOwned) /* Called if 'init' was called instead of 'open'. */
00255       {
00256         uprv_free(this_obj);
00257       }
00258   }
00259 }
00260 
00261 
00262 /*=======================================================*/
00263  
00264 void ucmp8_expand(CompactByteArray* this_obj) 
00265 {
00266   /* can optimize later.
00267    * if we have to expand, then walk through the blocks instead of using Get
00268    * this code unpacks the array by copying the blocks to the normalized position.
00269    * Example: Compressed
00270    * INDEX# 0   1   2   3   4
00271    * INDEX  0   4   1   8   2 ...
00272    * ARRAY  abcdeabazyabc...
00273    *  turns into
00274    * Example: Expanded
00275    * INDEX# 0   1   2   3   4
00276    * INDEX  0   4   8   12  16 ...
00277    * ARRAY  abcdeababcedzyabcdea...
00278    */
00279     int32_t i;
00280     if (this_obj->fCompact) 
00281     {
00282       int8_t* tempArray;
00283       tempArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
00284       if (!tempArray) 
00285       {
00286           this_obj->fBogus = TRUE;
00287           return;
00288       }
00289       for (i = 0; i < UCMP8_kUnicodeCount; ++i) 
00290       {
00291           tempArray[i] = ucmp8_get(this_obj,(UChar)i);  /* HSYS : How expand?*/
00292       }
00293       for (i = 0; i < UCMP8_kIndexCount; ++i) 
00294       {
00295           this_obj->fIndex[i] = (uint16_t)(i<< UCMP8_kBlockShift);
00296       }
00297       uprv_free(this_obj->fArray);
00298       this_obj->fArray = tempArray;
00299       this_obj->fCompact = FALSE;
00300       this_obj->fAlias = FALSE;
00301       this_obj->fIAmOwned = FALSE;
00302 
00303     }
00304 }
00305  
00306 
00307 /*=======================================================*/
00308 /* this_obj->fArray:    an array to be overlapped
00309  * start and count: specify the block to be overlapped
00310  * tempIndex:   the overlapped array (actually indices back into inputContents)
00311  * inputHash:   an index of hashes for tempIndex, where
00312  *      inputHash[i] = XOR of values from i-count+1 to i
00313  */
00314 static int32_t
00315 findOverlappingPosition(CompactByteArray* this_obj, 
00316             uint32_t start,
00317             const UChar* tempIndex,
00318             int32_t tempIndexCount,
00319             uint32_t cycle) 
00320 {
00321   /* this_obj is a utility routine for finding blocks that overlap.
00322    * IMPORTANT: the cycle number is very important. Small cycles take a lot
00323    * longer to work. In some cases, they may be able to get better compaction.
00324    */
00325     
00326   int32_t i;
00327   int32_t j;
00328   int32_t currentCount;
00329   
00330   for (i = 0; i < tempIndexCount; i += cycle) 
00331     {
00332       currentCount = UCMP8_kBlockCount;
00333       if (i + UCMP8_kBlockCount > tempIndexCount) 
00334     {
00335       currentCount = tempIndexCount - i;
00336         } 
00337       for (j = 0; j < currentCount; ++j) 
00338     {
00339       if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]]) break;
00340         }
00341       if (j == currentCount) break;
00342     }
00343   
00344   return i;
00345 }
00346 
00347 UBool
00348 ucmp8_isBogus(const CompactByteArray* this_obj)
00349 {
00350   return (UBool)(this_obj == NULL || this_obj->fBogus);
00351 }
00352 
00353 const int8_t*
00354 ucmp8_getArray(const CompactByteArray* this_obj)
00355 {
00356   return this_obj->fArray;
00357 }
00358 
00359 const uint16_t*
00360 ucmp8_getIndex(const CompactByteArray* this_obj)
00361 {
00362   return this_obj->fIndex;
00363 }
00364 
00365 int32_t 
00366 ucmp8_getCount(const CompactByteArray* this_obj)
00367 {
00368   return this_obj->fCount;
00369 }
00370 
00371 
00372 void 
00373 ucmp8_set(CompactByteArray* this_obj,
00374       UChar c,
00375       int8_t value)
00376 {
00377   if (this_obj->fCompact == TRUE) 
00378     {
00379       ucmp8_expand(this_obj);
00380       if (this_obj->fBogus) return;
00381     }
00382   this_obj->fArray[(int32_t)c] = value;
00383 }
00384 
00385 
00386 void 
00387 ucmp8_setRange(CompactByteArray* this_obj,
00388            UChar start,
00389            UChar end,
00390            int8_t value)
00391 {
00392   int32_t i;
00393   if (this_obj->fCompact == TRUE) 
00394     {
00395       ucmp8_expand(this_obj);
00396       if (this_obj->fBogus) return;
00397     }
00398   for (i = start; i <= end; ++i) 
00399     {
00400       this_obj->fArray[i] = value;
00401     }
00402 }
00403 
00404 
00405 /*=======================================================*/
00406  
00407 void 
00408 ucmp8_compact(CompactByteArray* this_obj,
00409           uint32_t cycle) 
00410 {
00411   if (!this_obj->fCompact) 
00412     {
00413       /* this_obj actually does the compaction.
00414        * it walks throught the contents of the expanded array, finding the
00415        * first block in the data that matches the contents of the current index.
00416        * As it works, it keeps an updated pointer to the last position,
00417        * so that it knows how big to make the final array
00418        * If the matching succeeds, then the index will point into the data
00419        * at some earlier position.
00420        * If the matching fails, then last position pointer will be bumped,
00421        * and the index will point to that last block of data.
00422        */
00423       UChar*    tempIndex;
00424       int32_t     tempIndexCount;
00425       int8_t*     tempArray;
00426       int32_t     iBlock, iIndex;
00427       
00428       /* fix cycle, must be 0 < cycle <= blockcount*/
00429       if (cycle < 0) cycle = 1;
00430       else if (cycle > (uint32_t)UCMP8_kBlockCount) cycle = UCMP8_kBlockCount;
00431       
00432       /* make temp storage, larger than we need*/
00433       tempIndex = (UChar*) uprv_malloc(sizeof(UChar)* UCMP8_kUnicodeCount);
00434       if (!tempIndex) 
00435     {
00436       this_obj->fBogus = TRUE;
00437       return;
00438         }               
00439       /* set up first block.*/
00440       tempIndexCount = UCMP8_kBlockCount;
00441       for (iIndex = 0; iIndex < UCMP8_kBlockCount; ++iIndex) 
00442     {
00443       tempIndex[iIndex] = (uint16_t)iIndex;
00444         }; /* endfor (iIndex = 0; .....)*/
00445       this_obj->fIndex[0] = 0;
00446       
00447       /* for each successive block, find out its first position in the compacted array*/
00448       for (iBlock = 1; iBlock < UCMP8_kIndexCount; ++iBlock) 
00449     {
00450       int32_t newCount, firstPosition, block;
00451       block = iBlock << UCMP8_kBlockShift;
00452       /*      if (debugSmall) if (block > debugSmallLimit) break;*/
00453       firstPosition = findOverlappingPosition(this_obj, 
00454                           block,
00455                           tempIndex,
00456                           tempIndexCount,
00457                           cycle);
00458       
00459       /* if not contained in the current list, copy the remainder
00460        * invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock
00461        * we do this_obj by XORing out cumulativeHash[iBlock-kBlockCount]
00462        */
00463       newCount = firstPosition + UCMP8_kBlockCount;
00464       if (newCount > tempIndexCount) 
00465         {
00466           for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex) 
00467         {
00468           tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block);
00469         } /* endfor (iIndex = tempIndexCount....)*/
00470                 tempIndexCount = newCount;
00471             } /* endif (newCount > tempIndexCount)*/
00472       this_obj->fIndex[iBlock] = (uint16_t)firstPosition;
00473         } /* endfor (iBlock = 1.....)*/
00474       
00475       /* now allocate and copy the items into the array*/
00476       tempArray = (int8_t*) uprv_malloc(tempIndexCount * sizeof(int8_t));
00477       if (!tempArray) 
00478     {
00479       this_obj->fBogus = TRUE;
00480       uprv_free(tempIndex);
00481       return;
00482         }
00483       for (iIndex = 0; iIndex < tempIndexCount; ++iIndex) 
00484     {
00485       tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]];
00486         }
00487       uprv_free(this_obj->fArray);
00488       this_obj->fArray = tempArray;
00489       this_obj->fCount = tempIndexCount;
00490       
00491       
00492       /* free up temp storage*/
00493       uprv_free(tempIndex);
00494       this_obj->fCompact = TRUE;
00495     } /* endif (!this_obj->fCompact)*/
00496 }
00497 
00498 U_CAPI  uint32_t U_EXPORT2 ucmp8_flattenMem (const CompactByteArray* array, UMemoryStream *MS)
00499 {
00500   int32_t size = 0;
00501 
00502   uprv_mstrm_write32(MS, ICU_UCMP8_VERSION);
00503   size += 4;
00504   
00505   uprv_mstrm_write32(MS, array->fCount);
00506   size += 4;
00507   
00508   uprv_mstrm_writeBlock(MS, array->fIndex, sizeof(array->fIndex[0])*UCMP8_kIndexCount);
00509   size += sizeof(array->fIndex[0])*UCMP8_kIndexCount;
00510   
00511   uprv_mstrm_writeBlock(MS, array->fArray, sizeof(array->fArray[0])*array->fCount);
00512   size += sizeof(array->fArray[0])*array->fCount;
00513   
00514   while(size%4) /* end padding */
00515   {
00516       uprv_mstrm_writePadding(MS, 1); /* Pad total so far to even size */
00517       size += 1;
00518   }
00519 
00520   return size;
00521 }
00522 
00523 /* We use sizeof(*array), etc so that this code can be as portable as 
00524    possible between the ucmpX_ family. 
00525 */
00526 
00527 U_CAPI  void U_EXPORT2 ucmp8_initFromData(CompactByteArray *this_obj, const uint8_t **source, UErrorCode *status)
00528 {
00529   uint32_t i;
00530   const uint8_t *oldSource = *source;
00531 
00532   if(U_FAILURE(*status))
00533     return;
00534 
00535  this_obj->fArray = NULL;
00536  this_obj->fIndex = NULL; 
00537  this_obj->fBogus = FALSE;
00538  this_obj->fStructSize = sizeof(CompactByteArray);
00539  this_obj->fCompact = TRUE;
00540  this_obj->fAlias = TRUE;
00541  this_obj->fIAmOwned = TRUE;
00542   
00543  i = * ((const uint32_t*) *source);
00544  (*source) += 4;
00545 
00546  if(i != ICU_UCMP8_VERSION)
00547  {
00548    *status = U_INVALID_FORMAT_ERROR;
00549    return;
00550  }
00551   
00552  this_obj->fCount = * ((const uint32_t*)*source);
00553  (*source) += 4;
00554 
00555  this_obj->fIndex = (uint16_t*) *source;
00556  (*source) += sizeof(this_obj->fIndex[0])*UCMP8_kIndexCount;
00557 
00558  this_obj->fArray = (int8_t*) *source;
00559  (*source) += sizeof(this_obj->fArray[0])*this_obj->fCount;
00560 
00561  /* eat up padding */
00562  while((*source-(oldSource))%4)
00563     (*source)++;
00564 }

Generated at Tue Dec 5 10:47:57 2000 for ICU by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000