Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

ubidiwrt.c

00001 /*  
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 2000, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  ubidiwrt.c
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 1999aug06
00014 *   created by: Markus W. Scherer
00015 *
00016 * This file contains implementations for BiDi functions that use
00017 * the core algorithm and core API to write reordered text.
00018 */
00019 
00020 /* set import/export definitions */
00021 #ifndef U_COMMON_IMPLEMENTATION
00022 #   define U_COMMON_IMPLEMENTATION
00023 #endif
00024 
00025 #include "cmemory.h"
00026 #include "unicode/utypes.h"
00027 #include "unicode/ustring.h"
00028 #include "unicode/uchar.h"
00029 #include "unicode/ubidi.h"
00030 #include "ubidiimp.h"
00031 
00032 /*
00033  * The function implementations in this file are designed
00034  * for UTF-16 and UTF-32, not for UTF-8.
00035  *
00036  * Assumptions that are not true for UTF-8:
00037  * - Any code point always needs the same number of code units
00038  *   ("minimum-length-problem" of UTF-8)
00039  * - The BiDi control characters need only one code unit each
00040  *
00041  * Further assumptions for all UTFs:
00042  * - u_charMirror(c) needs the same number of code units as c
00043  */
00044 #if UTF_SIZE==8
00045 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
00046 #endif
00047 
/*
 * Code points of the BiDi control characters that this file inserts into
 * or removes from reordered text. The two marks are adjacent, and the five
 * embedding/override controls form one contiguous range; the
 * IS_BIDI_CONTROL_CHAR() test below relies on exactly this layout.
 */
enum {
    LRM_CHAR=0x200e,    /* left-to-right mark */
    RLM_CHAR=0x200f,    /* right-to-left mark */
    LRE_CHAR=0x202a,    /* left-to-right embedding */
    RLE_CHAR=0x202b,    /* right-to-left embedding */
    PDF_CHAR=0x202c,    /* pop directional format */
    LRO_CHAR=0x202d,    /* left-to-right override */
    RLO_CHAR=0x202e     /* right-to-left override */
};

/*
 * Nonzero if c is one of the seven control characters listed above.
 * First test: clearing the lowest bit maps both LRM and RLM onto LRM_CHAR.
 * Second test: the unsigned subtraction wraps for values below LRE_CHAR,
 * so a single comparison covers the range LRE_CHAR..RLO_CHAR.
 * Note that c is evaluated twice; do not pass an expression with side effects.
 */
#define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffe)==LRM_CHAR || (uint32_t)((c)-LRE_CHAR)<5)

/*
 * Nonzero if the character type value (as passed in from u_charType())
 * is one of the three mark categories: non-spacing, combining spacing,
 * or enclosing. Implemented as a bit-set membership test.
 */
#define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK))
00061 
00062 /*
00063  * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
00064  * semantically write RTL runs in reverse and later reverse them again.
00065  * Instead, we actually write them in forward order to begin with.
00066  * However, if the RTL run was to be mirrored, we need to mirror here now
00067  * since the implicit second reversal must not do it.
00068  * It looks strange to do mirroring in LTR output, but it is only because
00069  * we are writing RTL output in reverse.
00070  */
00071 static UTextOffset
00072 doWriteForward(const UChar *src, int32_t srcLength,
00073                UChar *dest, int32_t destSize,
00074                uint16_t options,
00075                UErrorCode *pErrorCode) {
00076     /* optimize for several combinations of options */
00077     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
00078     case 0: {
00079         /* simply copy the LTR run to the destination */
00080         int32_t length=srcLength;
00081         if(destSize<length) {
00082             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00083             return 0;
00084         }
00085         do {
00086             *dest++=*src++;
00087         } while(--length>0);
00088         return srcLength;
00089     }
00090     case UBIDI_DO_MIRRORING: {
00091         /* do mirroring */
00092         UTextOffset i=0, j=0;
00093         UChar32 c;
00094 
00095         if(destSize<srcLength) {
00096             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00097             return 0;
00098         }
00099         do {
00100             UTF_NEXT_CHAR(src, i, srcLength, c);
00101             c=u_charMirror(c);
00102             UTF_APPEND_CHAR_UNSAFE(dest, j, c);
00103         } while(i<srcLength);
00104         return srcLength;
00105     }
00106     case UBIDI_REMOVE_BIDI_CONTROLS: {
00107         /* copy the LTR run and remove any BiDi control characters */
00108         int32_t remaining=destSize;
00109         UChar c;
00110         do {
00111             c=*src++;
00112             if(!IS_BIDI_CONTROL_CHAR(c)) {
00113                 if(--remaining<0) {
00114                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00115                     return 0;
00116                 }
00117                 *dest++=c;
00118             }
00119         } while(--srcLength>0);
00120         return destSize-remaining;
00121     }
00122     default: {
00123         /* remove BiDi control characters and do mirroring */
00124         int32_t remaining=destSize;
00125         UTextOffset i, j=0;
00126         UChar32 c;
00127         do {
00128             i=0;
00129             UTF_NEXT_CHAR(src, i, srcLength, c);
00130             src+=i;
00131             srcLength-=i;
00132             if(!IS_BIDI_CONTROL_CHAR(c)) {
00133                 remaining-=i;
00134                 if(remaining<0) {
00135                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00136                     return 0;
00137                 }
00138                 c=u_charMirror(c);
00139                 UTF_APPEND_CHAR_UNSAFE(dest, j, c);
00140             }
00141         } while(srcLength>0);
00142         return j;
00143     }
00144     } /* end of switch */
00145 }
00146 
00147 static UTextOffset
00148 doWriteReverse(const UChar *src, int32_t srcLength,
00149                UChar *dest, int32_t destSize,
00150                uint16_t options,
00151                UErrorCode *pErrorCode) {
00152     /*
00153      * RTL run -
00154      *
00155      * RTL runs need to be copied to the destination in reverse order
00156      * of code points, not code units, to keep Unicode characters intact.
00157      *
00158      * The general strategy for this is to read the source text
00159      * in backward order, collect all code units for a code point
00160      * (and optionally following combining characters, see below),
00161      * and copy all these code units in ascending order
00162      * to the destination for this run.
00163      *
00164      * Several options request whether combining characters
00165      * should be kept after their base characters,
00166      * whether BiDi control characters should be removed, and
00167      * whether characters should be replaced by their mirror-image
00168      * equivalent Unicode characters.
00169      */
00170     UTextOffset i, j;
00171     UChar32 c;
00172 
00173     /* optimize for several combinations of options */
00174     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
00175     case 0:
00176         /*
00177          * With none of the "complicated" options set, the destination
00178          * run will have the same length as the source run,
00179          * and there is no mirroring and no keeping combining characters
00180          * with their base characters.
00181          */
00182         if(destSize<srcLength) {
00183             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00184             return 0;
00185         }
00186         destSize=srcLength;
00187 
00188         /* preserve character integrity */
00189         do {
00190             /* i is always after the last code unit known to need to be kept in this segment */
00191             i=srcLength;
00192 
00193             /* collect code units for one base character */
00194             UTF_BACK_1(src, 0, srcLength);
00195 
00196             /* copy this base character */
00197             j=srcLength;
00198             do {
00199                 *dest++=src[j++];
00200             } while(j<i);
00201         } while(srcLength>0);
00202         break;
00203     case UBIDI_KEEP_BASE_COMBINING:
00204         /*
00205          * Here, too, the destination
00206          * run will have the same length as the source run,
00207          * and there is no mirroring.
00208          * We do need to keep combining characters with their base characters.
00209          */
00210         if(destSize<srcLength) {
00211             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00212             return 0;
00213         }
00214         destSize=srcLength;
00215 
00216         /* preserve character integrity */
00217         do {
00218             /* i is always after the last code unit known to need to be kept in this segment */
00219             i=srcLength;
00220 
00221             /* collect code units and modifier letters for one base character */
00222             do {
00223                 UTF_PREV_CHAR(src, 0, srcLength, c);
00224             } while(srcLength>0 && IS_COMBINING(u_charType(c)));
00225 
00226             /* copy this "user character" */
00227             j=srcLength;
00228             do {
00229                 *dest++=src[j++];
00230             } while(j<i);
00231         } while(srcLength>0);
00232         break;
00233     default:
00234         /*
00235          * With several "complicated" options set, this is the most
00236          * general and the slowest copying of an RTL run.
00237          * We will do mirroring, remove BiDi controls, and
00238          * keep combining characters with their base characters
00239          * as requested.
00240          */
00241         if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
00242             i=srcLength;
00243         } else {
00244             /* we need to find out the destination length of the run,
00245                which will not include the BiDi control characters */
00246             int32_t length=srcLength;
00247             UChar ch;
00248 
00249             i=0;
00250             do {
00251                 ch=*src++;
00252                 if(!IS_BIDI_CONTROL_CHAR(ch)) {
00253                     ++i;
00254                 }
00255             } while(--length>0);
00256             src-=srcLength;
00257         }
00258 
00259         if(destSize<i) {
00260             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00261             return 0;
00262         }
00263         destSize=i;
00264 
00265         /* preserve character integrity */
00266         do {
00267             /* i is always after the last code unit known to need to be kept in this segment */
00268             i=srcLength;
00269 
00270             /* collect code units for one base character */
00271             UTF_PREV_CHAR(src, 0, srcLength, c);
00272             if(options&UBIDI_KEEP_BASE_COMBINING) {
00273                 /* collect modifier letters for this base character */
00274                 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
00275                     UTF_PREV_CHAR(src, 0, srcLength, c);
00276                 }
00277             }
00278 
00279             if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
00280                 /* do not copy this BiDi control character */
00281                 continue;
00282             }
00283 
00284             /* copy this "user character" */
00285             j=srcLength;
00286             if(options&UBIDI_DO_MIRRORING) {
00287                 /* mirror only the base character */
00288                 UTextOffset k=0;
00289                 c=u_charMirror(c);
00290                 UTF_APPEND_CHAR_UNSAFE(dest, k, c);
00291                 dest+=k;
00292                 j+=k;
00293             }
00294             while(j<i) {
00295                 *dest++=src[j++];
00296             }
00297         } while(srcLength>0);
00298         break;
00299     } /* end of switch */
00300     return destSize;
00301 }
00302 
00303 U_CAPI UTextOffset U_EXPORT2
00304 ubidi_writeReverse(const UChar *src, int32_t srcLength,
00305                    UChar *dest, int32_t destSize,
00306                    uint16_t options,
00307                    UErrorCode *pErrorCode) {
00308     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
00309         return 0;
00310     }
00311 
00312     /* more error checking */
00313     if( src==NULL || srcLength<0 ||
00314         dest==NULL || destSize<=0)
00315     {
00316         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
00317         return 0;
00318     }
00319 
00320     /* do input and output overlap? */
00321     if((src>=dest && src<dest+destSize) ||
00322        (dest>=src && dest<src+srcLength))
00323     {
00324         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
00325         return 0;
00326     }
00327 
00328     if(srcLength>0) {
00329         return doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
00330     } else {
00331         /* nothing to do */
00332         return 0;
00333     }
00334 }
00335 
00336 #define MASK_R_AL (1UL<<U_RIGHT_TO_LEFT|1UL<<U_RIGHT_TO_LEFT_ARABIC)
00337 
00338 U_CAPI UTextOffset U_EXPORT2
00339 ubidi_writeReordered(UBiDi *pBiDi,
00340                      UChar *dest, int32_t destSize,
00341                      uint16_t options,
00342                      UErrorCode *pErrorCode) {
00343     const UChar *text;
00344     UChar *oldDest=dest;
00345     int32_t length;
00346     UTextOffset run, runCount, logicalStart, runLength;
00347 
00348     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
00349         return 0;
00350     }
00351 
00352     /* more error checking */
00353     if( pBiDi==NULL ||
00354         (text=ubidi_getText(pBiDi))==NULL || (length=ubidi_getLength(pBiDi))<0 ||
00355         dest==NULL || destSize<=0
00356     ) {
00357         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
00358         return 0;
00359     }
00360 
00361     /* do input and output overlap? */
00362     if((text>=dest && text<dest+destSize) ||
00363        (dest>=text && dest<text+length))
00364     {
00365         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
00366         return 0;
00367     }
00368 
00369     if(length==0) {
00370         /* nothing to do */
00371         return 0;
00372     }
00373 
00374     runCount=ubidi_countRuns(pBiDi, pErrorCode);
00375     if(U_FAILURE(*pErrorCode)) {
00376         return 0;
00377     }
00378 
00379     /*
00380      * If we do not perform the "inverse BiDi" algorithm, then we
00381      * don't need to insert any LRMs, and don't need to test for it.
00382      */
00383     if(!ubidi_isInverse(pBiDi)) {
00384         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
00385     }
00386 
00387     /*
00388      * Iterate through all visual runs and copy the run text segments to
00389      * the destination, according to the options.
00390      *
00391      * The tests for where to insert LRMs ignore the fact that there may be
00392      * BN codes or non-BMP code points at the beginning and end of a run;
00393      * they may insert LRMs unnecessarily but the tests are faster this way
00394      * (this would have to be improved for UTF-8).
00395      */
00396     if(!(options&UBIDI_OUTPUT_REVERSE)) {
00397         /* forward output */
00398         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
00399             /* do not insert BiDi controls */
00400             for(run=0; run<runCount; ++run) {
00401                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
00402                     runLength=doWriteForward(text+logicalStart, runLength,
00403                                              dest, destSize,
00404                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
00405                 } else {
00406                     runLength=doWriteReverse(text+logicalStart, runLength,
00407                                              dest, destSize,
00408                                              options, pErrorCode);
00409                 }
00410                 dest+=runLength;
00411                 destSize-=runLength;
00412                 if(U_FAILURE(*pErrorCode)) {
00413                     return 0;
00414                 }
00415             }
00416         } else {
00417             /* insert BiDi controls for "inverse BiDi" */
00418             const UChar *src;
00419             UBiDiDirection dir;
00420 
00421             for(run=0; run<runCount; ++run) {
00422                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
00423                 src=text+logicalStart;
00424 
00425                 if(UBIDI_LTR==dir) {
00426                     if(/*run>0 &&*/ u_charDirection(*src)!=U_LEFT_TO_RIGHT) {
00427                         if(destSize==0) {
00428                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00429                             return 0;
00430                         }
00431                         *dest++=LRM_CHAR;
00432                         --destSize;
00433                     }
00434 
00435                     runLength=doWriteForward(src, runLength,
00436                                              dest, destSize,
00437                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
00438                     if(U_FAILURE(*pErrorCode)) {
00439                         return 0;
00440                     }
00441                     dest+=runLength;
00442                     destSize-=runLength;
00443 
00444                     if(/*run<runCount-1 &&*/ u_charDirection(src[runLength-1])!=U_LEFT_TO_RIGHT) {
00445                         if(destSize==0) {
00446                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00447                             return 0;
00448                         }
00449                         *dest++=LRM_CHAR;
00450                         --destSize;
00451                     }
00452                 } else {
00453                     if(/*run>0 &&*/ !(MASK_R_AL&1UL<<u_charDirection(src[runLength-1]))) {
00454                         if(destSize==0) {
00455                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00456                             return 0;
00457                         }
00458                         *dest++=RLM_CHAR;
00459                         --destSize;
00460                     }
00461 
00462                     runLength=doWriteReverse(src, runLength,
00463                                              dest, destSize,
00464                                              options, pErrorCode);
00465                     if(U_FAILURE(*pErrorCode)) {
00466                         return 0;
00467                     }
00468                     dest+=runLength;
00469                     destSize-=runLength;
00470 
00471                     if(/*run<runCount-1 &&*/ !(MASK_R_AL&1UL<<u_charDirection(*src))) {
00472                         if(destSize==0) {
00473                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00474                             return 0;
00475                         }
00476                         *dest++=RLM_CHAR;
00477                         --destSize;
00478                     }
00479                 }
00480             }
00481         }
00482     } else {
00483         /* reverse output */
00484         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
00485             /* do not insert BiDi controls */
00486             for(run=runCount; --run>=0;) {
00487                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
00488                     runLength=doWriteReverse(text+logicalStart, runLength,
00489                                              dest, destSize,
00490                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
00491                 } else {
00492                     runLength=doWriteForward(text+logicalStart, runLength,
00493                                              dest, destSize,
00494                                              options, pErrorCode);
00495                 }
00496                 dest+=runLength;
00497                 destSize-=runLength;
00498                 if(U_FAILURE(*pErrorCode)) {
00499                     return 0;
00500                 }
00501             }
00502         } else {
00503             /* insert BiDi controls for "inverse BiDi" */
00504             const UChar *src;
00505             UBiDiDirection dir;
00506 
00507             for(run=runCount; --run>=0;) {
00508                 /* reverse output */
00509                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
00510                 src=text+logicalStart;
00511 
00512                 if(UBIDI_LTR==dir) {
00513                     if(/*run<runCount-1 &&*/ u_charDirection(src[runLength-1])!=U_LEFT_TO_RIGHT) {
00514                         if(destSize==0) {
00515                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00516                             return 0;
00517                         }
00518                         *dest++=LRM_CHAR;
00519                         --destSize;
00520                     }
00521 
00522                     runLength=doWriteReverse(src, runLength,
00523                                              dest, destSize,
00524                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
00525                     if(U_FAILURE(*pErrorCode)) {
00526                         return 0;
00527                     }
00528                     dest+=runLength;
00529                     destSize-=runLength;
00530 
00531                     if(/*run>0 &&*/ u_charDirection(*src)!=U_LEFT_TO_RIGHT) {
00532                         if(destSize==0) {
00533                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00534                             return 0;
00535                         }
00536                         *dest++=LRM_CHAR;
00537                         --destSize;
00538                     }
00539                 } else {
00540                     if(/*run<runCount-1 &&*/ !(MASK_R_AL&1UL<<u_charDirection(*src))) {
00541                         if(destSize==0) {
00542                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00543                             return 0;
00544                         }
00545                         *dest++=RLM_CHAR;
00546                         --destSize;
00547                     }
00548 
00549                     runLength=doWriteForward(src, runLength,
00550                                              dest, destSize,
00551                                              options, pErrorCode);
00552                     if(U_FAILURE(*pErrorCode)) {
00553                         return 0;
00554                     }
00555                     dest+=runLength;
00556                     destSize-=runLength;
00557 
00558                     if(/*run>0 &&*/ !(MASK_R_AL&1UL<<u_charDirection(src[runLength-1]))) {
00559                         if(destSize==0) {
00560                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
00561                             return 0;
00562                         }
00563                         *dest++=RLM_CHAR;
00564                         --destSize;
00565                     }
00566                 }
00567             }
00568         }
00569     }
00570 
00571     return dest-oldDest;
00572 }

Generated at Tue Dec 5 10:47:55 2000 for ICU by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000