/* ***************************************************************************************** * Copyright (C) 1996-1999, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ // FILE NAME : unicode.h // // CREATED // Wednesday, December 11, 1996 // // CREATED BY // Helena Shih // // CHANGES // Thursday, April 15, 1999 // Modified the definitions of all the functions // C++ Wrappers for Unicode // CHANGES BY // Madhu Katragadda // 5/20/99 Madhu Added the function getVersion() // 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit //******************************************************************************************** #ifndef UNICODE_H #define UNICODE_H #include "unicode/utypes.h" #include "unicode/uchar.h" class U_COMMON_API Unicode { public: /* * In C++, static const members actually take up memory and need to be accessed. * enum values are more like C #define's. * The following is a collection of constants, not an enumeration type. */ enum { MIN_VALUE=0, MAX_VALUE=0x10ffff, MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH, MIN_RADIX=2, MAX_RADIX=36 }; enum EUnicodeGeneralTypes { UNASSIGNED = 0, UPPERCASE_LETTER = 1, LOWERCASE_LETTER = 2, TITLECASE_LETTER = 3, MODIFIER_LETTER = 4, OTHER_LETTER = 5, NON_SPACING_MARK = 6, ENCLOSING_MARK = 7, COMBINING_SPACING_MARK = 8, DECIMAL_DIGIT_NUMBER = 9, LETTER_NUMBER = 10, OTHER_NUMBER = 11, SPACE_SEPARATOR = 12, LINE_SEPARATOR = 13, PARAGRAPH_SEPARATOR = 14, CONTROL = 15, FORMAT = 16, PRIVATE_USE = 17, SURROGATE = 18, DASH_PUNCTUATION = 19, START_PUNCTUATION = 20, END_PUNCTUATION = 21, CONNECTOR_PUNCTUATION = 22, OTHER_PUNCTUATION = 23, MATH_SYMBOL = 24, CURRENCY_SYMBOL = 25, MODIFIER_SYMBOL = 26, OTHER_SYMBOL = 27, INITIAL_PUNCTUATION = 28, FINAL_PUNCTUATION = 29, GENERAL_TYPES_COUNT = 30 }; enum EUnicodeScript { kBasicLatin, kLatin1Supplement, kLatinExtendedA, kLatinExtendedB, kIPAExtension, kSpacingModifier, kCombiningDiacritical, kGreek, kCyrillic, kArmenian, kHebrew, kArabic, kDevanagari, kBengali, kGurmukhi, kGujarati, kOriya, kTamil, kTelugu, kKannada, kMalayalam, kThai, kLao, kTibetan, kGeorgian, kHangulJamo, kLatinExtendedAdditional, kGreekExtended, kGeneralPunctuation, kSuperSubScript, kCurrencySymbolScript, kSymbolCombiningMark, kLetterlikeSymbol, kNumberForm, kArrow, kMathOperator, kMiscTechnical, kControlPicture, kOpticalCharacter, kEnclosedAlphanumeric, kBoxDrawing, kBlockElement, kGeometricShape, kMiscSymbol, kDingbat, kCJKSymbolPunctuation, kHiragana, kKatakana, kBopomofo, kHangulCompatibilityJamo, kKanbun, kEnclosedCJKLetterMonth, kCJKCompatibility, kCJKUnifiedIdeograph, kHangulSyllable, kHighSurrogate, kHighPrivateUseSurrogate, kLowSurrogate, kPrivateUse, kCJKCompatibilityIdeograph, kAlphabeticPresentation, kArabicPresentationA, kCombiningHalfMark, kCJKCompatibilityForm, kSmallFormVariant, kArabicPresentationB, kNoScript, kHalfwidthFullwidthForm, kScriptCount }; enum EDirectionProperty { LEFT_TO_RIGHT = 0, RIGHT_TO_LEFT = 1, EUROPEAN_NUMBER = 2, EUROPEAN_NUMBER_SEPARATOR = 3, EUROPEAN_NUMBER_TERMINATOR = 4, ARABIC_NUMBER = 5, COMMON_NUMBER_SEPARATOR = 6, BLOCK_SEPARATOR = 7, SEGMENT_SEPARATOR = 8, WHITE_SPACE_NEUTRAL = 9, OTHER_NEUTRAL = 10, LEFT_TO_RIGHT_EMBEDDING = 11, LEFT_TO_RIGHT_OVERRIDE = 12, RIGHT_TO_LEFT_ARABIC = 13, RIGHT_TO_LEFT_EMBEDDING = 14, RIGHT_TO_LEFT_OVERRIDE = 15, POP_DIRECTIONAL_FORMAT = 16, DIR_NON_SPACING_MARK = 17, BOUNDARY_NEUTRAL = 18 }; enum ECellWidths { ZERO_WIDTH = 0, HALF_WIDTH = 1, FULL_WIDTH = 2, NEUTRAL = 3 }; static inline UBool isSingle(UChar c); static inline UBool isLead(UChar c); static inline UBool isTrail(UChar c); static inline UBool isSurrogate(UChar32 c); static inline UBool isUnicodeChar(UChar32 c); static inline UBool isError(UChar32 c); static inline UBool isValid(UChar32 c); static inline UBool needMultipleUChar(UChar32 c); static inline int32_t charLength(UChar32 c); static inline int32_t arraySize(int32_t size); static inline UBool isLowerCase(UChar32 ch); static inline UBool isUpperCase(UChar32 ch); static inline UBool isTitleCase(UChar32 ch); static inline UBool isDigit(UChar32 ch); static inline UBool isDefined(UChar32 ch); static inline UBool isControl(UChar32 ch); static inline UBool isPrintable(UChar32 ch); static inline UBool isBaseForm(UChar32 ch); static inline UBool isLetter(UChar32 ch); static inline UBool isJavaIdentifierStart(UChar32 ch); static inline UBool isJavaIdentifierPart(UChar32 ch); static inline UBool isUnicodeIdentifierStart(UChar32 ch); static inline UBool isUnicodeIdentifierPart(UChar32 ch); static inline UBool isIdentifierIgnorable(UChar32 ch); static inline UChar32 toLowerCase(UChar32 ch); static inline UChar32 toUpperCase(UChar32 ch); static inline UChar32 toTitleCase(UChar32 ch); static inline UBool isSpaceChar(UChar32 ch); static inline UBool isWhitespace(UChar32 ch); static inline int8_t getType(UChar32 ch); static inline EDirectionProperty characterDirection(UChar32 ch); static inline UBool isMirrored(UChar32 c); static inline UChar32 charMirror(UChar32 c); static inline EUnicodeScript getScript(UChar32 ch); static inline uint16_t getCellWidth(UChar32 ch); static inline UTextOffset getCharName(uint32_t code, char *buffer, UTextOffset bufferLength, UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME); static inline int32_t digitValue(UChar32 ch); static inline int8_t digit(UChar32 ch, int8_t radix); static inline UChar32 forDigit(int32_t digit, int8_t radix); static void getUnicodeVersion(UVersionInfo info); protected: // These constructors, destructor, and assignment operator must // be protected (not private, as they semantically are) to make // various UNIX compilers happy. [LIU] // They should be private to prevent anyone from instantiating or // subclassing Unicode. Unicode(); Unicode(const Unicode &other); ~Unicode(); const Unicode &operator=(const Unicode &other); }; /* inline implementations --------------------------------------------------- */ inline UBool Unicode::isSingle(UChar c) { return UTF_IS_SINGLE(c); } inline UBool Unicode::isLead(UChar c) { return UTF_IS_LEAD(c); } inline UBool Unicode::isTrail(UChar c) { return UTF_IS_TRAIL(c); } inline UBool Unicode::isSurrogate(UChar32 c) { return UTF_IS_SURROGATE(c); } inline UBool Unicode::isUnicodeChar(UChar32 c) { return UTF_IS_UNICODE_CHAR(c); } inline UBool Unicode::isError(UChar32 c) { return UTF_IS_ERROR(c); } inline UBool Unicode::isValid(UChar32 c) { return UTF_IS_VALID(c); } inline UBool Unicode::needMultipleUChar(UChar32 c) { return UTF_NEED_MULTIPLE_UCHAR(c); } inline int32_t Unicode::charLength(UChar32 c) { return UTF_CHAR_LENGTH(c); } inline int32_t Unicode::arraySize(int32_t size) { return UTF_ARRAY_SIZE(size); } // Checks if ch is a lower case letter. inline UBool Unicode::isLowerCase(UChar32 ch) { return u_islower(ch); } // Checks if ch is a upper case letter. inline UBool Unicode::isUpperCase(UChar32 ch) { return u_isupper(ch); } // Checks if ch is a title case letter; usually upper case letters. inline UBool Unicode::isTitleCase(UChar32 ch) { return u_istitle(ch); } // Checks if ch is a decimal digit. inline UBool Unicode::isDigit(UChar32 ch) { return u_isdigit(ch); } // Checks if ch is a unicode character with assigned character type. inline UBool Unicode::isDefined(UChar32 ch) { return u_isdefined(ch); } // Checks if the Unicode character is a control character. inline UBool Unicode::isControl(UChar32 ch) { return u_iscntrl(ch); } // Checks if the Unicode character is printable. inline UBool Unicode::isPrintable(UChar32 ch) { return u_isprint(ch); } // Checks if the Unicode character is a base form character that can take a diacritic. inline UBool Unicode::isBaseForm(UChar32 ch) { return u_isbase(ch); } // Checks if the Unicode character is a letter. inline UBool Unicode::isLetter(UChar32 ch) { return u_isalpha(ch); } // Checks if the Unicode character can start a Java identifier. inline UBool Unicode::isJavaIdentifierStart(UChar32 ch) { return u_isJavaIDStart(ch); } // Checks if the Unicode character can be a Java identifier part other than starting the // identifier. inline UBool Unicode::isJavaIdentifierPart(UChar32 ch) { return u_isJavaIDPart(ch); } // Checks if the Unicode character can start a Unicode identifier. inline UBool Unicode::isUnicodeIdentifierStart(UChar32 ch) { return u_isIDStart(ch); } // Checks if the Unicode character can be a Unicode identifier part other than starting the // identifier. inline UBool Unicode::isUnicodeIdentifierPart(UChar32 ch) { return u_isIDPart(ch); } // Checks if the Unicode character can be ignorable in a Java or Unicode identifier. inline UBool Unicode::isIdentifierIgnorable(UChar32 ch) { return u_isIDIgnorable(ch); } // Transforms the Unicode character to its lower case equivalent. inline UChar32 Unicode::toLowerCase(UChar32 ch) { return u_tolower(ch); } // Transforms the Unicode character to its upper case equivalent. inline UChar32 Unicode::toUpperCase(UChar32 ch) { return u_toupper(ch); } // Transforms the Unicode character to its title case equivalent. inline UChar32 Unicode::toTitleCase(UChar32 ch) { return u_totitle(ch); } // Checks if the Unicode character is a space character. inline UBool Unicode::isSpaceChar(UChar32 ch) { return u_isspace(ch); } // Determines if the specified character is white space according to ICU. inline UBool Unicode::isWhitespace(UChar32 ch) { return u_isWhitespace(ch); } // Gets if the Unicode character's character property. inline int8_t Unicode::getType(UChar32 ch) { return u_charType(ch); } // Gets the character's linguistic directionality. inline Unicode::EDirectionProperty Unicode::characterDirection(UChar32 ch) { return (EDirectionProperty)u_charDirection(ch); } // Determines if the character has the "mirrored" property. inline UBool Unicode::isMirrored(UChar32 ch) { return u_isMirrored(ch); } // Maps the character to a "mirror-image" character, or to itself. inline UChar32 Unicode::charMirror(UChar32 ch) { return u_charMirror(ch); } // Get the script associated with the character inline Unicode::EUnicodeScript Unicode::getScript(UChar32 ch) { return (EUnicodeScript) u_charScript(ch); } // Gets table cell width of the Unicode character. inline uint16_t Unicode::getCellWidth(UChar32 ch) { return u_charCellWidth(ch); } inline UTextOffset Unicode::getCharName(uint32_t code, char *buffer, UTextOffset bufferLength, UCharNameChoice nameChoice) { UErrorCode errorCode=U_ZERO_ERROR; UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode); return U_SUCCESS(errorCode) ? length : 0; } inline int32_t Unicode::digitValue(UChar32 ch) { return u_charDigitValue(ch); } inline int8_t Unicode::digit(UChar32 ch, int8_t radix) { // ### TODO this should probably move to a C u_charDigitValueEx(ch, radix) and be called here int8_t value; if((uint8_t)(radix-MIN_RADIX)<=(MAX_RADIX-MIN_RADIX)) { value=(int8_t)u_charDigitValue(ch); if(value<0) { // ch is not a decimal digit, try latin letters if ((uint32_t)(ch-0x41)<26) { value=(int8_t)(ch-(0x41-10)); // A-Z, subtract A } else if ((uint32_t)(ch-0x61)<26) { value=(int8_t)(ch-(0x61-10)); // a-z, subtract a } else { return -1; // ch is not a digit character } } } else { return -1; // invalid radix } return (value<radix) ? value : -1; } inline UChar32 Unicode::forDigit(int32_t digit, int8_t radix) { // ### TODO this should probably move to a C u_forDigit(digit, radix) and be called here if((uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX) || (uint32_t)digit>=(uint32_t)radix) { return 0; } else if(digit<10) { return (UChar32)(0x30+digit); } else { return (UChar32)((0x61-10)+digit); } } inline void Unicode::getUnicodeVersion(UVersionInfo versionArray) { u_getUnicodeVersion(versionArray); } #endif