#include <uniset.h>
Inheritance diagram for UnicodeSet:
Public Methods | |
UnicodeSet () | |
Constructs an empty set. More... | |
UnicodeSet (UChar32 start, UChar32 end) | |
Constructs a set containing the given range. More... | |
UnicodeSet (const UnicodeString& pattern, UErrorCode& status) | |
Constructs a set from the given pattern. More... | |
UnicodeSet (int8_t category, UErrorCode& status) | |
Constructs a set from the given Unicode character category. More... | |
UnicodeSet (const UnicodeSet& o) | |
Constructs a set that is identical to the given UnicodeSet. More... | |
virtual | ~UnicodeSet () |
Destructs the set. More... | |
UnicodeSet& | operator= (const UnicodeSet& o) |
Assigns this object to be a copy of another. More... | |
virtual UBool | operator== (const UnicodeSet& o) const |
Compares the specified object with this set for equality. More... | |
UBool | operator!= (const UnicodeSet& o) const |
Compares the specified object with this set for inequality. More... | |
virtual UnicodeFilter* | clone () const |
Returns a copy of this object. More... | |
virtual int32_t | hashCode (void) const |
Returns the hash code value for this set. More... | |
void | set (UChar32 start, UChar32 end) |
Make this object represent the range start - end . More... | |
virtual void | applyPattern (const UnicodeString& pattern, UErrorCode& status) |
Modifies this set to represent the set specified by the given pattern, optionally ignoring white space. More... | |
virtual UnicodeString& | toPattern (UnicodeString& result) const |
Returns a string representation of this set. More... | |
virtual int32_t | size (void) const |
Returns the number of elements in this set (its cardinality), n, where 0 <= n <= 65536 . More... | |
virtual UBool | isEmpty (void) const |
Returns true if this set contains no elements. More... | |
virtual UBool | contains (UChar32 start, UChar32 end) const |
Returns true if this set contains the specified range of chars. More... | |
virtual UBool | contains (UChar32 c) const |
Implement UnicodeFilter: Returns true if this set contains the specified char. More... | |
virtual UBool | contains (UChar c) const |
Implement UnicodeFilter: Returns true if this set contains the specified char. More... | |
virtual void | add (UChar32 start, UChar32 end) |
Adds the specified range to this set if it is not already present. More... | |
void | add (UChar32 c) |
Adds the specified character to this set if it is not already present. More... | |
virtual void | retain (UChar32 start, UChar32 end) |
Retain only the elements in this set that are contained in the specified range. More... | |
void | retain (UChar32 c) |
Retain the specified character from this set if it is present. | |
virtual void | remove (UChar32 start, UChar32 end) |
Removes the specified range from this set if it is present. More... | |
void | remove (UChar32 c) |
Removes the specified character from this set if it is present. More... | |
virtual void | complement (void) |
Inverts this set. More... | |
virtual void | complement (UChar32 start, UChar32 end) |
Complements the specified range in this set. More... | |
void | complement (UChar32 c) |
Complements the specified character in this set. More... | |
virtual UBool | containsAll (const UnicodeSet& c) const |
Returns true if the specified set is a subset of this set. More... | |
virtual void | addAll (const UnicodeSet& c) |
Adds all of the elements in the specified set to this set if they're not already present. More... | |
virtual void | retainAll (const UnicodeSet& c) |
Retains only the elements in this set that are contained in the specified set. More... | |
virtual void | removeAll (const UnicodeSet& c) |
Removes from this set all of its elements that are contained in the specified set. More... | |
virtual void | complementAll (const UnicodeSet& c) |
Complements in this set all elements contained in the specified set. More... | |
virtual void | clear (void) |
Removes all of the elements from this set. More... | |
virtual int32_t | getRangeCount (void) const |
Iteration method that returns the number of ranges contained in this set. More... | |
virtual UChar32 | getRangeStart (int32_t index) const |
Iteration method that returns the first character in the specified range of this set. More... | |
virtual UChar32 | getRangeEnd (int32_t index) const |
Iteration method that returns the last character in the specified range of this set. More... | |
virtual void | compact () |
Reallocate this object's internal structures to take up the least possible space, without changing this object's value. | |
Static Public Attributes | |
const UChar32 | MIN_VALUE |
Minimum value that can be stored in a UnicodeSet. More... | |
const UChar32 | MAX_VALUE |
Maximum value that can be stored in a UnicodeSet. More... | |
Private Methods | |
UnicodeSet (const UnicodeString& pattern, ParsePosition& pos, const SymbolTable& symbols, UErrorCode& status) | |
Constructs a set from the given pattern. More... | |
UBool | containsIndexValue (uint8_t v) const |
Returns true if this set contains any character whose low byte is the given value. More... | |
void | applyPattern (const UnicodeString& pattern, ParsePosition& pos, const SymbolTable* symbols, UErrorCode& status) |
Parses the given pattern, starting at the given position. More... | |
void | applyCategory (const UnicodeString& catName, UErrorCode& status) |
Sets this object to the given category, given its name. More... | |
void | ensureCapacity (int32_t newLen) |
void | ensureBufferCapacity (int32_t newLen) |
void | swapBuffers (void) |
void | exclusiveOr (const UChar32* other, int32_t otherLen, int8_t polarity) |
void | add (const UChar32* other, int32_t otherLen, int8_t polarity) |
void | retain (const UChar32* other, int32_t otherLen, int8_t polarity) |
Private Attributes | |
int32_t | len |
int32_t | capacity |
int32_t | bufferCapacity |
UChar32* | list |
UChar32* | buffer |
Static Private Methods | |
const UnicodeSet& | getCategorySet (int8_t cat) |
Returns a pairs string for the given category. More... | |
UChar | charAfter (const UnicodeString& str, int32_t i) |
Returns the character after the given position, or '\uFFFE' if there is none. | |
void | _toPat (UnicodeString& buf, UChar32 c) |
Static Private Attributes | |
const UChar32 | HIGH |
const UChar32 | LOW |
const int32_t | START_EXTRA |
const int32_t | GROW_EXTRA |
const UnicodeString | CATEGORY_NAMES |
UnicodeSet* | CATEGORY_CACHE |
A cache mapping character category integers, as returned by Unicode::getType(), to pairs strings. More... | |
const UnicodeString | CATEGORY_CLOSE |
Delimiter string used in patterns to close a category reference: ":]". More... | |
const UChar | SET_OPEN |
const UChar | SET_CLOSE |
const UChar | HYPHEN |
const UChar | COMPLEMENT |
const UChar | COLON |
const UChar | BACKSLASH |
const UChar | INTERSECTION |
const UChar | UPPER_U |
const UChar | HEX [16] |
Friends | |
class | TransliterationRuleParser |
class | TransliterationRule |
Objects of this class represent character classes used in regular expressions. Such classes specify a subset of the set of all Unicode characters, which in this implementation is the characters from U+0000 to U+FFFF, ignoring surrogates.
This class supports two APIs. The first is modeled after Java 2's java.util.Set interface, although this class does not implement that interface. All methods of Set are supported, with the modification that they take a character range or single character instead of an Object, and they take a UnicodeSet instead of a Collection.
The second API is the applyPattern()/toPattern() API from the Format-derived classes. Unlike the methods that add characters, add categories, and control the logic of the set, the method applyPattern() sets all attributes of a UnicodeSet at once, based on a string pattern.
In addition, the set complement operation is supported through the complement() method.
Pattern syntax
Patterns are accepted by the constructors and the applyPattern() methods and returned by the toPattern() method. These patterns follow a syntax similar to that employed by version 8 regular expression character classes:
pattern :=
('[' '^'? item* ']') | ('[:' '^'? category ':]')
item :=
char | (char '-' char) | pattern-expr
pattern-expr :=
pattern | pattern-expr pattern | pattern-expr op pattern
op :=
'&' | '-'
special :=
'[' | ']' | '-'
char :=
any character that is not special
| ('\u005C' any character)
| ('\u005Cu' hex hex hex hex)
hex :=
any character for which Character.digit(c, 16) returns a non-negative result
category :=
'M' | 'N' | 'Z' | 'C' | 'L' | 'P' | 'S' | 'Mn' | 'Mc' | 'Me' | 'Nd' | 'Nl' | 'No' | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cs' | 'Co' | 'Cn' | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo' | 'Pc' | 'Pd' | 'Ps' | 'Pe' | 'Po' | 'Sm' | 'Sc' | 'Sk' | 'So'
Legend:
a := b
a may be replaced by b
a?
zero or one instance of a
a*
zero or more instances of a
a | b
either a or b
'a'
the literal string between the quotes
Any character may be preceded by a backslash in order to remove any special meaning. White space characters, as defined by Character.isWhitespace(), are ignored, unless they are escaped.
Patterns specify individual characters, ranges of characters, and Unicode character categories. When elements are concatenated, they specify their union. To complement a set, place a '^' immediately after the opening '[' or '[:'. In any other location, '^' has no special meaning.
Ranges are indicated by placing a '-' between two characters, as in "a-z". This specifies the range of all characters from the left to the right, in Unicode order. If the left and right characters are the same, then the range consists of just that character. If the left character is greater than the right character it is a syntax error. If a '-' occurs as the first character after the opening '[' or '[^', or if it occurs as the last character before the closing ']', then it is taken as a literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same set of three characters, 'a', 'b', and '-'.
Sets may be intersected using the '&' operator, or the asymmetric set difference may be taken using the '-' operator. For example, "[[:L:]&[\u0000-\u0FFF]]" indicates the set of all Unicode letters with values less than 4096. Operators ('&' and '-') have equal precedence and bind left-to-right. Thus "[[:L:]-[a-z]-[\u0100-\u01FF]]" is equivalent to "[[[:L:]-[a-z]]-[\u0100-\u01FF]]". This only really matters for difference; intersection is commutative.
[a] | The set containing 'a' |
[a-z] | The set containing 'a' through 'z' and all letters in between, in Unicode order |
[^a-z] | The set containing all characters but 'a' through 'z', that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF |
[[pat1][pat2]] | The union of sets specified by pat1 and pat2 |
[[pat1]&[pat2]] | The intersection of sets specified by pat1 and pat2 |
[[pat1]-[pat2]] | The asymmetric difference of sets specified by pat1 and pat2 |
[:Lu:] | The set of characters belonging to the given Unicode category, as defined by Character.getType() ; in this case, Unicode uppercase letters |
[:L:] | The set of characters belonging to all Unicode categories starting with 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]] . |
Character categories.
Character categories are specified using the POSIX-like syntax '[:Lu:]'. The complement of a category is specified by inserting '^' after the opening '[:'. The following category names are recognized. Actual determination of category data uses Unicode::getType(), so it reflects the underlying data used by Unicode.
Normative
Mn = Mark, Non-Spacing
Mc = Mark, Spacing Combining
Me = Mark, Enclosing
Nd = Number, Decimal Digit
Nl = Number, Letter
No = Number, Other
Zs = Separator, Space
Zl = Separator, Line
Zp = Separator, Paragraph
Cc = Other, Control
Cf = Other, Format
Cs = Other, Surrogate
Co = Other, Private Use
Cn = Other, Not Assigned
Informative
Lu = Letter, Uppercase
Ll = Letter, Lowercase
Lt = Letter, Titlecase
Lm = Letter, Modifier
Lo = Letter, Other
Pc = Punctuation, Connector
Pd = Punctuation, Dash
Ps = Punctuation, Open
Pe = Punctuation, Close
Pi = Punctuation, Initial quote
Pf = Punctuation, Final quote
Po = Punctuation, Other
Sm = Symbol, Math
Sc = Symbol, Currency
Sk = Symbol, Modifier
So = Symbol, Other
Definition at line 249 of file uniset.h.
|
Constructs an empty set.
|
|
Constructs a set containing the given range.
If
|
|
Constructs a set from the given pattern. See the class description for the syntax of the pattern language.
|
|
Constructs a set from the given Unicode character category.
|
|
Constructs a set that is identical to the given UnicodeSet.
|
|
Destructs the set.
|
|
Constructs a set from the given pattern. See the class description for the syntax of the pattern language.
|
|
|
|
|
|
Adds the specified character to this set if it is not already present. If this set already contains the specified character, the call leaves this set unchanged.
|
|
Adds the specified range to this set if it is not already present.
If this set already contains the specified range, the call leaves this set unchanged. If
|
|
Adds all of the elements in the specified set to this set if they're not already present. This operation effectively modifies this set so that its value is the union of the two sets. The behavior of this operation is unspecified if the specified collection is modified while the operation is in progress.
|
|
Sets this object to the given category, given its name. The category name must be either a two-letter name, such as "Lu", or a one letter name, such as "L". One-letter names indicate the logical union of all two-letter names that start with that letter. Case is significant. If the name starts with the character '^' then the complement of the given character set is returned. Although individual categories such as "Lu" are cached, we do not currently cache single-letter categories such as "L" or complements such as "^Lu" or "^L". It would be easy to cache these as well in a hashtable should the need arise. |
|
Parses the given pattern, starting at the given position. The character at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. Parsing continues until the corresponding closing ']'. If a syntax error is encountered between the opening and closing brace, the parse fails. Upon return from a successful parse, the ParsePosition is updated to point to the character following the closing ']', and a StringBuffer containing a pairs list for the parsed pattern is returned. This method calls itself recursively to parse embedded subpatterns.
|
|
Modifies this set to represent the set specified by the given pattern, optionally ignoring white space. See the class description for the syntax of the pattern language.
|
|
Returns the character after the given position, or '\uFFFE' if there is none.
|
|
Removes all of the elements from this set. This set will be empty after this call returns.
|
|
Returns a copy of this object. All UnicodeFilter objects have to support cloning in order to allow classes using UnicodeFilters, such as Transliterator, to implement cloning.
Reimplemented from UnicodeFilter. |
|
Reallocate this object's internal structures to take up the least possible space, without changing this object's value.
|
|
Complements the specified character in this set. The character will be removed if it is in this set, or will be added if it is not in this set. |
|
Complements the specified range in this set.
Any character in the range will be removed if it is in this set, or will be added if it is not in this set. If
|
|
Inverts this set.
This operation modifies this set so that its value is its complement. This is equivalent to
|
|
Complements in this set all elements contained in the specified set. Any character in the other set will be removed if it is in this set, or will be added if it is not in this set.
|
|
Implement UnicodeFilter: Returns
Reimplemented from UnicodeFilter. |
|
Implement UnicodeFilter: Returns
Reimplemented from UnicodeFilter. |
|
Returns
|
|
Returns
|
|
Returns
This is used by |
|
|
|
|
|
|
|
Returns a pairs string for the given category. This string is cached and returned again if this method is called again with the same parameter. |
|
Iteration method that returns the number of ranges contained in this set.
|
|
Iteration method that returns the last character in the specified range of this set.
|
|
Iteration method that returns the first character in the specified range of this set.
|
|
Returns the hash code value for this set.
|
|
Returns
|
|
Compares the specified object with this set for inequality.
Returns
|
|
Assigns this object to be a copy of another.
|
|
Compares the specified object with this set for equality.
Returns
Referenced by operator!=(). |
|
Removes the specified character from this set if it is present. The set will not contain the specified character once the call returns.
|
|
Removes the specified range from this set if it is present.
The set will not contain the specified range once the call returns. If
|
|
Removes from this set all of its elements that are contained in the specified set. This operation effectively modifies this set so that its value is the asymmetric set difference of the two sets.
|
|
|
|
Retain the specified character from this set if it is present.
|
|
Retain only the elements in this set that are contained in the specified range.
If
|
|
Retains only the elements in this set that are contained in the specified set. In other words, removes from this set all of its elements that are not contained in the specified set. This operation effectively modifies this set so that its value is the intersection of the two sets.
|
|
Make this object represent the range
If
|
|
Returns the number of elements in this set (its cardinality), n, where
|
|
|
|
Returns a string representation of this set. If the result of calling this function is passed to a UnicodeSet constructor, it will produce another set that is equal to this one.
|
|
|
|
|
|
|
|
A cache mapping character category integers, as returned by Unicode::getType(), to pairs strings. Entries are initially zero length and are filled in on demand. |
|
Delimiter string used in patterns to close a category reference: ":]". Example: "[:Lu:]". |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Maximum value that can be stored in a UnicodeSet.
|
|
Minimum value that can be stored in a UnicodeSet.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|