Main Page | Class Hierarchy | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals | Related Pages

unicode.h

00001 /*
00002 ******************************************************************************
00003 *   Copyright (C) 1996-2001, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 ******************************************************************************
00006 */
00007 //  FILE NAME : unicode.h
00008 //
00009 //  CREATED
00010 //      Wednesday, December 11, 1996
00011 //
00012 //  CREATED BY
00013 //      Helena Shih
00014 //
00015 //  CHANGES
00016 //      Thursday, April 15, 1999
00017 //      Modified the definitions of all the functions
00018 //      C++ Wrappers for Unicode
00019 //  CHANGES BY
00020 //      Madhu Katragadda
00021 //   5/20/99     Madhu      Added the function getVersion()
00022 //  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
00023 //*****************************************************************************
00024 
00025 
00026 
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029 
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032 
00033 U_NAMESPACE_BEGIN
// Unicode: a holder for constants and static member functions only; it has no
// data members. Each inline member defined after the class body forwards its
// arguments to a single UTF_* helper or u_*() function and returns the result
// (getCharName additionally maps a failed call to a return value of 0).
// The constructors, destructor and assignment operator are protected, so the
// class is not meant to be instantiated.
// NOTE(review): this is a numbered source listing; gaps in the embedded line
// numbers (e.g. 00033 -> 00055) suggest the original API doc comments were
// stripped when the listing was generated - consult the real header for them.
00055 class U_COMMON_API Unicode
00056 {
00057 public:
00058     /*
00059      * In C++, static const members actually take up memory and need to be accessed.
00060      * enum values are more like C #define's.
00061      * The following is a collection of constants, not an enumeration type.
00062      *
00063      * @deprecated See the Unicode class description.
00064      */
00065     enum {
          // Presumably the smallest code point value - confirm against the original docs.
00067         MIN_VALUE=0,
00068 
          // Presumably the largest code point value - confirm against the original docs.
00074         MAX_VALUE=0x10ffff,
00075 
          // Mirrors UTF_MAX_CHAR_LENGTH, which is defined outside this file.
00083         MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00084 
          // Presumably the smallest radix accepted by digit()/forDigit() - TODO confirm.
00095         MIN_RADIX=2,
00096 
          // Presumably the largest radix accepted by digit()/forDigit() - TODO confirm.
00107         MAX_RADIX=36
00108     };
00109 
          // General character type constants, numbered 0..29 with a trailing count.
          // Presumably these are the values getType() reports (it returns the
          // result of u_charType as an int8_t) - confirm against uchar.h.
00116     enum EUnicodeGeneralTypes
00117     {
00118         UNASSIGNED              = 0,
00119         UPPERCASE_LETTER        = 1,
00120         LOWERCASE_LETTER        = 2,
00121         TITLECASE_LETTER        = 3,
00122         MODIFIER_LETTER         = 4,
00123         OTHER_LETTER            = 5,
00124         NON_SPACING_MARK        = 6,
00125         ENCLOSING_MARK          = 7,
00126         COMBINING_SPACING_MARK  = 8,
00127         DECIMAL_DIGIT_NUMBER    = 9,
00128         LETTER_NUMBER           = 10,
00129         OTHER_NUMBER            = 11,
00130         SPACE_SEPARATOR         = 12,
00131         LINE_SEPARATOR          = 13,
00132         PARAGRAPH_SEPARATOR     = 14,
00133         CONTROL                 = 15,
00134         FORMAT                  = 16,
00135         PRIVATE_USE             = 17,
00136         SURROGATE               = 18,
00137         DASH_PUNCTUATION        = 19,
00138         START_PUNCTUATION       = 20,
00139         END_PUNCTUATION         = 21,
00140         CONNECTOR_PUNCTUATION   = 22,
00141         OTHER_PUNCTUATION       = 23,
00142         MATH_SYMBOL             = 24,
00143         CURRENCY_SYMBOL         = 25,
00144         MODIFIER_SYMBOL         = 26,
00145         OTHER_SYMBOL            = 27,
00146         INITIAL_PUNCTUATION     = 28,
00147         FINAL_PUNCTUATION       = 29,
00148         GENERAL_TYPES_COUNT     = 30
00149     };
00150 
00151     /* Please keep these values in sync with UCharScript */
          // The first enumerator is pinned to UBLOCK_BASIC_LATIN and the last to
          // UBLOCK_COUNT; everything in between relies on implicit +1 numbering,
          // so inserting or reordering entries silently shifts all later values.
          // getScript() produces this type by casting the result of u_charScript().
00157     enum EUnicodeScript 
00158     {
00159         kBasicLatin=UBLOCK_BASIC_LATIN,
00160         kLatin1Supplement,
00161         kLatinExtendedA,
00162         kLatinExtendedB,
00163         kIPAExtension,
00164         kSpacingModifier,
00165         kCombiningDiacritical,
00166         kGreek,
00167         kCyrillic,
00168         kArmenian,
00169         kHebrew,
00170         kArabic,
00171         kSyriac,
00172         kThaana,
00173         kDevanagari,
00174         kBengali,
00175         kGurmukhi,
00176         kGujarati,
00177         kOriya,
00178         kTamil,
00179         kTelugu,
00180         kKannada,
00181         kMalayalam,
00182         kSinhala,
00183         kThai,
00184         kLao,
00185         kTibetan,
00186         kMyanmar,
00187         kGeorgian,
00188         kHangulJamo,
00189         kEthiopic,
00190         kCherokee,
00191         kUnifiedCanadianAboriginalSyllabics,
              // NOTE(review): "kogham" breaks the k + UpperCamelCase pattern of its
              // neighbours (probably meant "kOgham"); it is a public name, so it is
              // left untouched here.
00192         kogham,
00193         kRunic,
00194         kKhmer,
00195         kMongolian,
00196         kLatinExtendedAdditional,
00197         kGreekExtended,
00198         kGeneralPunctuation,
00199         kSuperSubScript,
00200         kCurrencySymbolScript,
00201         kSymbolCombiningMark,
00202         kLetterlikeSymbol,
00203         kNumberForm,
00204         kArrow,
00205         kMathOperator,
00206         kMiscTechnical,
00207         kControlPicture,
00208         kOpticalCharacter,
00209         kEnclosedAlphanumeric,
00210         kBoxDrawing,
00211         kBlockElement,
00212         kGeometricShape,
00213         kMiscSymbol,
00214         kDingbat,
00215         kBraillePatterns,
00216         kCJKRadicalsSupplement,
00217         kKangxiRadicals,
00218         kIdeographicDescriptionCharacters,
00219         kCJKSymbolPunctuation,
00220         kHiragana,
00221         kKatakana,
00222         kBopomofo,
00223         kHangulCompatibilityJamo,
00224         kKanbun,
00225         kBopomofoExtended,
00226         kEnclosedCJKLetterMonth,
00227         kCJKCompatibility,
00228         kCJKUnifiedIdeographExtensionA,
00229         kCJKUnifiedIdeograph,
00230         kYiSyllables,
00231         kYiRadicals,
00232         kHangulSyllable,
00233         kHighSurrogate,
00234         kHighPrivateUseSurrogate,
00235         kLowSurrogate,
00236         kPrivateUse,
00237         kCJKCompatibilityIdeograph,
00238         kAlphabeticPresentation,
00239         kArabicPresentationA,
00240         kCombiningHalfMark,
00241         kCJKCompatibilityForm,
00242         kSmallFormVariant,
00243         kArabicPresentationB,
00244         kNoScript,
00245         kHalfwidthFullwidthForm,
00246         kScriptCount=UBLOCK_COUNT
00247     };
00248 
          // Directional property constants, numbered 0..18.
          // characterDirection() produces this type by casting the result of
          // u_charDirection(), so the numbering presumably has to match the
          // values that function returns - confirm against uchar.h.
00254     enum EDirectionProperty { 
00255         LEFT_TO_RIGHT               = 0, 
00256         RIGHT_TO_LEFT               = 1, 
00257         EUROPEAN_NUMBER             = 2,
00258         EUROPEAN_NUMBER_SEPARATOR   = 3,
00259         EUROPEAN_NUMBER_TERMINATOR  = 4,
00260         ARABIC_NUMBER               = 5,
00261         COMMON_NUMBER_SEPARATOR     = 6,
00262         BLOCK_SEPARATOR             = 7,
00263         SEGMENT_SEPARATOR           = 8,
00264         WHITE_SPACE_NEUTRAL         = 9, 
00265         OTHER_NEUTRAL               = 10, 
00266         LEFT_TO_RIGHT_EMBEDDING     = 11,
00267         LEFT_TO_RIGHT_OVERRIDE      = 12,
00268         RIGHT_TO_LEFT_ARABIC        = 13,
00269         RIGHT_TO_LEFT_EMBEDDING     = 14,
00270         RIGHT_TO_LEFT_OVERRIDE      = 15,
00271         POP_DIRECTIONAL_FORMAT      = 16,
00272         DIR_NON_SPACING_MARK        = 17,
00273         BOUNDARY_NEUTRAL            = 18
00274     };
00275 
          // Cell width constants. Presumably the values getCellWidth() reports
          // (it returns the result of u_charCellWidth as a uint16_t) - TODO confirm.
00282     enum ECellWidths
00283     {
00284         ZERO_WIDTH              = 0,
00285         HALF_WIDTH              = 1,
00286         FULL_WIDTH              = 2,
00287         NEUTRAL                 = 3
00288     };
00289 
          // --- Code unit / code point tests -------------------------------------
          // isSingle .. arraySize are implemented after the class body by
          // forwarding to the identically purposed UTF_* helpers
          // (UTF_IS_SINGLE, UTF_IS_LEAD, ... UTF_CHAR_LENGTH, UTF_ARRAY_SIZE).
00301     static inline UBool isSingle(UChar c);
00302 
00312     static inline UBool isLead(UChar c);
00313 
00323     static inline UBool isTrail(UChar c);
00324 
00336     static inline UBool isSurrogate(UChar32 c);
00337 
00351     static inline UBool isUnicodeChar(UChar32 c);
00352 
00365     static inline UBool isError(UChar32 c);
00366 
00377     static inline UBool isValid(UChar32 c);
00378 
00391     static inline UBool needMultipleUChar(UChar32 c);
00392 
00402     static inline int32_t charLength(UChar32 c);
00403 
00418     static inline int32_t arraySize(int32_t size);
00419 
          // --- Character property predicates --------------------------------------
          // isLowerCase .. isIdentifierIgnorable each forward to one u_is*()
          // function (u_islower, u_isupper, u_istitle, u_isdigit, u_isdefined,
          // u_iscntrl, u_isprint, u_isbase, u_isalpha, u_isJavaIDStart,
          // u_isJavaIDPart, u_isIDStart, u_isIDPart, u_isIDIgnorable).
00433     static inline UBool isLowerCase(UChar32 ch);
00434 
00447     static inline UBool isUpperCase(UChar32 ch);
00448 
00461     static inline UBool isTitleCase(UChar32 ch);
00462 
00475     static inline UBool isDigit(UChar32 ch);
00476 
00493     static inline UBool isDefined(UChar32 ch);
00494 
00506     static inline UBool isControl(UChar32 ch);
00507 
00519     static inline UBool isPrintable(UChar32 ch);
00520 
00533      static inline UBool isBaseForm(UChar32 ch);
00534 
00551     static inline UBool isLetter(UChar32 ch);
00552 
00574     static inline UBool isJavaIdentifierStart(UChar32 ch);
00575 
00605     static inline UBool isJavaIdentifierPart(UChar32 ch);
00606 
00622     static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00623 
00651     static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00652 
00679     static inline UBool isIdentifierIgnorable(UChar32 ch);
00680 
          // --- Case mapping -------------------------------------------------------
          // Forward to u_tolower, u_toupper, u_totitle and u_foldCase respectively.
00706    static inline UChar32 toLowerCase(UChar32 ch); 
00707 
00730     static inline UChar32 toUpperCase(UChar32 ch);
00731 
00750     static inline UChar32 toTitleCase(UChar32 ch);
00751 
00766     static inline UChar32
00767     foldCase(UChar32 c, uint32_t options);
00768 
          // --- Space tests --------------------------------------------------------
          // isSpaceChar forwards to u_isspace, isWhitespace to u_isWhitespace; any
          // difference between the two is defined by those functions.
00778     static inline UBool isSpaceChar(UChar32 ch);
00779 
00809     static inline UBool isWhitespace(UChar32 ch);
00810 
          // --- Property getters ---------------------------------------------------
          // Forward to u_charType, u_getCombiningClass, u_charDirection,
          // u_isMirrored, u_charMirror, u_charScript and u_charCellWidth.
00846     static inline int8_t getType(UChar32 ch);
00847 
00856     static inline uint8_t getCombiningClass(UChar32 c);
00857 
00868     static inline EDirectionProperty characterDirection(UChar32 ch);
00869 
00881     static inline UBool isMirrored(UChar32 c);
00882 
00900     static inline UChar32 charMirror(UChar32 c);
00901 
00907     static inline EUnicodeScript getScript(UChar32 ch);
00908 
00961     static inline uint16_t getCellWidth(UChar32 ch);
00962 
          // Writes the name of `code` into `buffer` via u_charName and returns the
          // length that function reports, or 0 if the call did not succeed.
          // nameChoice defaults to U_UNICODE_CHAR_NAME.
00991     static inline int32_t
00992     getCharName(uint32_t code,
00993                 char *buffer, int32_t bufferLength,
00994                 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
00995 
          // --- Digit helpers ------------------------------------------------------
          // Forward to u_charDigitValue, u_digit and u_forDigit respectively.
01007     static inline int32_t digitValue(UChar32 ch);     
01008 
01047     static inline int32_t digit(UChar32 ch, int8_t radix);
01048 
01077     static inline UChar32 forDigit(int32_t digit, int8_t radix);
01078 
          // Fills `info` by calling u_getUnicodeVersion.
          // Unlike its siblings this declaration omits `inline`, although the
          // definition after the class body is marked inline.
01085     static void getUnicodeVersion(UVersionInfo info);
01086 
01087 protected:
01088     // These constructors, destructor, and assignment operator must
01089     // be protected (not private, as they semantically are) to make
01090     // various UNIX compilers happy. [LIU]
01091     // They should be private to prevent anyone from instantiating or
01092     // subclassing Unicode.
01093     Unicode();
01094     Unicode(const Unicode &other);
01095     ~Unicode();
01096     const Unicode &operator=(const Unicode &other);
01097 };
01098 
01099 /* inline implementations --------------------------------------------------- */
01100 
// Thin wrapper: evaluates UTF_IS_SINGLE on the code unit c and returns the result.
// Presumably true when c stands for a whole code point by itself - confirm
// against the UTF_IS_SINGLE definition, which lives outside this file.
01101 inline UBool
01102 Unicode::isSingle(UChar c) {
01103     return UTF_IS_SINGLE(c);
01104 }
01105 
// Thin wrapper: evaluates UTF_IS_LEAD on the code unit c and returns the result.
// Presumably true when c is the leading unit of a multi-unit sequence - confirm
// against the UTF_IS_LEAD definition, which lives outside this file.
01106 inline UBool
01107 Unicode::isLead(UChar c) {
01108     return UTF_IS_LEAD(c);
01109 }
01110 
// Thin wrapper: evaluates UTF_IS_TRAIL on the code unit c and returns the result.
// Presumably true when c is a trailing unit of a multi-unit sequence - confirm
// against the UTF_IS_TRAIL definition, which lives outside this file.
01111 inline UBool
01112 Unicode::isTrail(UChar c) {
01113     return UTF_IS_TRAIL(c);
01114 }
01115 
// Thin wrapper: evaluates UTF_IS_SURROGATE on the code point c and returns the result.
// Note that, unlike isLead/isTrail, this takes a UChar32 rather than a UChar.
01116 inline UBool
01117 Unicode::isSurrogate(UChar32 c) {
01118     return UTF_IS_SURROGATE(c);
01119 }
01120 
// Thin wrapper: evaluates UTF_IS_UNICODE_CHAR on the code point c and returns the result.
// The exact set of values accepted is defined by UTF_IS_UNICODE_CHAR, outside this file.
01121 inline UBool
01122 Unicode::isUnicodeChar(UChar32 c) {
01123     return UTF_IS_UNICODE_CHAR(c);
01124 }
01125 
// Thin wrapper: evaluates UTF_IS_ERROR on c and returns the result.
// Presumably detects the error marker values produced by the UTF decoding
// helpers - confirm against the UTF_IS_ERROR definition.
01126 inline UBool
01127 Unicode::isError(UChar32 c) {
01128     return UTF_IS_ERROR(c);
01129 }
01130 
// Thin wrapper: evaluates UTF_IS_VALID on the code point c and returns the result.
// What counts as "valid" is defined by UTF_IS_VALID, outside this file.
01131 inline UBool
01132 Unicode::isValid(UChar32 c) {
01133     return UTF_IS_VALID(c);
01134 }
01135 
// Thin wrapper: evaluates UTF_NEED_MULTIPLE_UCHAR on the code point c and returns the result.
// Presumably true when c cannot be stored in a single UChar - confirm against
// the UTF_NEED_MULTIPLE_UCHAR definition.
01136 inline UBool
01137 Unicode::needMultipleUChar(UChar32 c) {
01138     return UTF_NEED_MULTIPLE_UCHAR(c);
01139 }
01140 
// Thin wrapper: returns UTF_CHAR_LENGTH(c) as an int32_t.
// Presumably the number of UChar units needed to store c - confirm against
// the UTF_CHAR_LENGTH definition.
01141 inline int32_t
01142 Unicode::charLength(UChar32 c) {
01143     return UTF_CHAR_LENGTH(c);
01144 }
01145 
// Thin wrapper: returns UTF_ARRAY_SIZE(size) as an int32_t.
// Presumably converts a count of code points into the array length needed to
// hold them - confirm against the UTF_ARRAY_SIZE definition.
01146 inline int32_t
01147 Unicode::arraySize(int32_t size) {
01148     return UTF_ARRAY_SIZE(size);
01149 }
01150 
01151 // Checks whether ch is a lower case letter; the decision is delegated to u_islower().
01152 inline UBool
01153 Unicode::isLowerCase(UChar32 ch) {
01154     return u_islower(ch);
01155 }
01156 
01157 // Checks whether ch is an upper case letter; the decision is delegated to u_isupper().
01158 inline UBool
01159 Unicode::isUpperCase(UChar32 ch) {
01160     return u_isupper(ch);
01161 }
01162 
01163 // Checks whether ch is a title case letter; the decision is delegated to u_istitle().
01164 inline UBool
01165 Unicode::isTitleCase(UChar32 ch) {
01166     return u_istitle(ch);
01167 }
01168 
01169 // Checks whether ch is a decimal digit; the decision is delegated to u_isdigit().
01170 inline UBool
01171 Unicode::isDigit(UChar32 ch) {
01172     return u_isdigit(ch);
01173 }
01174 
01175 // Checks whether ch is a Unicode character with an assigned character type; delegated to u_isdefined().
01176 inline UBool
01177 Unicode::isDefined(UChar32 ch) {
01178     return u_isdefined(ch);
01179 }
01180 
01181 // Checks whether the Unicode character is a control character; delegated to u_iscntrl().
01182 inline UBool
01183 Unicode::isControl(UChar32 ch) {
01184     return u_iscntrl(ch);
01185 }
01186 
01187 // Checks whether the Unicode character is printable; delegated to u_isprint().
01188 inline UBool
01189 Unicode::isPrintable(UChar32 ch) {
01190     return u_isprint(ch);
01191 }
01192 
01193 // Checks whether the Unicode character is a base form character that can take a diacritic; delegated to u_isbase().
01194 inline UBool
01195 Unicode::isBaseForm(UChar32 ch) {
01196     return u_isbase(ch);
01197 }
01198 
01199 // Checks whether the Unicode character is a letter; delegated to u_isalpha().
01200 inline UBool
01201 Unicode::isLetter(UChar32 ch) {
01202     return u_isalpha(ch);
01203 }
01204 
01205 // Checks whether the Unicode character can start a Java identifier; delegated to u_isJavaIDStart().
01206 inline UBool
01207 Unicode::isJavaIdentifierStart(UChar32 ch) {
01208     return u_isJavaIDStart(ch);
01209 }
01210 
01211 // Checks whether the Unicode character can appear in a Java identifier at a position
01212 // other than the first; delegated to u_isJavaIDPart().
01213 inline UBool
01214 Unicode::isJavaIdentifierPart(UChar32 ch) {
01215     return u_isJavaIDPart(ch);
01216 }
01217 
01218 // Checks whether the Unicode character can start a Unicode identifier; delegated to u_isIDStart().
01219 inline UBool
01220 Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01221     return u_isIDStart(ch);
01222 }
01223 
01224 // Checks whether the Unicode character can appear in a Unicode identifier at a position
01225 // other than the first; delegated to u_isIDPart().
01226 inline UBool
01227 Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01228     return u_isIDPart(ch);
01229 }
01230 
01231 // Checks whether the Unicode character is ignorable in a Java or Unicode identifier; delegated to u_isIDIgnorable().
01232 inline UBool
01233 Unicode::isIdentifierIgnorable(UChar32 ch) {
01234     return u_isIDIgnorable(ch);
01235 }
01236 
01237 // Maps the Unicode character to its lower case equivalent by returning u_tolower(ch).
01238 inline UChar32       
01239 Unicode::toLowerCase(UChar32 ch) {
01240     return u_tolower(ch);
01241 }
01242     
01243 // Maps the Unicode character to its upper case equivalent by returning u_toupper(ch).
01244 inline UChar32
01245 Unicode::toUpperCase(UChar32 ch) {
01246     return u_toupper(ch);
01247 }
01248 
01249 // Maps the Unicode character to its title case equivalent by returning u_totitle(ch).
01250 inline UChar32
01251 Unicode::toTitleCase(UChar32 ch) {
01252     return u_totitle(ch);
01253 }
01254 
01255 // Maps the Unicode character to its case folded equivalent by returning u_foldCase(ch, options).
// `options` is passed through untouched; its accepted values are defined by u_foldCase().
// The in-class declaration names the first parameter `c` while this definition
// uses `ch`; the mismatch is harmless but worth knowing when grepping.
01256 inline UChar32       
01257 Unicode::foldCase(UChar32 ch, uint32_t options) {
01258     return u_foldCase(ch, options);
01259 }
01260     
01261 // Checks whether the Unicode character is a space character; delegated to u_isspace().
// Compare isWhitespace(), which delegates to the differently named u_isWhitespace().
01262 inline UBool
01263 Unicode::isSpaceChar(UChar32 ch) {
01264     return u_isspace(ch);
01265 }
01266 
01267 // Determines whether the specified character is white space according to ICU; delegated to u_isWhitespace().
// Compare isSpaceChar(), which delegates to u_isspace() instead.
01268 inline UBool
01269 Unicode::isWhitespace(UChar32 ch) {
01270     return u_isWhitespace(ch);
01271 }
01272 
01273 // Gets the Unicode character's type value: returns u_charType(ch) as an int8_t.
// Presumably the result is one of the EUnicodeGeneralTypes constants - confirm
// against the u_charType documentation.
01274 inline int8_t
01275 Unicode::getType(UChar32 ch) {
01276     return u_charType(ch);
01277 }
01278 
// Gets the combining class of the code point c: returns u_getCombiningClass(c) as a uint8_t.
01279 inline uint8_t
01280 Unicode::getCombiningClass(UChar32 c) {
01281     return u_getCombiningClass(c);
01282 }
01283 
01284 // Gets the character's linguistic directionality.
// The value comes from u_charDirection(ch) and is converted to
// EDirectionProperty with a plain cast, i.e. no range check or translation is
// done here; the two numberings are presumably kept identical - TODO confirm.
01285 inline Unicode::EDirectionProperty
01286 Unicode::characterDirection(UChar32 ch) {
01287     return (EDirectionProperty)u_charDirection(ch);
01288 }
01289 
01290 // Determines whether the character has the "mirrored" property; delegated to u_isMirrored().
// The in-class declaration names the parameter `c`; this definition uses `ch`.
01291 inline UBool
01292 Unicode::isMirrored(UChar32 ch) {
01293     return u_isMirrored(ch);
01294 }
01295 
01296 // Maps the character to a "mirror-image" character, or to itself; delegated to u_charMirror().
// The in-class declaration names the parameter `c`; this definition uses `ch`.
01297 inline UChar32
01298 Unicode::charMirror(UChar32 ch) {
01299     return u_charMirror(ch);
01300 }
01301 
01302 // Gets the script associated with the character.
// The value comes from u_charScript(ch) and is converted to EUnicodeScript with
// a plain cast, with no range check; this is why the enum carries the reminder
// to stay in sync with UCharScript.
01303 inline Unicode::EUnicodeScript
01304 Unicode::getScript(UChar32 ch) {
01305     return (EUnicodeScript) u_charScript(ch);
01306 }
01307 
01308 // Gets the table cell width of the Unicode character: returns u_charCellWidth(ch) as a uint16_t.
// Presumably the result is one of the ECellWidths constants - TODO confirm.
01309 inline uint16_t
01310 Unicode::getCellWidth(UChar32 ch) {
01311     return u_charCellWidth(ch);
01312 }
01313 
// Retrieves the name of the code point `code` by calling u_charName.
//   code          the code point whose name is requested
//   buffer        destination passed straight to u_charName
//   bufferLength  capacity of `buffer`, passed straight to u_charName
//   nameChoice    which name to fetch (the declaration defaults it to U_UNICODE_CHAR_NAME)
// Returns the length reported by u_charName when the call succeeds. Any failure,
// as judged by U_SUCCESS on the local error code, is reported as 0, so the
// caller cannot learn which error occurred.
01314 inline int32_t
01315 Unicode::getCharName(uint32_t code,
01316                      char *buffer, int32_t bufferLength,
01317                      UCharNameChoice nameChoice) {
      // Local status only; it is inspected below and then discarded.
01318     UErrorCode errorCode=U_ZERO_ERROR;
01319     int32_t length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode);
01320     return U_SUCCESS(errorCode) ? length : 0;
01321 }
01322 
// Gets the digit value of ch: returns u_charDigitValue(ch) unchanged.
// The value reported for characters that are not digits is defined by
// u_charDigitValue(), outside this file.
01323 inline int32_t            
01324 Unicode::digitValue(UChar32 ch) {
01325     return u_charDigitValue(ch);
01326 }
01327 
// Gets the value of ch as a digit in the given radix: returns u_digit(ch, radix) unchanged.
// No radix validation happens here; presumably u_digit() handles radices outside
// MIN_RADIX..MAX_RADIX itself - TODO confirm.
01328 inline int32_t
01329 Unicode::digit(UChar32 ch, int8_t radix) {
01330     return u_digit(ch, radix);
01331 }
01332 
// Gets the character that represents `digit` in the given radix: returns
// u_forDigit(digit, radix) unchanged.
// No argument validation happens here; presumably u_forDigit() handles
// out-of-range digits and radices itself - TODO confirm.
01333 inline UChar32
01334 Unicode::forDigit(int32_t digit, int8_t radix) {
01335     return u_forDigit(digit, radix);
01336 }
01337 
// Fills versionArray with the Unicode version by calling u_getUnicodeVersion().
// This definition is marked inline although the in-class declaration is not,
// and the declaration names the parameter `info` rather than `versionArray`.
01338 inline void
01339 Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01340     u_getUnicodeVersion(versionArray);
01341 }
01342 U_NAMESPACE_END
01343 
01344 #endif

Generated on Sun Aug 21 23:34:46 2005 for ICU 2.1 by  doxygen 1.4.4