5.20. Internationalised Identifier support

Start ocaml section to src/flx_id.ml[1 /1 ]
     1: # 489 "./lpsrc/flx_string.ipk"
     2: 
     3: (* Universal Character Names in identifiers:
     4:    Below, a table of letters acceptable in identifiers.
     5: 
     6:    Source: ISO Standard C++, Appendix E.
     7:    Which came from ISO/IEC PDTR 10176, produced by
     8:    ISO/IEC JTC1/SC22/WG20 (internationalisation)
     9: 
    10:    Characters must be in the range shown
    11:    inclusive. This list must be strictly ordered.
    12: 
    13:    Felix also allows
    14:    underscore, prime, and digits in identifiers.
    15:    Digits must not be first.
    16: *)
    17: open Flx_string
    18: 
    19: let ucs_id_ranges = [ (* inclusive (first,last) pairs, strictly ascending and disjoint; check_code depends on the order *)
    20:   (* ASCII *)
    21:   (0x0041,0x005a);
    22:   (0x0061,0x007a);
    23: 
    24:   (* Latin *)
    25:   (0x00c0,0x00d6);
    26:   (0x00d8,0x00f6);
    27:   (0x00f8,0x01f5);
    28:   (0x01fa,0x0217);
    29:   (0x0250,0x02a8);
    30: 
    31:   (* Greek *)
    32:   (0x0384,0x0384);
    33:   (0x0388,0x038a);
    34:   (0x038c,0x038c);
    35:   (0x038e,0x03a1);
    36:   (0x03a3,0x03ce);
    37:   (0x03d0,0x03d6);
    38:   (0x03da,0x03da);
    39:   (0x03dc,0x03dc);
    40:   (0x03de,0x03de);
    41:   (0x03e0,0x03e0);
    42:   (0x03e2,0x03f3);
    43: 
    44:   (* Cyrillic *)
    45:   (0x0401,0x040d);
    46:   (0x040f,0x044f);
    47:   (0x0451,0x045c);
    48:   (0x045e,0x0481);
    49:   (0x0490,0x04c4);
    50:   (0x04c7,0x04c8);
    51:   (0x04cb,0x04cc);
    52:   (0x04d0,0x04eb);
    53:   (0x04ee,0x04f5);
    54:   (0x04f8,0x04f9);
    55: 
    56:   (* Armenian *)
    57:   (0x0531,0x0556);
    58:   (0x0561,0x0587);
    59: 
    60: 
    61:   (* Hebrew *)
    62:   (0x05d0,0x05ea);
    63:   (0x05f0,0x05f4);
    64: 
    65:   (* Arabic *)
    66:   (0x0621,0x063a);
    67:   (0x0640,0x0652);
    68:   (0x0670,0x06b7);
    69:   (0x06ba,0x06be);
    70:   (0x06c0,0x06ce);
    71:   (0x06e5,0x06e7);
    72: 
    73:   (* Devanagari *)
    74:   (0x0905,0x0939);
    75:   (0x0958,0x0962);
    76: 
    77:   (* Bengali *)
    78:   (0x0985,0x098c);
    79:   (0x098f,0x0990);
    80:   (0x0993,0x09a8);
    81:   (0x09aa,0x09b0);
    82:   (0x09b2,0x09b2);
    83:   (0x09b6,0x09b9);
    84:   (0x09dc,0x09dd);
    85:   (0x09df,0x09e1);
    86:   (0x09f0,0x09f1);
    87: 
    88:   (* Gurmukhi *)
    89:   (0x0a05,0x0a0a);
    90:   (0x0a0f,0x0a10);
    91:   (0x0a13,0x0a28);
    92:   (0x0a2a,0x0a30);
    93:   (0x0a32,0x0a33);
    94:   (0x0a35,0x0a36);
    95:   (0x0a38,0x0a39);
    96:   (0x0a59,0x0a5c);
    97:   (0x0a5e,0x0a5e);
    98: 
    99:   (* Gujarati *)
   100:   (0x0a85,0x0a8b);
   101:   (0x0a8d,0x0a8d);
   102:   (0x0a8f,0x0a91);
   103:   (0x0a93,0x0aa8);
   104:   (0x0aaa,0x0ab0);
   105:   (0x0ab2,0x0ab3);
   106:   (0x0ab5,0x0ab9);
   107:   (0x0ae0,0x0ae0);
   108: 
   109:   (* Oriya *)
   110:   (0x0b05,0x0b0c);
   111:   (0x0b0f,0x0b10);
   112:   (0x0b13,0x0b28);
   113:   (0x0b2a,0x0b30);
   114:   (0x0b32,0x0b33);
   115:   (0x0b36,0x0b39);
   116:   (0x0b5c,0x0b5d);
   117:   (0x0b5f,0x0b61);
   118: 
   119:   (* Tamil *)
   120:   (0x0b85,0x0b8a);
   121:   (0x0b8e,0x0b90);
   122:   (0x0b92,0x0b95);
   123:   (0x0b99,0x0b9a);
   124:   (0x0b9c,0x0b9c);
   125:   (0x0b9e,0x0b9f);
   126:   (0x0ba3,0x0ba4);
   127:   (0x0ba8,0x0baa);
   128:   (0x0bae,0x0bb5);
   129:   (0x0bb7,0x0bb9);
   130: 
   131:   (* Telugu *)
   132:   (0x0c05,0x0c0c);
   133:   (0x0c0e,0x0c10);
   134:   (0x0c12,0x0c28);
   135:   (0x0c2a,0x0c33);
   136:   (0x0c35,0x0c39);
   137:   (0x0c60,0x0c61);
   138: 
   139:   (* Kannada *)
   140:   (0x0c85,0x0c8c);
   141:   (0x0c8e,0x0c90);
   142:   (0x0c92,0x0ca8);
   143:   (0x0caa,0x0cb3);
   144:   (0x0cb5,0x0cb9);
   145:   (0x0ce0,0x0ce1);
   146: 
   147:   (* Malayalam *)
   148:   (0x0d05,0x0d0c);
   149:   (0x0d0e,0x0d10);
   150:   (0x0d12,0x0d28);
   151:   (0x0d2a,0x0d39);
   152:   (0x0d60,0x0d61);
   153: 
   154:   (* Thai *)
   155:   (0x0e01,0x0e30);
   156:   (0x0e32,0x0e33);
   157:   (0x0e40,0x0e46);
   158:   (0x0e4f,0x0e5b);
   159: 
   160:   (* Lao *)
   161:   (0x0e81,0x0e82);
   162:   (0x0e84,0x0e84);
   163:   (0x0e87,0x0e88);
   164:   (0x0e8a,0x0e8a);
   165:   (0x0e8d,0x0e8d);
   166:   (0x0e94,0x0e97);
   167:   (0x0e99,0x0e9f);
   168:   (0x0ea1,0x0ea3);
   169:   (0x0ea5,0x0ea5);
   170:   (0x0ea7,0x0ea7);
   171:   (0x0eaa,0x0eab);
   172:   (0x0ead,0x0eb0);
   173:   (0x0eb2,0x0eb3);
   174:   (0x0ebd,0x0ebd);
   175:   (0x0ec0,0x0ec4);
   176:   (0x0ec6,0x0ec6);
   177: 
   178:   (* Georgian *)
   179:   (0x10a0,0x10c5);
   180:   (0x10d0,0x10f6);
   181: 
   182:   (* Hangul Jamo *)
   183:   (0x1100,0x1159);
   184:   (0x1161,0x11a2);
   185:   (0x11a8,0x11f9);
   186: 
   187: 
   188:   (* Latin extensions *)
   189:   (0x1e00,0x1e9a);
   190:   (0x1ea0,0x1ef9);
   191: 
   192:   (* Greek extended *)
   193:   (0x1f00,0x1f15);
   194:   (0x1f18,0x1f1d);
   195:   (0x1f20,0x1f45);
   196:   (0x1f48,0x1f4d);
   197:   (0x1f50,0x1f57);
   198:   (0x1f59,0x1f59);
   199:   (0x1f5b,0x1f5b);
   200:   (0x1f5d,0x1f5d);
   201:   (0x1f5f,0x1f7d);
   202:   (0x1f80,0x1fb4);
   203:   (0x1fb6,0x1fbc);
   204:   (0x1fc2,0x1fc4);
   205:   (0x1fc6,0x1fcc);
   206:   (0x1fd0,0x1fd3);
   207:   (0x1fd6,0x1fdb);
   208:   (0x1fe0,0x1fec);
   209:   (0x1ff2,0x1ff4);
   210:   (0x1ff6,0x1ffc);
   211: 
   212: 
   213:   (* Hiragana *)
   214:   (0x3041,0x3094);
   215:   (0x309b,0x309e);
   216: 
   217:   (* Katakana *)
   218:   (0x30a1,0x30fe);
   219: 
   220:   (* Bopomofo *)
   221:   (0x3105,0x312c);
   222: 
   223:   (* CJK Unified Ideographs *)
   224:   (0x4e00,0x9fa5);
   225: 
   226:   (* CJK Compatibility Ideographs *)
   227:   (0xf900,0xfa2d);
   228: 
   229:   (* Arabic Presentation Forms *)
   230:   (0xfb1f,0xfb36);
   231:   (0xfb38,0xfb3c);
   232:   (0xfb3e,0xfb3e);
   233:   (0xfb40,0xfb41);
   234:   (0xfb42,0xfb44);
   235:   (0xfb46,0xfbb1);
   236:   (0xfbd3,0xfd35); (* NOTE(review): the cited standard may give fd3f as the upper bound - confirm *)
   237: 
   238:   (* Arabic Presentation Forms-A *)
   239:   (0xfd50,0xfd85); (* NOTE(review): the cited standard may give fd8f as the upper bound - confirm *)
   240:   (0xfd92,0xfdc7);
   241:   (0xfdf0,0xfdfb);
   242: 
   243:   (* Arabic Presentation Forms-B *)
   244:   (0xfe70,0xfe72);
   245:   (0xfe74,0xfe74);
   246:   (0xfe76,0xfefc);
   247: 
   248:   (* Half width and Fullwidth Forms *)
   249:   (0xff21,0xff3a);
   250:   (0xff41,0xff5a);
   251:   (0xff66,0xffbe);
   252:   (0xffc2,0xffc7);
   253:   (0xffca,0xffcf);
   254:   (0xffd2,0xffd7);
   255:   (0xffda,0xffdc)
   256: 
   257: ]
   258: 
   259: exception Found (* declared as before; the scan below returns directly *)
   260: let check_code x =
   261:   (* x is acceptable when some inclusive range of ucs_id_ranges holds it *)
   262:   let reject () =
   263:     raise (Flx_exceptions.LexError ("Bad letter \\U"^hex8 x^" in identifier"))
   264:   in
   265:   (* the table is required to be ascending, so x below the start of the
   266:      current range cannot be in any later range and is rejected at once *)
   267:   let rec scan = function
   268:     | [] -> reject ()
   269:     | (lo, hi) :: rest ->
   270:       if x < lo then reject ()
   271:       else if x <= hi then ()
   272:       else scan rest
   273:   in
   274:   scan ucs_id_ranges
   275: 
   276: (* Rewrite identifier s (UTF-8, possibly containing \uXXXX or \UXXXXXXXX
   277:    escapes) so that printable ASCII is kept and every other code point
   278:    becomes a \u or \U escape.  Code points other than apostrophe,
   279:    underscore and ASCII digits are passed to check_code, which raises
   280:    Flx_exceptions.LexError for a letter not allowed in identifiers.
   281:    Raises Failure on a malformed slosh escape. *)
   282: let utf8_to_ucn s =
   283:   let s' = Buffer.create 1000 in
   284:   let n = String.length s in
   285:   let i = ref 0 in
   286:   (* decode the k hex digits at !i; yields (code point, index after them) *)
   287:   let escape tag k =
   288:     if n - !i < k
   289:     then failwith
   290:     (
   291:       "\\"^tag^" at col "^
   292:       string_of_int !i ^
   293:       " must be followed by "^string_of_int k^" hex digits"
   294:     )
   295:     else (hexint_of_string (String.sub s !i k), !i + k)
   296:   in
   297:   while !i < n do
   298:     let u,i' =
   299:       if s.[!i]='\\'
   300:       then begin
   301:         incr i;
   302:         (* >= rather than >: if the slosh is the last character then
   303:            !i = n here and s.[!i] would be out of bounds *)
   304:         if !i >= n
   305:         then failwith ("Slosh at end of identifier " ^ s)
   306:         else if s.[!i] = 'u'
   307:         then begin incr i; escape "u" 4 end
   308:         else if s.[!i] = 'U'
   309:         then begin incr i; escape "U" 8 end
   310:         else failwith
   311:         (
   312:           "Slosh in identifier '"^
   313:           s^
   314:           "' col "^
   315:           string_of_int (!i+1)^
   316:           " must be followed by u or U"
   317:         )
   318:       end
   319:       else
   320:        parse_utf8 s !i
   321:     in
   322:       i := i';
   323:       (* apostrophe, underscore and digits are accepted without a table
   324:          lookup; everything else must be an identifier letter *)
   325:       if (u <> 0x27) (* apostrophe *)
   326:       && (u <> 0x5F) (* underscore *)
   327:       && ((u < 0x30) || (u > 0x39)) (* digits *)
   328:       then check_code u;
   329:       (* printable ASCII is copied through; the rest become \u / \U escapes *)
   330:       match u with
   331:       | x when x < 127 && x >= 0x20 ->
   332:         Buffer.add_char s' (char_of_int x)
   333:       | x when x<= 0xFFFF ->
   334:         Buffer.add_string s' ("\\u" ^ hex4 x)
   335:       | x ->
   336:         Buffer.add_string s' ("\\U" ^ hex8 x)
   337:   done;
   338:   Buffer.contents s'
   339: 
End ocaml section to src/flx_id.ml[1]
Start ocaml section to src/flx_id.mli[1 /1 ]
     1: # 829 "./lpsrc/flx_string.ipk"
     2: val ucs_id_ranges : (int * int) list (* inclusive (first, last) code point ranges of letters accepted in identifiers *)
     3: val utf8_to_ucn : string -> string (* identifier text with printable ASCII kept and other code points rewritten as \u / \U escapes *)
     4: 
End ocaml section to src/flx_id.mli[1]