1: # 489 "./lpsrc/flx_string.ipk"
2:
3: (* Universal Character Names in identifiers:
4: Below, a table of letters acceptable in identifiers.
5:
6: Source: ISO Standard C++, Appendix E.
7: Which came from ISO/IEC PDTR 10176, produced by
8: ISO/IEC JTC1/SC22/WG20 (internationalisation)
9:
10: Characters must be in the range shown
11: inclusive. This list must be strictly ordered.
12:
13: Felix also allows
14: underscore, prime, and digits in identifiers.
15: Digits must not be first.
16: *)
17: open Flx_string
18:
(** Inclusive [(first, last)] code point ranges of the letters that may
    appear in identifiers, transcribed from ISO C++ Annex E.

    Invariant: every pair satisfies [first <= last], and the list is
    strictly ordered and non-overlapping (each [last] is smaller than the
    following [first]).  [check_code] scans the list front to back and
    gives up as soon as it meets a range that starts above the code point
    it is looking for, so an out-of-order entry silently hides the
    characters of the ranges around it. *)
let ucs_id_ranges = [
  (* ASCII *)
  (0x0041,0x005a);
  (0x0061,0x007a);

  (* Latin *)
  (0x00c0,0x00d6);
  (0x00d8,0x00f6);
  (0x00f8,0x01f5);
  (0x01fa,0x0217);
  (0x0250,0x02a8);

  (* Greek *)
  (0x0384,0x0384);
  (0x0388,0x038a);
  (0x038c,0x038c);
  (0x038e,0x03a1);
  (0x03a3,0x03ce);
  (0x03d0,0x03d6);
  (0x03da,0x03da);
  (0x03dc,0x03dc);
  (0x03de,0x03de);
  (0x03e0,0x03e0);
  (0x03e2,0x03f3);

  (* Cyrillic *)
  (0x0401,0x040d);
  (0x040f,0x044f);
  (0x0451,0x045c);
  (0x045e,0x0481);
  (0x0490,0x04c4);
  (0x04c7,0x04c8); (* was (0x04c7,0x04c4): an empty range *)
  (0x04cb,0x04cc);
  (0x04d0,0x04eb);
  (0x04ee,0x04f5);
  (0x04f8,0x04f9);

  (* Armenian *)
  (0x0531,0x0556);
  (0x0561,0x0587);

  (* Hebrew *)
  (0x05d0,0x05ea);
  (0x05f0,0x05f4);

  (* Arabic *)
  (0x0621,0x063a);
  (0x0640,0x0652);
  (0x0670,0x06b7);
  (0x06ba,0x06be);
  (0x06c0,0x06ce);
  (0x06e5,0x06e7);

  (* Devanagari *)
  (0x0905,0x0939);
  (0x0958,0x0962);

  (* Bengali *)
  (0x0985,0x098c);
  (0x098f,0x0990);
  (0x0993,0x09a8);
  (0x09aa,0x09b0);
  (0x09b2,0x09b2);
  (0x09b6,0x09b9);
  (0x09dc,0x09dd);
  (0x09df,0x09e1);
  (0x09f0,0x09f1);

  (* Gurmukhi *)
  (0x0a05,0x0a0a);
  (0x0a0f,0x0a10);
  (0x0a13,0x0a28);
  (0x0a2a,0x0a30);
  (0x0a32,0x0a33);
  (0x0a35,0x0a36);
  (0x0a38,0x0a39);
  (0x0a59,0x0a5c);
  (0x0a5e,0x0a5e);

  (* Gujarati *)
  (0x0a85,0x0a8b);
  (0x0a8d,0x0a8d);
  (0x0a8f,0x0a91);
  (0x0a93,0x0aa8);
  (0x0aaa,0x0ab0);
  (0x0ab2,0x0ab3);
  (0x0ab5,0x0ab9);
  (0x0ae0,0x0ae0);

  (* Oriya *)
  (0x0b05,0x0b0c);
  (0x0b0f,0x0b10);
  (0x0b13,0x0b28);
  (0x0b2a,0x0b30);
  (0x0b32,0x0b33);
  (0x0b36,0x0b39);
  (0x0b5c,0x0b5d);
  (0x0b5f,0x0b61);

  (* Tamil *)
  (0x0b85,0x0b8a);
  (0x0b8e,0x0b90);
  (0x0b92,0x0b95);
  (0x0b99,0x0b9a);
  (0x0b9c,0x0b9c);
  (0x0b9e,0x0b9f);
  (0x0ba3,0x0ba4);
  (0x0ba8,0x0baa);
  (0x0bae,0x0bb5);
  (0x0bb7,0x0bb9);

  (* Telugu *)
  (0x0c05,0x0c0c);
  (0x0c0e,0x0c10);
  (0x0c12,0x0c28);
  (0x0c2a,0x0c33);
  (0x0c35,0x0c39);
  (0x0c60,0x0c61);

  (* Kannada *)
  (0x0c85,0x0c8c);
  (0x0c8e,0x0c90);
  (0x0c92,0x0ca8);
  (0x0caa,0x0cb3);
  (0x0cb5,0x0cb9);
  (0x0ce0,0x0ce1);

  (* Malayalam *)
  (0x0d05,0x0d0c);
  (0x0d0e,0x0d10);
  (0x0d12,0x0d28);
  (0x0d2a,0x0d39);
  (0x0d60,0x0d61);

  (* Thai *)
  (0x0e01,0x0e30);
  (0x0e32,0x0e33);
  (0x0e40,0x0e46);
  (0x0e4f,0x0e5b);

  (* Lao *)
  (0x0e81,0x0e82);
  (0x0e84,0x0e84);
  (0x0e87,0x0e88);
  (0x0e8a,0x0e8a);
  (0x0e8d,0x0e8d); (* was 0x0e0d, a Thai letter already covered above *)
  (0x0e94,0x0e97);
  (0x0e99,0x0e9f);
  (0x0ea1,0x0ea3);
  (0x0ea5,0x0ea5);
  (0x0ea7,0x0ea7);
  (0x0eaa,0x0eab);
  (0x0ead,0x0eb0);
  (0x0eb2,0x0eb3);
  (0x0ebd,0x0ebd);
  (0x0ec0,0x0ec4);
  (0x0ec6,0x0ec6);

  (* Georgian *)
  (0x10a0,0x10c5);
  (0x10d0,0x10f6);

  (* Hangul Jamo *)
  (0x1100,0x1159);
  (0x1161,0x11a2);
  (0x11a8,0x11f9);

  (* Latin extensions *)
  (0x1e00,0x1e9a);
  (0x1ea0,0x1ef9);

  (* Greek extended *)
  (0x1f00,0x1f15);
  (0x1f18,0x1f1d);
  (0x1f20,0x1f45);
  (0x1f48,0x1f4d);
  (0x1f50,0x1f57);
  (0x1f59,0x1f59);
  (0x1f5b,0x1f5b);
  (0x1f5d,0x1f5d);
  (0x1f5f,0x1f7d);
  (0x1f80,0x1fb4);
  (0x1fb6,0x1fbc);
  (0x1fc2,0x1fc4);
  (0x1fc6,0x1fcc);
  (0x1fd0,0x1fd3);
  (0x1fd6,0x1fdb);
  (0x1fe0,0x1fec);
  (0x1ff2,0x1ff4);
  (0x1ff6,0x1ffc);

  (* Hiragana *)
  (0x3041,0x3094);
  (0x309b,0x309e);

  (* Katakana *)
  (0x30a1,0x30fe);

  (* Bopomofo *)
  (0x3105,0x312c);

  (* CJK Unified Ideographs *)
  (0x4e00,0x9fa5);

  (* CJK Compatibility Ideographs *)
  (0xf900,0xfa2d);

  (* Hebrew and Arabic Presentation Forms *)
  (0xfb1f,0xfb36);
  (0xfb38,0xfb3c);
  (0xfb3e,0xfb3e);
  (0xfb40,0xfb41);
  (0xfb42,0xfb44);
  (0xfb46,0xfbb1);
  (0xfbd3,0xfd3f); (* was 0xfd35 *)

  (* Arabic Presentation Forms-A *)
  (0xfd50,0xfd8f); (* was 0xfd85 *)
  (0xfd92,0xfdc7); (* was (0xfd92,0xfbc7): last below first *)
  (0xfdf0,0xfdfb);

  (* Arabic Presentation Forms-B *)
  (0xfe70,0xfe72);
  (0xfe74,0xfe74);
  (0xfe76,0xfefc);

  (* Half width and Fullwidth Forms *)
  (0xff21,0xff3a);
  (0xff41,0xff5a);
  (0xff66,0xffbe);
  (0xffc2,0xffc7);
  (0xffca,0xffcf);
  (0xffd2,0xffd7);
  (0xffda,0xffdc)
]
258:
(* Historically raised inside [check_code] to leave its scan early; kept
   declared so that the module's set of top-level names is unchanged. *)
exception Found

(** [check_code x] returns [()] when the code point [x] falls inside one
    of the inclusive ranges of [ucs_id_ranges].

    The ranges are visited front to back.  The scan stops with an error as
    soon as a range starts above [x], which is only a correct "not
    present" answer if the table is sorted in ascending order.

    @raise Flx_exceptions.LexError if no range is found to contain [x]. *)
let check_code x =
  let reject () =
    raise (Flx_exceptions.LexError ("Bad letter \\U"^hex8 x^" in identifier"))
  in
  let rec scan = function
    | [] -> reject ()
    | (first, last) :: rest ->
      if x < first then reject ()
      else if x <= last then ()
      else scan rest
  in
  scan ucs_id_ranges
275:
(** [utf8_to_ucn s] rewrites the identifier [s] into a form that uses
    universal character names.

    The input is consumed one character at a time.  A character is either
    - a backslash followed by [u] and 4 hex digits,
    - a backslash followed by [U] and 8 hex digits, or
    - whatever [parse_utf8 s i] yields at position [i] (presumably one
      UTF-8 encoded code point together with the index just past it;
      confirm against [Flx_string]).

    Every code point other than apostrophe, underscore and the ASCII
    digits is validated with [check_code].  Code points in the range
    0x20 to 126 are copied to the output as a single character; larger
    ones up to 0xFFFF, and those below 0x20, are written as a backslash,
    [u] and [hex4]; everything else is written as a backslash, [U] and
    [hex8].

    @raise Failure if a backslash is the last character, is not followed
    by [u] or [U], or is followed by too few characters for the escape.
    @raise Flx_exceptions.LexError (via [check_code]) if a code point is
    not allowed in an identifier. *)
let utf8_to_ucn s =
  let s' = Buffer.create 1000 in
  let n = String.length s in
  let i = ref 0 in
  while !i < n do
    let u,i' =
      if s.[!i]='\\'
      then begin
        incr i;
        (* [!i = n] means the backslash was the final character; testing
           with [>=] reports that instead of indexing past the end. *)
        if !i >= n
        then failwith ("Slosh at end of identifier " ^ s)
        else if s.[!i] = 'u'
        then begin
          incr i;
          if n - !i < 4
          then failwith
            (
              "\\u at col "^
              string_of_int !i ^
              " must be followed by 4 hex digits"
            )
          else
            let u = hexint_of_string (String.sub s !i 4) in
            u,!i + 4
        end else if s.[!i] = 'U'
        then begin
          incr i;
          if n - !i < 8
          then failwith
            (
              "\\U at col "^
              string_of_int !i ^
              " must be followed by 8 hex digits"
            )
          else
            let u = hexint_of_string (String.sub s !i 8) in
            u,!i + 8
        end else failwith
          (
            "Slosh in identifier '"^
            s^
            "' col "^
            string_of_int (!i+1)^
            " must be followed by u or U"
          )
      end
      else
        parse_utf8 s !i
    in
    i := i';
    (* Apostrophe, underscore and digits are accepted in identifiers
       without consulting the letter table. *)
    if (u <> 0x27) (* apostrophe *)
    && (u <> 0x5F) (* underscore *)
    && ((u < 0x30) || (u > 0x39)) (* digits *)
    then check_code u;
    match u with
    | x when x < 127 && x >= 0x20 ->
      Buffer.add_char s' (char_of_int x)
    | x when x<= 0xFFFF ->
      Buffer.add_string s' ("\\u" ^ hex4 x)
    | x ->
      Buffer.add_string s' ("\\U" ^ hex8 x)
  done;
  Buffer.contents s'
339: