5.19. String handling

Start ocaml section to src/flx_string.mli[1 /1 ]
     1: # 4 "./lpsrc/flx_string.ipk"
     2: val bin_char2int : char -> int
     3: val oct_char2int : char -> int
     4: val dec_char2int : char -> int
     5: val hex_char2int : char -> int
     6: 
     7: val binint_of_string : string -> int
     8: val octint_of_string : string -> int
     9: val decint_of_string : string -> int
    10: val hexint_of_string : string -> int
    11: 
    12: val binbig_int_of_string : string -> Big_int.big_int
    13: val octbig_int_of_string : string -> Big_int.big_int
    14: val decbig_int_of_string : string -> Big_int.big_int
    15: val hexbig_int_of_string : string -> Big_int.big_int
    16: 
    17: val floating_of_string : string -> float
    18: 
    19: val unescape : string -> string
    20: 
    21: val escape_of_string : char -> string -> string
    22: val py_dquote_of_string : string -> string
    23: val py_quote_of_string : string -> string
    24: val c_quote_of_string : string -> string
    25: val utf8_of_int : int -> string
    26: val parse_utf8 : string -> int -> int * int
    27: val hex2 : int -> string
    28: val hex4 : int -> string
    29: val hex8 : int -> string
    30: 
End ocaml section to src/flx_string.mli[1]
Start ocaml section to src/flx_string.ml[1 /1 ]
     1: # 35 "./lpsrc/flx_string.ipk"
     2: let hexchar_of_int i =
     3:   if i < 10
     4:   then char_of_int (i + (int_of_char '0'))
     5:   else char_of_int (i- 10 + (int_of_char 'A'))
     6: 
     7: let hex8 i =
     8:   let j = ref i in
     9:   let s = String.create 8 in
    10:   for k = 0 to 7 do
    11:     s.[7-k]  <- hexchar_of_int (!j mod 16);
    12:     j := !j / 16
    13:   done;
    14:   s
    15: 
    16: let hex4 i =
    17:   let j = ref i in
    18:   let s = String.create 4 in
    19:   for k = 0 to 3 do
    20:     s.[3-k]  <- hexchar_of_int (!j mod 16);
    21:     j := !j / 16
    22:   done;
    23:   s
    24: 
    25: let hex2 i =
    26:   let j = ref i in
    27:   let s = String.create 2 in
    28:   for k = 0 to 1 do
    29:     s.[1-k]  <- hexchar_of_int (!j mod 16);
    30:     j := !j / 16
    31:   done;
    32:   s
    33: 
    34: let escape_of_char quote ch =
    35:   if ch = '\\' then "\\\\"
    36:   else if ch = quote then "\\" ^ (String.make 1 quote)
    37:   else if ch = '\n' then "\\n"
    38:   else if ch < ' '
    39:   or ch > char_of_int 126
    40:   then "\\x" ^ (hex2 (Char.code ch))
    41:   else String.make 1 ch
    42: 
    43: let escape_of_string quote x =
    44:   let esc = escape_of_char quote in
    45:   let res = ref "" in
    46:   for i = 0 to (String.length x -1) do
    47:     res := !res ^ (esc x.[i])
    48:   done;
    49:   (String.make 1) quote ^ !res ^ (String.make 1 quote)
    50: 
    51: let py_dquote_of_string = escape_of_string '"';;
    52: let c_quote_of_string = escape_of_string '"';;
    53: let py_quote_of_string = escape_of_string '\'';;
    54: 
    55: let string_of_char c = String.make 1 c;;
    56: 
    57: let bin_char2int s =
    58:   let c = Char.code s in
    59:   match s with
    60:   | '0' -> 0
    61:   | '1' -> 1
    62:   | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not binary digit"))
    63: 
    64: let oct_char2int s =
    65:   let c = Char.code s in
    66:   match s with
    67:     _ when (s >= '0' & s <= '7') ->
    68:       c - (Char.code '0')
    69:   | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not octal digit"))
    70: 
    71: let dec_char2int s =
    72:   let c = Char.code s in
    73:   match s with
    74:     _ when (s >= '0' & s <= '9') ->
    75:       c - (Char.code '0')
    76:   | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not decimal digit"))
    77: 
    78: let hex_char2int s =
    79:   let c = Char.code s in
    80:   match s with
    81:     _ when (s >= '0' & s <= '9') ->
    82:       c - (Char.code '0')
    83:   | _ when (s >= 'a' & s <= 'f') ->
    84:       (c - (Char.code 'a')) + 10
    85:   | _ when (s >= 'A' & s <= 'F') ->
    86:       (c - (Char.code 'A')) + 10
    87:   | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not hexadecimal digit"))
    88: 
    89: 
    90: let len = String.length;;
    91: 
    92: let binint_of_string s =
    93:   let len = len s in
    94:   let value = ref 0 in
    95:   for i = 0 to (len - 1) do
    96:     if s.[i] <> '_'
    97:     then value := !value * 2 + (bin_char2int s.[i])
    98:   done;
    99:   !value
   100: 
   101: let octint_of_string s =
   102:   let len = len s in
   103:   let value = ref 0 in
   104:   for i = 0 to (len - 1) do
   105:     if s.[i] <> '_'
   106:     then value := !value * 8 + (oct_char2int s.[i])
   107:   done;
   108:   !value
   109: 
   110: let decint_of_string s =
   111:   let len = len s in
   112:   let value = ref 0 in
   113:   for i = 0 to (len - 1) do
   114:     if s.[i] <> '_'
   115:     then value := !value * 10 + (dec_char2int s.[i])
   116:   done;
   117:   !value
   118: 
   119: let hexint_of_string s =
   120:   let len = len s in
   121:   let value = ref 0 in
   122:   for i = 0 to (len - 1) do
   123:     if s.[i] <> '_'
   124:     then value := !value * 16 + (hex_char2int s.[i])
   125:   done;
   126:   !value
   127: 
   128: let binbig_int_of_string s =
   129:   let len = len s in
   130:   let value = ref (Big_int.big_int_of_int 0) in
   131:   for i = 0 to (len - 1) do
   132:     if s.[i] <> '_'
   133:     then value :=
   134:       Big_int.add_int_big_int
   135:         (bin_char2int s.[i])
   136:         (Big_int.mult_int_big_int 2 !value)
   137:   done;
   138:   !value
   139: 
   140: let octbig_int_of_string s =
   141:   let len = len s in
   142:   let value = ref (Big_int.big_int_of_int 0) in
   143:   for i = 0 to (len - 1) do
   144:     if s.[i] <> '_'
   145:     then value :=
   146:       Big_int.add_int_big_int
   147:         (oct_char2int s.[i])
   148:         (Big_int.mult_int_big_int 8 !value)
   149:   done;
   150:   !value
   151: 
   152: let decbig_int_of_string s =
   153:   let len = len s in
   154:   let value = ref (Big_int.big_int_of_int 0) in
   155:   for i = 0 to (len - 1) do
   156:     if s.[i] <> '_'
   157:     then value :=
   158:       Big_int.add_int_big_int
   159:         (dec_char2int s.[i])
   160:         (Big_int.mult_int_big_int 10 !value)
   161:   done;
   162:   !value
   163: 
   164: let hexbig_int_of_string s =
   165:   let len = len s in
   166:   let value = ref (Big_int.big_int_of_int 0) in
   167:   for i = 0 to (len - 1) do
   168:     if s.[i] <> '_'
   169:     then value :=
   170:       Big_int.add_int_big_int
   171:         (hex_char2int s.[i])
   172:         (Big_int.mult_int_big_int 16 !value)
   173:   done;
   174:   !value
   175: 
   176: let floating_of_string s' =
   177:   let dst = ref 0 in
   178:   let s = String.copy s' in
   179:   for src = 0 to (String.length s) - 1 do
   180:     if s.[src] <> '_'
   181:     then begin
   182:       s.[!dst] <- s.[src];
   183:       incr dst
   184:     end
   185:   done;
   186:   float_of_string (String.sub s 0 !dst)
   187: 
   188: (* WARNING: THIS CODE WILL NOT WORK FOR THE HIGHER PLANES
   189:   BECAUSE OCAML ONLY SUPPORTS 31 bit signed integers;
   190:   THIS CODE REQUIRES 32 bits [This can be fixed by using
   191:   negative codes but hasn't been done]
   192: 
   193:   HAPPINESS: Since the above note was posted,
   194:   ISO10646/Unicode has agreed on a 20 bit address
   195:   space for code points.
   196: *)
   197: 
   198: (* parse the first utf8 encoded character of a string s
   199:   starting at index position i, return a pair
   200:   consisting of the decoded integers, and the position
   201:   of the first character not decoded.
   202: 
   203:   If the first character is bad, it is returned,
   204:   otherwise if the encoding is bad, the result is
   205:   an unspecified value.
   206: 
   207:   Fails if the index is past or at
   208:   the end of the string.
   209: 
   210:   COMPATIBILITY NOTE: if this function is called
   211:   with a SINGLE character string, it will return
   212:   the usual value for the character, in range
   213:   0 .. 255
   214: *)
   215: 
   216: let parse_utf8 (s : string)  (i : int) : int * int =
   217:   let ord = int_of_char
   218:   and n = (String.length s)  - i
   219:   in
   220:   if n <= 0 then
   221:     failwith
   222:     (
   223:       "parse_utf8: index "^ string_of_int i^
   224:       " >= "^string_of_int (String.length s)^
   225:       " = length of '" ^ s ^ "'"
   226:     )
   227:   else let lead = ord (s.[i]) in
   228:     if (lead land 0x80) = 0 then
   229:       lead land 0x7F,i+1 (* ASCII *)
   230:     else if lead land 0xE0 = 0xC0 && n > 1 then
   231:       ((lead land 0x1F)  lsl  6) lor
   232:         (ord(s.[i+1]) land 0x3F),i+2
   233:     else if lead land 0xF0 = 0xE0 && n > 2 then
   234:       ((lead land 0x1F) lsl 12) lor
   235:         ((ord(s.[i+1]) land 0x3F)  lsl 6) lor
   236:         (ord(s.[i+2]) land 0x3F),i+3
   237:     else if lead land 0xF8 = 0xF0 && n > 3 then
   238:       ((lead land 0x1F) lsl 18) lor
   239:         ((ord(s.[i+1]) land 0x3F)  lsl 12) lor
   240:         ((ord(s.[i+2]) land 0x3F)  lsl 6) lor
   241:         (ord(s.[i+3]) land 0x3F),i+4
   242:     else if lead land 0xFC = 0xF8 && n > 4 then
   243:       ((lead land 0x1F) lsl 24) lor
   244:         ((ord(s.[i+1]) land 0x3F)  lsl 18) lor
   245:         ((ord(s.[i+2]) land 0x3F)  lsl 12) lor
   246:         ((ord(s.[i+3]) land 0x3F)  lsl 6) lor
   247:         (ord(s.[i+4]) land 0x3F),i+5
   248:     else if lead land 0xFE = 0xFC && n > 5 then
   249:       ((lead land 0x1F) lsl 30) lor
   250:         ((ord(s.[i+1]) land 0x3F)  lsl 24) lor
   251:         ((ord(s.[i+2]) land 0x3F)  lsl 18) lor
   252:         ((ord(s.[i+3]) land 0x3F)  lsl 12) lor
   253:         ((ord(s.[i+4]) land 0x3F)  lsl 6) lor
   254:         (ord(s.[i+5]) land 0x3F),i+6
   255:     else lead, i+1  (* error, just use bad character *)
   256: 
   257: (* convert an integer into a utf-8 encoded string of bytes *)
   258: let utf8_of_int i =
   259:   let chr x = String.make 1 (Char.chr x) in
   260:   if i < 0x80 then
   261:      chr(i)
   262:   else if i < 0x800 then
   263:      chr(0xC0 lor ((i lsr 6) land 0x1F))  ^
   264:       chr(0x80 lor (i land 0x3F))
   265:   else if i < 0x10000 then
   266:      chr(0xE0 lor ((i lsr 12) land 0xF)) ^
   267:       chr(0x80 lor ((i lsr 6) land 0x3F)) ^
   268:       chr(0x80 lor (i land 0x3F))
   269:   else if i < 0x200000 then
   270:      chr(0xF0 lor ((i lsr 18) land 0x7)) ^
   271:       chr(0x80 lor ((i lsr 12) land 0x3F)) ^
   272:       chr(0x80 lor ((i lsr 6) land 0x3F)) ^
   273:       chr(0x80 lor (i land 0x3F))
   274:   else if i < 0x4000000 then
   275:      chr(0xF8 lor ((i lsr 24) land 0x3)) ^
   276:       chr(0x80 lor ((i lsr 18) land 0x3F)) ^
   277:       chr(0x80 lor ((i lsr 12) land 0x3F)) ^
   278:       chr(0x80 lor ((i lsr 6) land 0x3F)) ^
   279:       chr(0x80 lor (i land 0x3F))
   280:   else chr(0xFC lor ((i lsr 30) land 0x1)) ^
   281:     chr(0x80 lor ((i lsr 24) land 0x3F)) ^
   282:     chr(0x80 lor ((i lsr 18) land 0x3F)) ^
   283:     chr(0x80 lor ((i lsr 12) land 0x3F)) ^
   284:     chr(0x80 lor ((i lsr 6) land 0x3F)) ^
   285:     chr(0x80 lor (i land 0x3F))
   286: 
   287: let unescape s =
   288:   let hex_limit = 2 in
   289:   let n = len s in
   290:   let s' = Buffer.create 1000 in
   291:   let deferred = ref 0 in
   292: 
   293:   (* tack char deferres tacking spaces until
   294:      the next non-space is received
   295:   *)
   296:   let tack_char ch =
   297:     if ch = ' ' then incr deferred
   298:     else begin
   299:       if !deferred<>0 then begin
   300:         Buffer.add_string s' (String.make !deferred ' ');
   301:         deferred := 0
   302:       end;
   303:       Buffer.add_char s' ch
   304:     end
   305:   in
   306: 
   307:   (* tack string always flushes deferred characters *)
   308:   let tack_string ss =
   309:     if !deferred<> 0 then begin
   310:        Buffer.add_string s' (String.make !deferred ' ');
   311:        deferred := 0
   312:      end;
   313:      Buffer.add_string s' ss
   314:   in
   315:   let tack_utf8 code = tack_string (utf8_of_int code) in
   316:   let i= ref 0 in
   317:   while !i< n do let ch = s.[!i] in
   318:     if ch = '\\' then begin
   319:       tack_string ""; (* flush spaces before any slosh *)
   320:       incr i;
   321:       if !i = n then tack_char '\\'
   322:       else match s.[!i] with
   323:       | 'a'  -> tack_char  '\007'; incr i   (* 7 : bell *)
   324:       | 'b'  -> tack_char  '\008'; incr i   (* 8 : backspace *)
   325:       | 't'  -> tack_char  '\t'; incr i     (* 9 : horizontal tab *)
   326: 
   327:       (* Note that \n flushes deferred spaces! *)
   328:       | 'n'  -> tack_char  '\n'; incr i     (* 10 : linefeed *)
   329:       | 'r'  -> tack_char  '\r'; incr i     (* 13 : return *)
   330:       | 'v'  -> tack_char  '\011'; incr i   (* vertical tab *)
   331:       | 'f'  -> tack_char  '\012'; incr i   (* form feed *)
   332:       | 'e'  -> tack_char  '\033'; incr i   (* 27: x1b: escape *)
   333: 
   334:       | '\\' -> tack_char  '\\'; incr i
   335:       | '"'  -> tack_char  '"'; incr i (* NOTE OCAMLLEX BUG: TWO SPACES REQUIRED *)
   336:       | '\'' -> tack_char  '\''; incr i
   337: 
   338:       (* this is the special case of \ spaces:
   339:          if the spaces are followed by a newline,
   340:          discard the spaces (and the newline!)
   341:          otherwise we keep the spaces
   342:       *)
   343:       | ' ' ->
   344:         while !i<n && s.[!i]=' ' do
   345:            incr deferred;
   346:            incr i
   347:         done;
   348:         if !i<n && s.[!i]='\n' then begin
   349:           deferred :=0;
   350:           incr i
   351:         end
   352: 
   353:       (* \newline is thrown out, but defered spaces are output *)
   354:       | '\n' -> incr i
   355:       | 'x' ->
   356:         begin
   357:           incr i;
   358:           let j = ref 0 and value = ref 0 in
   359:           while
   360:             (!i < n) &
   361:             (!j < hex_limit) &
   362:             (String.contains "0123456789ABCDEFabcdef" s.[!i]) do
   363:             value := !value * 16 + (hex_char2int s.[!i]);
   364:             incr i;
   365:             incr j
   366:           done;
   367:           tack_utf8 !value
   368:         end
   369:       | 'u' ->
   370:         begin
   371:           incr i;
   372:           let j = ref 0 and value = ref 0 in
   373:           while
   374:             (!i < n) &
   375:             (!j < 4) &
   376:             (String.contains "0123456789ABCDEFabcdef" s.[!i]) do
   377:             value := !value * 16 + (hex_char2int s.[!i]);
   378:             incr i;
   379:             incr j
   380:           done;
   381:           tack_utf8 !value
   382:         end
   383:       | 'U' ->
   384:         begin
   385:           incr i;
   386:           let j = ref 0 and value = ref 0 in
   387:           while
   388:             (!i < n) &
   389:             (!j < 8) &
   390:             (String.contains "0123456789ABCDEFabcdef" s.[!i]) do
   391:             value := !value * 16 + (hex_char2int s.[!i]);
   392:             incr i;
   393:             incr j
   394:           done;
   395:           tack_utf8 !value
   396:         end
   397:       | 'd' ->
   398:         begin
   399:           incr i;
   400:           let j = ref 0 and value = ref 0 in
   401:           while
   402:             (!i < n) &
   403:             (!j < 3) &
   404:             (String.contains "0123456789" s.[!i]) do
   405:             value := !value * 10 + (dec_char2int s.[!i]);
   406:             incr i;
   407:             incr j
   408:           done;
   409:           tack_utf8 !value
   410:         end
   411:       | 'o' ->
   412:         begin
   413:           incr i;
   414:           let j = ref 0 and value = ref 0 in
   415:           while
   416:             (!i < n) &
   417:             (!j < 3) &
   418:             (String.contains "01234567" s.[!i]) do
   419:             value := !value * 8 + (oct_char2int s.[!i]);
   420:             incr i;
   421:             incr j
   422:           done;
   423:           tack_utf8 !value
   424:         end
   425: 
   426:       | x -> tack_char '\\'; tack_char x;
   427:         incr i;
   428:     end else begin
   429: 
   430:       (* if we get a newline character, emit it
   431:          without preceding spaces
   432:       *)
   433:       if s.[!i]='\n' then deferred :=0;
   434:       tack_char s.[!i];
   435:       incr i
   436:     end
   437:   done;
   438:   tack_string "";  (* flush any deferred spaces *)
   439:   Buffer.contents s'
   440: 
   441: (* this routine converts strings containing
   442:    utf8 and/or \U \u escapes to a normalised
   443:    ASCII form using \U and \u escapes
   444:    for all codes in the range 0-1F, and >80
   445: *)
   446: 
   447: (* this routine converts strings containing
   448:    utf8 and/or \U \u escapes to a normalised
   449:    ASCII form using \U and \u escapes
   450:    for all codes in the range 0-1F, and >80
   451: *)
   452: 
End ocaml section to src/flx_string.ml[1]