7.1. Misc

Start ocaml section to src/inria_cset.mli[1 /1 ]
     1: # 2 "./lpsrc/inria_re.ipk"
     2: (***********************************************************************)
     3: (*                                                                     *)
     4: (*                           Objective Caml                            *)
     5: (*                                                                     *)
     6: (*            Luc Maranget, Jerome Vouillon projet Cristal,            *)
     7: (*                         INRIA Rocquencourt                          *)
     8: (*                                                                     *)
     9: (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
    10: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    11: (*  under the terms of the Q Public License version 1.0.               *)
    12: (*                                                                     *)
    13: (***********************************************************************)
    14: 
    15: (* $Id: inria_re.ipk,v 1.2 2005/10/13 21:14:32 skaller Exp $ *)
    16: 
    17: (* Set of characters encoded as list of intervals *)
    18: 
    19: type t
    20: exception Bad
    21: 
    22: val empty : t
    23: val is_empty : t -> bool
    24: val all_chars : t
    25: 
    26: val all_chars_eof : t
    27: val eof : t
    28: val singleton : int ->  t
    29: val interval : int -> int -> t
    30: val union : t -> t -> t
    31: val inter : t -> t -> t
    32: val diff : t -> t -> t
    33: val complement : t -> t
    34: val env_to_array : (t * 'a) list -> 'a array
    35: val string_of_characters : t -> string
    36: 
    37: 
End ocaml section to src/inria_cset.mli[1]
Start ocaml section to src/inria_cset.ml[1 /1 ]
     1: # 39 "./lpsrc/inria_re.ipk"
     2: (***********************************************************************)
     3: (*                                                                     *)
     4: (*                           Objective Caml                            *)
     5: (*                                                                     *)
     6: (*            Luc Maranget, Jerome Vouillon projet Cristal,            *)
     7: (*                         INRIA Rocquencourt                          *)
     8: (*                                                                     *)
     9: (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
    10: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    11: (*  under the terms of the Q Public License version 1.0.               *)
    12: (*                                                                     *)
    13: (***********************************************************************)
    14: 
    15: (* $Id: inria_re.ipk,v 1.2 2005/10/13 21:14:32 skaller Exp $ *)
    16: 
    17: 
    18: exception Bad
    19: 
    20: type t = (int * int) list
    21: 
    22: 
    23: let empty = []
    24: let is_empty = function
    25:   | [] -> true
    26:   | _  -> false
    27: 
    28: let singleton c = [c,c]
    29: 
    30: let interval c1 c2 =
    31:   if c1 <= c2 then [c1,c2]
    32:   else [c2,c1]
    33: 
    34: 
    35: let rec union s1 s2 = match s1,s2 with
    36: | [],_ -> s2
    37: | _,[] -> s1
    38: | (c1,d1) as p1::r1, (c2,d2)::r2 ->
    39:     if c1 > c2 then
    40:       union s2 s1
    41:     else begin (* c1 <= c2 *)
    42:       if d1+1 < c2 then
    43:         p1::union r1 s2
    44:       else if d1 < d2 then
    45:         union ((c1,d2)::r2) r1
    46:       else
    47:         union s1 r2
    48:     end
    49: 
    50: let rec inter l l' =  match l, l' with
    51:     _, [] -> []
    52:   | [], _ -> []
    53:   | (c1, c2)::r, (c1', c2')::r' ->
    54:       if c2 < c1' then
    55:         inter r l'
    56:       else if c2' < c1 then
    57:         inter l r'
    58:       else if c2 < c2' then
    59:         (max c1 c1', c2)::inter r l'
    60:       else
    61:         (max c1 c1', c2')::inter l r'
    62: 
    63: let rec diff l l' =  match l, l' with
    64:     _, [] -> l
    65:   | [], _ -> []
    66:   | (c1, c2)::r, (c1', c2')::r' ->
    67:       if c2 < c1' then
    68:         (c1, c2)::diff r l'
    69:       else if c2' < c1 then
    70:         diff l r'
    71:       else
    72:         let r'' = if c2' < c2 then (c2' + 1, c2) :: r else r in
    73:         if c1 < c1' then
    74:           (c1, c1' - 1)::diff r'' r'
    75:         else
    76:           diff r'' r'
    77: 
    78: 
    79: let eof = singleton 256
    80: and all_chars = interval 0 255
    81: and all_chars_eof = interval 0 256
    82: 
    83: let complement s = diff all_chars s
    84: 
    85: let env_to_array env = match env with
    86: | []         -> assert false
    87: | (_,x)::rem ->
    88:     let res = Array.create 257 x in
    89:     List.iter
    90:       (fun (c,y) ->
    91:         List.iter
    92:           (fun (i,j) ->
    93:             for k=i to j do
    94:               res.(k) <- y
    95:             done)
    96:           c)
    97:       rem ;
    98:     res
    99: 
   100: let string_of_characters c = match c with
   101: | [] -> "Empty"
   102: | (x,_) :: _ -> String.make 1 (Char.chr x)
   103: 
End ocaml section to src/inria_cset.ml[1]
Start ocaml section to src/inria_syntax.mli[1 /1 ]
     1: # 142 "./lpsrc/inria_re.ipk"
     2: 
     3: (***********************************************************************)
     4: (*                                                                     *)
     5: (*                           Objective Caml                            *)
     6: (*                                                                     *)
     7: (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
     8: (*                                                                     *)
     9: (*  Copyright 1996 Institut National de Recherche en Informatique et   *)
    10: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    11: (*  under the terms of the Q Public License version 1.0.               *)
    12: (*                                                                     *)
    13: (***********************************************************************)
    14: 
    15: (* $Id: inria_re.ipk,v 1.2 2005/10/13 21:14:32 skaller Exp $ *)
    16: 
    17: (* The shallow abstract syntax *)
    18: 
    19: type location =
    20:     { start_pos: int;
    21:       end_pos: int;
    22:       start_line: int;
    23:       start_col: int }
    24: 
    25: type regular_expression =
    26:     Epsilon
    27:   | Characters of Inria_cset.t
    28:   | Eof
    29:   | Sequence of regular_expression * regular_expression
    30:   | Alternative of regular_expression * regular_expression
    31:   | Repetition of regular_expression
    32:   | Bind of regular_expression * string
    33: 
    34: type ('arg,'action) entry =
    35:   {name:string ;
    36:    shortest : bool ;
    37:    args : 'arg ;
    38:    clauses : (regular_expression * 'action) list}
    39: 
    40: type  lexer_definition =
    41:     { header: location;
    42:       entrypoints: ((string list, location) entry) list;
    43:       trailer: location }
    44: 
End ocaml section to src/inria_syntax.mli[1]
Start ocaml section to src/inria_syntax.ml[1 /1 ]
     1: # 186 "./lpsrc/inria_re.ipk"
     2: (***********************************************************************)
     3: (*                                                                     *)
     4: (*                           Objective Caml                            *)
     5: (*                                                                     *)
     6: (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
     7: (*                                                                     *)
     8: (*  Copyright 1996 Institut National de Recherche en Informatique et   *)
     9: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    10: (*  under the terms of the Q Public License version 1.0.               *)
    11: (*                                                                     *)
    12: (***********************************************************************)
    13: 
    14: (* $Id: inria_re.ipk,v 1.2 2005/10/13 21:14:32 skaller Exp $ *)
    15: 
    16: (* This apparently useless implmentation file is in fact required
    17:    by the pa_ocamllex syntax extension *)
    18: 
    19: (* The shallow abstract syntax *)
    20: 
    21: type location =
    22:     { start_pos: int;
    23:       end_pos: int;
    24:       start_line: int;
    25:       start_col: int }
    26: 
    27: type regular_expression =
    28:     Epsilon
    29:   | Characters of Inria_cset.t
    30:   | Eof
    31:   | Sequence of regular_expression * regular_expression
    32:   | Alternative of regular_expression * regular_expression
    33:   | Repetition of regular_expression
    34:   | Bind of regular_expression * string
    35: 
    36: type ('arg,'action) entry =
    37:   {name:string ;
    38:    shortest : bool ;
    39:    args : 'arg ;
    40:    clauses : (regular_expression * 'action) list}
    41: 
    42: type  lexer_definition =
    43:     { header: location;
    44:       entrypoints: ((string list, location) entry) list;
    45:       trailer: location }
    46: 
End ocaml section to src/inria_syntax.ml[1]
Start ocaml section to src/inria_table.mli[1 /1 ]
     1: # 232 "./lpsrc/inria_re.ipk"
     2: (***********************************************************************)
     3: (*                                                                     *)
     4: (*                           Objective Caml                            *)
     5: (*                                                                     *)
     6: (*            Luc Maranget, projet Moscova, INRIA Rocquencourt         *)
     7: (*                                                                     *)
     8: (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
     9: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    10: (*  under the terms of the Q Public License version 1.0.               *)
    11: (*                                                                     *)
    12: (***********************************************************************)
    13: 
    14: (* Table used for code emission, ie extensible arrays *)
    15: type 'a t
    16: val create : 'a -> 'a t
    17: val emit : 'a t -> 'a -> unit
    18: val iter : 'a t -> ('a -> unit) -> unit
    19: val trim : 'a t -> 'a array
    20: exception Error
    21: val get : 'a t -> int -> 'a
    22: val size : 'a t -> int
    23: 
End ocaml section to src/inria_table.mli[1]
Start ocaml section to src/inria_table.ml[1 /1 ]
     1: # 255 "./lpsrc/inria_re.ipk"
     2: 
     3: (***********************************************************************)
     4: (*                                                                     *)
     5: (*                           Objective Caml                            *)
     6: (*                                                                     *)
     7: (*            Luc Maranget, projet Moscova, INRIA Rocquencourt         *)
     8: (*                                                                     *)
     9: (*  Copyright 2002 Institut National de Recherche en Informatique et   *)
    10: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    11: (*  under the terms of the Q Public License version 1.0.               *)
    12: (*                                                                     *)
    13: (***********************************************************************)
    14: 
    15: type 'a t = {mutable next : int ; mutable data : 'a array}
    16: 
    17: let default_size = 32
    18: ;;
    19: 
    20: let create x = {next = 0 ; data = Array.create default_size x}
    21: and reset t = t.next <- 0
    22: ;;
    23: 
    24: let incr_table table new_size =
    25:   let t = Array.create new_size table.data.(0) in
    26:   Array.blit table.data 0 t 0 (Array.length table.data) ;
    27:   table.data <- t
    28: 
    29: let emit table i =
    30:  let size = Array.length table.data in
    31:  if table.next >= size then
    32:     incr_table table (2*size);
    33:  table.data.(table.next) <- i ;
    34:  table.next <- table.next + 1
    35: ;;
    36: 
    37: 
    38: exception Error
    39: 
    40: let get t i =
    41:   if 0 <= i && i < t.next then
    42:     t.data.(i)
    43:   else
    44:     raise Error
    45: 
    46: let trim t =
    47:   let r = Array.sub t.data 0 t.next in
    48:   reset t ;
    49:   r
    50: 
    51: let iter t f =
    52:   let size = t.next
    53:   and data = t.data in
    54:   for i = 0 to size-1 do
    55:     f data.(i)
    56:   done
    57: 
    58: let size t = t.next
    59: 
End ocaml section to src/inria_table.ml[1]
Start ocaml section to src/inria_lexgen.mli[1 /1 ]
     1: # 314 "./lpsrc/inria_re.ipk"
     2: 
     3: (***********************************************************************)
     4: (*                                                                     *)
     5: (*                           Objective Caml                            *)
     6: (*                                                                     *)
     7: (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
     8: (*                                                                     *)
     9: (*  Copyright 1996 Institut National de Recherche en Informatique et   *)
    10: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    11: (*  under the terms of the Q Public License version 1.0.               *)
    12: (*                                                                     *)
    13: (***********************************************************************)
    14: 
    15: (* $Id: inria_re.ipk,v 1.2 2005/10/13 21:14:32 skaller Exp $ *)
    16: 
    17: 
    18: (* raised when there are too many bindings (>= 254 memory cells) *)
    19: exception Memory_overflow
    20: 
    21: 
    22: (* Representation of automata *)
    23: 
    24: 
    25: type automata =
    26:     Perform of int * tag_action list
    27:   | Shift of automata_trans * (automata_move * memory_action list) array
    28: and automata_trans =
    29:     No_remember
    30:   | Remember of int * tag_action list
    31: and automata_move =
    32:     Backtrack
    33:   | Goto of int
    34: and memory_action =
    35:   | Copy of int * int
    36:   | Set of int
    37: 
    38: and tag_action = SetTag of int * int | EraseTag of int
    39: 
    40: 
    41: (* Representation of entry points *)
    42: type tag_base = Start | End | Mem of int
    43: type tag_addr = Sum of (tag_base * int)
    44: type ident_info =
    45:   | Ident_string of bool * tag_addr * tag_addr
    46:   | Ident_char of bool * tag_addr
    47: type t_env = (string * ident_info) list
    48: 
    49: type ('args,'action) automata_entry =
    50:   { auto_name: string;
    51:     auto_args: 'args ;
    52:     auto_mem_size : int ;
    53:     auto_initial_state: int * memory_action list ;
    54:     auto_actions: (int * t_env * 'action) list }
    55: 
    56: (* The entry point *)
    57: 
    58: val make_dfa :
    59:   ('args, 'action) Inria_syntax.entry list ->
    60:   ('args, 'action) automata_entry list * automata array
    61: 
End ocaml section to src/inria_lexgen.mli[1]
Start ocaml section to src/inria_lexgen.ml[1 /1 ]
     1: # 375 "./lpsrc/inria_re.ipk"
     2: (***********************************************************************)
     3: (*                                                                     *)
     4: (*                           Objective Caml                            *)
     5: (*                                                                     *)
     6: (*            Xavier Leroy, projet Cristal,                            *)
     7: (*            Luc Maranget, projet Moscova,                            *)
     8: (*                  INRIA Rocquencourt                                 *)
     9: (*                                                                     *)
    10: (*  Copyright 1996 Institut National de Recherche en Informatique et   *)
    11: (*  en Automatique.  All rights reserved.  This file is distributed    *)
    12: (*  under the terms of the Q Public License version 1.0.               *)
    13: (*                                                                     *)
    14: (***********************************************************************)
    15: 
    16: (* $Id: inria_re.ipk,v 1.2 2005/10/13 21:14:32 skaller Exp $ *)
    17: 
    18: (* Compiling a lexer definition *)
    19: 
    20: open Inria_syntax
    21: open Printf
    22: 
    23: exception Memory_overflow
    24: 
    25: (* Deep abstract syntax for regular expressions *)
    26: 
    27: type tag_info = {id : string ; start : bool ; action : int}
    28: 
    29: type regexp =
    30:     Empty
    31:   | Chars of int * bool
    32:   | Action of int
    33:   | Tag of tag_info
    34:   | Seq of regexp * regexp
    35:   | Alt of regexp * regexp
    36:   | Star of regexp
    37: 
    38: type tag_base = Start | End | Mem of int
    39: type tag_addr = Sum of (tag_base * int)
    40: type ident_info =
    41:   | Ident_string of bool * tag_addr * tag_addr
    42:   | Ident_char of bool * tag_addr
    43: type t_env = (string * ident_info) list
    44: 
    45: type ('args,'action) lexer_entry =
    46:   { lex_name: string;
    47:     lex_regexp: regexp;
    48:     lex_mem_tags: int ;
    49:     lex_actions: (int *  t_env * 'action) list }
    50: 
    51: 
    52: type automata =
    53:     Perform of int * tag_action list
    54:   | Shift of automata_trans * (automata_move * memory_action list) array
    55: 
    56: and automata_trans =
    57:     No_remember
    58:   | Remember of int * tag_action list
    59: 
    60: and automata_move =
    61:     Backtrack
    62:   | Goto of int
    63: 
    64: and memory_action =
    65:   | Copy of int * int
    66:   | Set of int
    67: 
    68: and tag_action = SetTag of int * int | EraseTag of int
    69: 
    70: let string_of_automata_move x = match x with
    71:   | Backtrack -> "backtrack"
    72:   | Goto i -> "goto " ^ string_of_int i
    73: 
    74: let string_of_tag_base x = match x with
    75:   | Start -> "Start"
    76:   | Mem j -> "Mem " ^ string_of_int j
    77:   | End -> "End"
    78: 
    79: let string_of_tag_addr x = match x with Sum (tb,i) ->
    80:   string_of_tag_base tb ^ " " ^ string_of_int i
    81: 
    82: let string_of_ident_info x = match x with
    83:   | Ident_string (b,t1,t2) ->
    84:     "string " ^
    85:     if b then "true " else "false " ^
    86:    "(" ^ string_of_tag_addr t1 ^ ", " ^ string_of_tag_addr t2 ^ ")"
    87:  | Ident_char (b,t1) ->
    88:     "char " ^
    89:     if b then "true " else "false " ^
    90:    "(" ^ string_of_tag_addr t1 ^ ")"
    91: 
    92: let string_of_tenv iidis =
    93:   if List.length iidis = 0 then "none" else
    94:   (
    95:   String.concat ";\n"
    96:     (
    97:       List.map
    98:       (fun (s,idi) -> "  var " ^ s ^ ":" ^string_of_ident_info idi)
    99:       iidis
   100:     )
   101:   ) ^ "\n"
   102: 
   103: (* Representation of entry points *)
   104: let string_of_tag_action x = match x with
   105:   | SetTag (i,j) -> "set " ^ string_of_int i ^ " " ^ string_of_int j
   106:   | EraseTag i -> "erase " ^ string_of_int i
   107: 
   108: let string_of_memory_action x = match x with
   109:   | Copy(i,j) -> "copy " ^ string_of_int i ^ " " ^ string_of_int j
   110:   | Set i -> "set " ^ string_of_int i
   111: 
   112: let string_of_tag_action_list x =
   113:   String.concat ", " (List.map string_of_tag_action x)
   114: 
   115: let string_of_automata_trans x = match x with
   116:   | No_remember -> "No_remember"
   117:   | Remember  (i,tal) -> "Remember " ^ string_of_int i ^ " "^
   118:     string_of_tag_action_list tal
   119: 
   120: let string_of_automata a = match a with
   121:   | Perform (i,tal) -> "perform " ^ string_of_int i ^ " " ^
   122:     string_of_tag_action_list tal
   123: 
   124:   | Shift (at,vect) -> "shift " ^ string_of_automata_trans at ^
   125:     "\n" ^
   126:     (
   127:       let s = ref "" in
   128:       for i = 0 to Array.length vect - 1 do
   129:         let move,macts = vect.(i) in
   130:         if move=Backtrack && List.length macts = 0 then () else
   131:         s:= !s ^ "Char " ^ string_of_int i ^ " " ^
   132:           "move="^ string_of_automata_move move ^
   133:           ", macts= " ^
   134:           (
   135:             if List.length macts = 0 then "none" else
   136:             String.concat ", "
   137:             (List.map string_of_memory_action macts)
   138:           ) ^ "\n"
   139:       done;
   140:       !s
   141:     )
   142: 
   143: type ('args,'action) automata_entry =
   144:   { auto_name: string;
   145:     auto_args: 'args ;
   146:     auto_mem_size : int ;
   147:     auto_initial_state: int * memory_action list;
   148:     auto_actions: (int * t_env * 'action) list }
   149: 
   150: let string_of_automata_entry {
   151:     auto_name = auto_name;
   152:     auto_mem_size = size ;
   153:     auto_initial_state = (i,mal);
   154:     auto_actions = itenvas
   155: } =
   156:   auto_name ^ " size=" ^ string_of_int size ^
   157:   ", init_state=" ^ string_of_int i ^ "\nmemacts=" ^
   158:   (if List.length mal = 0 then "none" else
   159:   String.concat "," (List.map (fun ma -> string_of_memory_action ma) mal)
   160:   ) ^ "\n" ^
   161:   "actions=\n" ^
   162:   String.concat "" (List.map (fun (i,te,_) ->
   163:     "action " ^ string_of_int i ^ "\n  " ^ string_of_tenv te
   164:   ) itenvas)
   165: 
   166: 
   167: (* A lot of sets and map structures *)
   168: 
   169: module Ints = Set.Make(struct type t = int let compare = compare end)
   170: 
   171: module Tags = Set.Make(struct type t = tag_info let compare = compare end)
   172: 
   173: module TagMap =
   174:   Map.Make (struct type t = tag_info let compare = compare end)
   175: 
   176: module StringSet =
   177:   Set.Make (struct type t = string let compare = Pervasives.compare end)
   178: module StringMap =
   179:   Map.Make (struct type t = string let compare = Pervasives.compare end)
   180: 
   181: (*********************)
   182: (* Variable cleaning *)
   183: (*********************)
   184: 
   185: (* Silently eliminate nested variables *)
   186: 
   187: let rec do_remove_nested to_remove = function
   188:   | Bind (e,x) ->
   189:       if StringSet.mem x to_remove then
   190:         do_remove_nested to_remove e
   191:       else
   192:         Bind (do_remove_nested (StringSet.add x to_remove) e, x)
   193:   | Epsilon|Eof|Characters _ as e -> e
   194:   | Sequence (e1, e2) ->
   195:       Sequence
   196:         (do_remove_nested to_remove  e1, do_remove_nested to_remove  e2)
   197:   | Alternative (e1, e2) ->
   198:       Alternative
   199:         (do_remove_nested to_remove  e1, do_remove_nested to_remove  e2)
   200:   | Repetition e ->
   201:       Repetition (do_remove_nested to_remove  e)
   202: 
   203: let remove_nested_as e = do_remove_nested StringSet.empty e
   204: 
   205: (*********************)
   206: (* Variable analysis *)
   207: (*********************)
   208: 
   209: (*
   210:   Optional variables.
   211:    A variable is optional when matching of regexp does not
   212:    implies it binds.
   213:      The typical case is:
   214:        ("" | 'a' as x) -> optional
   215:        ("" as x | 'a' as x) -> non-optional
   216: *)
   217: 
   218: let stringset_delta s1 s2 =
   219:   StringSet.union
   220:     (StringSet.diff s1 s2)
   221:     (StringSet.diff s2 s1)
   222: 
   223: let rec find_all_vars = function
   224:   | Characters _|Epsilon|Eof ->
   225:       StringSet.empty
   226:   | Bind (e,x) ->
   227:       StringSet.add x (find_all_vars e)
   228:   | Sequence (e1,e2)|Alternative (e1,e2) ->
   229:       StringSet.union (find_all_vars e1) (find_all_vars e2)
   230:   | Repetition e -> find_all_vars e
   231: 
   232: 
   233: let rec do_find_opt = function
   234:   | Characters _|Epsilon|Eof -> StringSet.empty, StringSet.empty
   235:   | Bind (e,x) ->
   236:       let opt,all = do_find_opt e in
   237:       opt, StringSet.add x all
   238:   | Sequence (e1,e2) ->
   239:       let opt1,all1 = do_find_opt e1
   240:       and opt2,all2 = do_find_opt e2 in
   241:       StringSet.union opt1 opt2, StringSet.union all1 all2
   242:   | Alternative (e1,e2) ->
   243:       let opt1,all1 = do_find_opt e1
   244:       and opt2,all2 = do_find_opt e2 in
   245:       StringSet.union
   246:         (StringSet.union opt1 opt2)
   247:         (stringset_delta all1 all2),
   248:       StringSet.union all1 all2
   249:   | Repetition e  ->
   250:       let r = find_all_vars e in
   251:       r,r
   252: 
   253: let find_optional e =
   254:   let r,_ = do_find_opt e in r
   255: 
   256: (*
   257:    Double variables
   258:    A variable is double when it can be bound more than once
   259:    in a single matching
   260:      The typical case is:
   261:        (e1 as x) (e2 as x)
   262: 
   263: *)
   264: 
   265: let rec do_find_double = function
   266:   | Characters _|Epsilon|Eof -> StringSet.empty, StringSet.empty
   267:   | Bind (e,x) ->
   268:       let dbl,all = do_find_double e in
   269:       (if StringSet.mem x all then
   270:         StringSet.add x dbl
   271:       else
   272:         dbl),
   273:       StringSet.add x all
   274:   | Sequence (e1,e2) ->
   275:       let dbl1, all1 = do_find_double e1
   276:       and dbl2, all2 = do_find_double e2 in
   277:       StringSet.union
   278:         (StringSet.inter all1 all2)
   279:         (StringSet.union dbl1 dbl2),
   280:       StringSet.union all1 all2
   281:   | Alternative (e1,e2) ->
   282:       let dbl1, all1 = do_find_double e1
   283:       and dbl2, all2 = do_find_double e2 in
   284:       StringSet.union dbl1 dbl2,
   285:       StringSet.union all1 all2
   286:   | Repetition e ->
   287:       let r = find_all_vars e in
   288:       r,r
   289: 
   290: let find_double e = do_find_double e
   291: 
   292: (*
   293:    Type of variables:
   294:     A variable is bound to a char when all its occurences
   295:     bind a pattern of length 1.
   296:      The typical case is:
   297:        (_ as x) -> char
   298: *)
   299: 
   300: let add_some x = function
   301:   | Some i -> Some (x+i)
   302:   | None   -> None
   303: 
   304: let add_some_some x y = match x,y with
   305: | Some i, Some j -> Some (i+j)
   306: | _,_            -> None
   307: 
   308: let rec do_find_chars sz = function
   309:   | Epsilon|Eof    -> StringSet.empty, StringSet.empty, sz
   310:   | Characters _ -> StringSet.empty, StringSet.empty, add_some 1 sz
   311:   | Bind (e,x)   ->
   312:       let c,s,e_sz = do_find_chars (Some 0) e in
   313:       begin match e_sz  with
   314:       | Some 1 ->
   315:           StringSet.add x c,s,add_some 1 sz
   316:       | _ ->
   317:           c, StringSet.add x s, add_some_some sz e_sz
   318:       end
   319:   | Sequence (e1,e2) ->
   320:       let c1,s1,sz1 = do_find_chars sz e1 in
   321:       let c2,s2,sz2 = do_find_chars sz1 e2 in
   322:       StringSet.union c1 c2,
   323:       StringSet.union s1 s2,
   324:       sz2
   325:   | Alternative (e1,e2) ->
   326:       let c1,s1,sz1 = do_find_chars sz e1
   327:       and c2,s2,sz2 = do_find_chars sz e2 in
   328:       StringSet.union c1 c2,
   329:       StringSet.union s1 s2,
   330:       (if sz1 = sz2 then sz1 else None)
   331:   | Repetition e -> do_find_chars None e
   332: 
   333: 
   334: 
   335: let find_chars e =
   336:   let c,s,_ = do_find_chars (Some 0) e in
   337:   StringSet.diff c s
   338: 
   339: (*******************************)
   340: (* From shallow to deep syntax *)
   341: (*******************************)
   342: 
   343: let chars = ref ([] : Inria_cset.t list)
   344: let chars_count = ref 0
   345: 
   346: 
   347: let rec encode_regexp char_vars act = function
   348:     Epsilon -> Empty
   349:   | Characters cl ->
   350:       let n = !chars_count in
   351:       chars := cl :: !chars;
   352:       incr chars_count;
   353:       print_endline ("Position " ^ string_of_int n ^ "->" ^ Inria_cset.string_of_characters cl);
   354:       Chars(n,false)
   355:   | Eof ->
   356:       let n = !chars_count in
   357:       chars := Inria_cset.eof :: !chars;
   358:       incr chars_count;
   359:       Chars(n,true)
   360:   | Sequence(r1,r2) ->
   361:       let r1 = encode_regexp char_vars act r1 in
   362:       let r2 = encode_regexp char_vars act r2 in
   363:       Seq (r1, r2)
   364:   | Alternative(r1,r2) ->
   365:       let r1 = encode_regexp char_vars act r1 in
   366:       let r2 = encode_regexp char_vars act r2 in
   367:       Alt(r1, r2)
   368:   | Repetition r ->
   369:       let r = encode_regexp char_vars act r in
   370:       Star r
   371:   | Bind (r,x) ->
   372:       let r = encode_regexp char_vars act r in
   373:       if StringSet.mem x char_vars then
   374:         Seq (Tag {id=x ; start=true ; action=act},r)
   375:       else
   376:         Seq (Tag {id=x ; start=true ; action=act},
   377:           Seq (r, Tag {id=x ; start=false ; action=act}))
   378: 
   379: 
   380: (* Optimisation,
   381:     Static optimization :
   382:       Replace tags by offsets relative to the beginning
   383:       or end of matched string.
   384:     Dynamic optimization:
   385:       Replace some non-optional, non-double tags by offsets w.r.t
   386:       a previous similar tag.
   387: *)
   388: 
   389: let incr_pos = function
   390:   | None   -> None
   391:   | Some i -> Some (i+1)
   392: 
   393: let decr_pos = function
   394:   | None -> None
   395:   | Some i -> Some (i-1)
   396: 
   397: 
   398: let opt = true
   399: 
   400: let mk_seq r1 r2 = match r1,r2  with
   401: | Empty,_ -> r2
   402: | _,Empty -> r1
   403: | _,_     -> Seq (r1,r2)
   404: 
   405: let add_pos p i = match p with
   406: | Some (Sum (a,n)) -> Some (Sum (a,n+i))
   407: | None -> None
   408: 
   409: let opt_regexp all_vars char_vars optional_vars double_vars r =
   410: 
   411: (* From removed tags to their addresses *)
   412:   let env = Hashtbl.create 17 in
   413: 
   414: (* First static optimizations, from start position *)
   415:   let rec size_forward pos = function
   416:     | Empty|Chars (_,true)|Tag _ -> Some pos
   417:     | Chars (_,false) -> Some (pos+1)
   418:     | Seq (r1,r2) ->
   419:         begin match size_forward pos r1 with
   420:         | None -> None
   421:         | Some pos  -> size_forward pos r2
   422:         end
   423:     | Alt (r1,r2) ->
   424:         let pos1 = size_forward pos r1
   425:         and pos2 = size_forward pos r2 in
   426:         if pos1=pos2 then pos1 else None
   427:     | Star _ -> None
   428:     | Action _ -> assert false in
   429: 
   430:   let rec simple_forward pos r = match r with
   431:     | Tag n ->
   432:         if StringSet.mem n.id double_vars then
   433:           r,Some pos
   434:         else begin
   435:           Hashtbl.add env (n.id,n.start) (Sum (Start, pos)) ;
   436:           Empty,Some pos
   437:         end
   438:     | Empty -> r, Some pos
   439:     | Chars (_,is_eof) ->
   440:         r,Some (if is_eof then  pos else pos+1)
   441:     | Seq (r1,r2) ->
   442:         let r1,pos = simple_forward pos r1 in
   443:         begin match pos with
   444:         | None -> mk_seq r1 r2,None
   445:         | Some pos ->
   446:             let r2,pos = simple_forward pos r2 in
   447:             mk_seq r1 r2,pos
   448:         end
   449:     | Alt (r1,r2) ->
   450:         let pos1 = size_forward pos r1
   451:         and pos2 = size_forward pos r2 in
   452:         r,(if pos1=pos2 then pos1 else None)
   453:     | Star _ -> r,None
   454:     | Action _ -> assert false in
   455: 
   456: (* Then static optimizations, from end position *)
   457:   let rec size_backward pos = function
   458:     | Empty|Chars (_,true)|Tag _ -> Some pos
   459:     | Chars (_,false) -> Some (pos-1)
   460:     | Seq (r1,r2) ->
   461:         begin match size_backward pos r2 with
   462:         | None -> None
   463:         | Some pos  -> size_backward pos r1
   464:         end
   465:     | Alt (r1,r2) ->
   466:         let pos1 = size_backward pos r1
   467:         and pos2 = size_backward pos r2 in
   468:         if pos1=pos2 then pos1 else None
   469:     | Star _ -> None
   470:     | Action _ -> assert false in
   471: 
   472: 
   473:   let rec simple_backward pos r = match r with
   474:     | Tag n ->
   475:         if StringSet.mem n.id double_vars then
   476:           r,Some pos
   477:         else begin
   478:           Hashtbl.add env (n.id,n.start) (Sum (End, pos)) ;
   479:           Empty,Some pos
   480:         end
   481:     | Empty -> r,Some pos
   482:     | Chars (_,is_eof) ->
   483:         r,Some (if is_eof then pos else pos-1)
   484:     | Seq (r1,r2) ->
   485:         let r2,pos = simple_backward pos r2 in
   486:         begin match pos with
   487:         | None -> mk_seq r1 r2,None
   488:         | Some pos ->
   489:             let r1,pos = simple_backward pos r1 in
   490:             mk_seq r1 r2,pos
   491:         end
   492:     | Alt (r1,r2) ->
   493:         let pos1 = size_backward pos r1
   494:         and pos2 = size_backward pos r2 in
   495:         r,(if pos1=pos2 then pos1 else None)
   496:     | Star _ -> r,None
   497:     | Action _ -> assert false in
   498: 
   499:   let r =
   500:     if opt then
   501:       let r,_ = simple_forward 0 r in
   502:       let r,_ = simple_backward 0 r in
   503:       r
   504:     else
   505:       r in
   506: 
   507:   let loc_count = ref 0 in
   508:   let get_tag_addr t =
   509:     try
   510:      Hashtbl.find env t
   511:     with
   512:     | Not_found ->
   513:         let n = !loc_count in
   514:         incr loc_count ;
   515:         Hashtbl.add env t (Sum (Mem n,0)) ;
   516:         Sum (Mem n,0) in
   517: 
   518:   let rec alloc_exp pos r = match r with
   519:     | Tag n ->
   520:         if StringSet.mem n.id double_vars then
   521:           r,pos
   522:         else begin match pos with
   523:         | Some a ->
   524:             Hashtbl.add env (n.id,n.start) a ;
   525:             Empty,pos
   526:         | None ->
   527:             let a = get_tag_addr (n.id,n.start) in
   528:             r,Some a
   529:         end
   530: 
   531:     | Empty -> r,pos
   532:     | Chars (_,is_eof) -> r,(if is_eof then pos else add_pos pos 1)
   533:     | Seq (r1,r2) ->
   534:         let r1,pos = alloc_exp pos r1 in
   535:         let r2,pos = alloc_exp pos r2 in
   536:         mk_seq r1 r2,pos
   537:     | Alt (_,_) ->
   538:         let off = size_forward 0 r in
   539:         begin match off with
   540:         | Some i -> r,add_pos pos i
   541:         | None -> r,None
   542:         end
   543:     | Star _ -> r,None
   544:     | Action _ -> assert false in
   545: 
   546:   let r,_ = alloc_exp None r in
   547:   let m =
   548:     StringSet.fold
   549:       (fun x r ->
   550:         let v =
   551:           if StringSet.mem x char_vars then
   552:             Ident_char
   553:               (StringSet.mem x optional_vars, get_tag_addr (x,true))
   554:           else
   555:             Ident_string
   556:               (StringSet.mem x optional_vars,
   557:                get_tag_addr (x,true),
   558:                get_tag_addr (x,false)) in
   559:         (x,v)::r)
   560:       all_vars [] in
   561:   m,r, !loc_count
   562: 
   563: 
   564: 
   565: let encode_casedef casedef =
   566:   let r =
   567:     List.fold_left
   568:       (fun (reg,actions,count,ntags) (expr, act) ->
   569:         let expr = remove_nested_as expr in
   570:         let char_vars = find_chars expr in
   571:         let r = encode_regexp char_vars count expr
   572:         and opt_vars = find_optional expr
   573:         and double_vars,all_vars = find_double expr in
   574:         let m,r,loc_ntags =
   575:           opt_regexp all_vars char_vars opt_vars double_vars r in
   576:         Alt(reg, Seq(r, Action count)),
   577:         (count, m ,act) :: actions,
   578:         (succ count),
   579:         max loc_ntags ntags)
   580:       (Empty, [], 0, 0)
   581:       casedef in
   582:   r
   583: 
   584: let encode_lexdef def =
   585:   chars := [];
   586:   chars_count := 0;
   587:   let entry_list =
   588:     List.map
   589:       (fun {name=entry_name ; args=args ; shortest=shortest ; clauses= casedef} ->
   590:         let (re,actions,_,ntags) = encode_casedef casedef in
   591:         { lex_name = entry_name;
   592:           lex_regexp = re;
   593:           lex_mem_tags = ntags ;
   594:           lex_actions = List.rev actions },args,shortest)
   595:       def in
   596:   let chr = Array.of_list (List.rev !chars) in
   597:   chars := [];
   598:   (chr, entry_list)
   599: 
   600: (* To generate directly a NFA from a regular expression.
   601:      Confer Aho-Sethi-Ullman, dragon book, chap. 3
   602:    Extension to tagged automata.
   603:      Confer
   604:        Ville Larikari
   605:       ``NFAs with Tagged Transitions, their Conversion to Deterministic
   606:         Automata and Application to Regular Expressions''.
   607:        Symposium on String Processing and Information Retrieval (SPIRE 2000),
   608:      http://kouli.iki.fi/~vlaurika/spire2000-tnfa.ps
   609: (See also)
   610:      http://kouli.iki.fi/~vlaurika/regex-submatch.ps.gz
   611: *)
   612: 
   613: type t_transition =
   614:     OnChars of int
   615:   | ToAction of int
   616: 
   617: type transition = t_transition * Tags.t
   618: 
   619: let compare_trans (t1,tags1) (t2,tags2) =
   620:   match Pervasives.compare  t1 t2 with
   621:   | 0 -> Tags.compare tags1 tags2
   622:   | r -> r
   623: 
   624: 
   625: module TransSet =
   626:   Set.Make(struct type t = transition let compare = compare end)
   627: 
   628: let rec nullable = function
   629:   | Empty|Tag _ -> true
   630:   | Chars (_,_)|Action _ -> false
   631:   | Seq(r1,r2) -> nullable r1 && nullable r2
   632:   | Alt(r1,r2) -> nullable r1 || nullable r2
   633:   | Star r     -> true
   634: 
   635: let rec emptymatch = function
   636:   | Empty | Chars (_,_) | Action _ -> Tags.empty
   637:   | Tag t       -> Tags.add t Tags.empty
   638:   | Seq (r1,r2) -> Tags.union (emptymatch r1) (emptymatch r2)
   639:   | Alt(r1,r2)  ->
   640:       if nullable r1 then
   641:         emptymatch r1
   642:       else
   643:         emptymatch r2
   644:   | Star r ->
   645:       if nullable r then
   646:         emptymatch r
   647:       else
   648:         Tags.empty
   649: 
   650: let addtags transs tags =
   651:   TransSet.fold
   652:     (fun (t,tags_t) r -> TransSet.add (t, Tags.union tags tags_t) r)
   653:     transs TransSet.empty
   654: 
   655: 
   656: let rec firstpos = function
   657:     Empty|Tag _ -> TransSet.empty
   658:   | Chars (pos,_) -> TransSet.add (OnChars pos,Tags.empty) TransSet.empty
   659:   | Action act -> TransSet.add (ToAction act,Tags.empty) TransSet.empty
   660:   | Seq(r1,r2) ->
   661:       if nullable r1 then
   662:         TransSet.union (firstpos r1) (addtags (firstpos r2) (emptymatch r1))
   663:       else
   664:         firstpos r1
   665:   | Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2)
   666:   | Star r     -> firstpos r
   667: 
   668: 
   669: (* Berry-sethi followpos *)
   670: let followpos size entry_list =
   671:   let v = Array.create size TransSet.empty in
   672:   let rec fill s = function
   673:     | Empty|Action _|Tag _ -> ()
   674:     | Chars (n,_) -> v.(n) <- s
   675:     | Alt (r1,r2) ->
   676:         fill s r1 ; fill s r2
   677:     | Seq (r1,r2) ->
   678:         fill
   679:           (if nullable r2 then
   680:             TransSet.union (firstpos r2) (addtags s (emptymatch r2))
   681:           else
   682:             (firstpos r2))
   683:           r1 ;
   684:         fill s r2
   685:     | Star r ->
   686:         fill (TransSet.union (firstpos r) s) r in
   687:   List.iter (fun (entry,_,_) -> fill TransSet.empty entry.lex_regexp) entry_list ;
   688:   v
   689: 
   690: (************************)
   691: (* The algorithm itself *)
   692: (************************)
   693: 
   694: let no_action = max_int
   695: 
   696: module StateSet =
   697:   Set.Make (struct type t = t_transition let compare = Pervasives.compare end)
   698: 
   699: 
   700: module MemMap =
   701:   Map.Make (struct type t = int let compare = Pervasives.compare end)
   702: 
   703: type 'a dfa_state =
   704:   {final : int * ('a * int TagMap.t) ;
   705:    others : ('a * int TagMap.t) MemMap.t}
   706: 
   707: (* *)
   708: let dtag oc t =
   709:   fprintf oc "%s<%s>" t.id (if t.start then "s" else "e")
   710: 
   711: let dmem_map dp ds m =
   712:   MemMap.iter
   713:     (fun k x ->
   714:       eprintf "%d -> " k ; dp x ; ds ())
   715:     m
   716: 
   717: and dtag_map dp ds m =
   718:   TagMap.iter
   719:     (fun t x ->
   720:       dtag stderr t ; eprintf " -> " ; dp x ; ds ())
   721:     m
   722: 
   723: let dstate {final=(act,(_,m)) ; others=o} =
   724:   if act <> no_action then begin
   725:     eprintf "final=%d " act ;
   726:     dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m ;
   727:     prerr_endline ""
   728:   end ;
   729:   dmem_map
   730:     (fun (_,m) ->
   731:       dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m)
   732:     (fun () -> prerr_endline "")
   733:     o
   734: (* *)
   735: 
   736: let dfa_state_empty =
   737:   {final=(no_action, (max_int,TagMap.empty)) ;
   738:    others=MemMap.empty}
   739: 
   740: and dfa_state_is_empty {final=(act,_) ; others=o} =
   741:   act = no_action &&
   742:   o = MemMap.empty
   743: 
   744: 
   745: (* A key is an abstraction on a dfa state,
   746:    two states with the same key can be made the same by
   747:    copying some memory cells into others *)
   748: 
   749: 
   750: module StateSetSet =
   751:   Set.Make (struct type t = StateSet.t let compare = StateSet.compare end)
   752: 
   753: type t_equiv = {tag:tag_info ; equiv:StateSetSet.t}
   754: 
   755: module MemKey =
   756:   Set.Make
   757:    (struct
   758:      type t = t_equiv
   759: 
   760:      let compare e1 e2 = match Pervasives.compare e1.tag e2.tag with
   761:      | 0 -> StateSetSet.compare e1.equiv e2.equiv
   762:      | r -> r
   763:    end)
   764: 
   765: type dfa_key = {kstate : StateSet.t ; kmem : MemKey.t}
   766: 
   767: (* Map a state to its key *)
   768: let env_to_class m =
   769:   let env1 =
   770:     MemMap.fold
   771:       (fun _ (tag,s) r ->
   772:         try
   773:           let ss = TagMap.find tag r in
   774:           let r = TagMap.remove tag r in
   775:           TagMap.add tag (StateSetSet.add s ss) r
   776:         with
   777:         | Not_found ->
   778:             TagMap.add tag (StateSetSet.add s StateSetSet.empty) r)
   779:       m TagMap.empty in
   780:   TagMap.fold
   781:     (fun tag ss r -> MemKey.add {tag=tag ; equiv=ss} r)
   782:     env1 MemKey.empty
   783: 
   784: 
   785: (* trans is nfa_state, m is associated memory map *)
   786: let inverse_mem_map trans m r =
   787:   TagMap.fold
   788:     (fun tag addr r ->
   789:       try
   790:         let otag,s = MemMap.find addr r in
   791:         assert (tag = otag) ;
   792:         let r = MemMap.remove addr r in
   793:         MemMap.add addr (tag,StateSet.add trans s) r
   794:       with
   795:       | Not_found ->
   796:           MemMap.add addr (tag,StateSet.add trans StateSet.empty) r)
   797:     m r
   798: 
   799: let inverse_mem_map_other n (_,m) r = inverse_mem_map (OnChars n) m r
   800: 
   801: let get_key {final=(act,(_,m_act)) ; others=o} =
   802:   let env =
   803:     MemMap.fold inverse_mem_map_other
   804:       o
   805:       (if act = no_action then MemMap.empty
   806:       else inverse_mem_map (ToAction act) m_act MemMap.empty) in
   807:   let state_key =
   808:     MemMap.fold (fun n _ r -> StateSet.add (OnChars n) r) o
   809:       (if act=no_action then StateSet.empty
   810:       else StateSet.add (ToAction act) StateSet.empty) in
   811:   let mem_key = env_to_class  env in
   812:   {kstate = state_key ; kmem = mem_key}
   813: 
   814: 
   815: let key_compare k1 k2 = match StateSet.compare k1.kstate k2.kstate with
   816: | 0 -> MemKey.compare k1.kmem k2.kmem
   817: | r -> r
   818: 
   819: (* Association dfa_state -> state_num *)
   820: 
   821: module StateMap =
   822:   Map.Make(struct type t = dfa_key let compare = key_compare end)
   823: 
   824: let state_map = ref (StateMap.empty : int StateMap.t)
   825: let todo = Stack.create()
   826: let next_state_num = ref 0
   827: let next_mem_cell = ref 0
   828: let temp_pending = ref false
   829: let tag_cells = Hashtbl.create 17
   830: let state_table = Inria_table.create dfa_state_empty
   831: 
   832: 
   833: let reset_state_mem () =
   834:   state_map := StateMap.empty;
   835:   Stack.clear todo;
   836:   next_state_num := 0 ;
   837:   let _ = Inria_table.trim state_table in
   838:   ()
   839: 
   840: (* Allocation of memory cells *)
   841: let reset_cell_mem ntags =
   842:   next_mem_cell := ntags ;
   843:   Hashtbl.clear tag_cells ;
   844:   temp_pending := false
   845: 
   846: let do_alloc_temp () =
   847:   temp_pending := true ;
   848:   let n = !next_mem_cell in
   849:   n
   850: 
   851: let do_alloc_cell used t =
   852:   let available =
   853:     try Hashtbl.find tag_cells t with Not_found -> Ints.empty in
   854:   try
   855:     Ints.choose (Ints.diff available used)
   856:   with
   857:   | Not_found ->
   858:       temp_pending := false ;
   859:       let n = !next_mem_cell in
   860:       if n >= 255 then raise Memory_overflow ;
   861:       Hashtbl.replace tag_cells t (Ints.add n available) ;
   862:       incr next_mem_cell ;
   863:       n
   864: 
   865: let is_old_addr a = a >= 0
   866: and is_new_addr a = a < 0
   867: 
   868: let old_in_map m r =
   869:   TagMap.fold
   870:     (fun _ addr r ->
   871:       if is_old_addr addr then
   872:         Ints.add addr r
   873:       else
   874:         r)
   875:     m r
   876: 
   877: let alloc_map used m mvs =
   878:   TagMap.fold
   879:     (fun tag a (r,mvs) ->
   880:       let a,mvs =
   881:         if is_new_addr a then
   882:           let a = do_alloc_cell used tag in
   883:           a,Ints.add a mvs
   884:         else a,mvs in
   885:       TagMap.add tag a r,mvs)
   886:     m (TagMap.empty,mvs)
   887: 
   888: let create_new_state {final=(act,(_,m_act)) ; others=o} =
   889:   let used =
   890:     MemMap.fold (fun _ (_,m) r -> old_in_map m r)
   891:       o (old_in_map m_act Ints.empty) in
   892: 
   893:   let new_m_act,mvs  = alloc_map used m_act Ints.empty in
   894:   let new_o,mvs =
   895:     MemMap.fold (fun k (x,m) (r,mvs) ->
   896:       let m,mvs = alloc_map used m mvs in
   897:       MemMap.add k (x,m) r,mvs)
   898:       o (MemMap.empty,mvs) in
   899:   {final=(act,(0,new_m_act)) ; others=new_o},
   900:   Ints.fold (fun x r -> Set x::r) mvs []
   901: 
   902: type new_addr_gen = {mutable count : int ; mutable env : int TagMap.t}
   903: 
   904: let create_new_addr_gen () = {count = -1 ; env = TagMap.empty}
   905: 
   906: let alloc_new_addr tag r =
   907:   try
   908:     TagMap.find tag r.env
   909:   with
   910:   | Not_found ->
   911:       let a = r.count in
   912:       r.count <- a-1 ;
   913:       r.env <- TagMap.add tag a r.env ;
   914:       a
   915: 
   916: 
   917: let create_mem_map tags gen =
   918:   Tags.fold
   919:     (fun tag r -> TagMap.add tag (alloc_new_addr tag gen) r)
   920:     tags TagMap.empty
   921: 
   922: let create_init_state pos =
   923:   let gen = create_new_addr_gen () in
   924:   let st =
   925:     TransSet.fold
   926:       (fun (t,tags) st ->
   927:         match t with
   928:         | ToAction n ->
   929:             let on,otags = st.final in
   930:             if n < on then
   931:               {st with final = (n, (0,create_mem_map tags gen))}
   932:             else
   933:               st
   934:         | OnChars n ->
   935:             try
   936:               let _ = MemMap.find n st.others in assert false
   937:             with
   938:             | Not_found ->
   939:                 {st with others =
   940:                   MemMap.add n (0,create_mem_map tags gen) st.others})
   941:       pos dfa_state_empty in
   942:   st
   943: 
   944: 
   945: let get_map t st = match t with
   946: | ToAction _ -> let _,(_,m) = st.final in m
   947: | OnChars n  ->
   948:     let (_,m) = MemMap.find n st.others in
   949:     m
   950: 
   951: let dest = function | Copy (d,_) | Set d  -> d
   952: and orig = function | Copy (_,o) -> o | Set _ -> -1
   953: 
   954: let pmv oc mv = fprintf oc "%d <- %d" (dest mv) (orig mv)
   955: let pmvs oc mvs =
   956:   List.iter (fun mv -> fprintf oc "%a " pmv  mv) mvs ;
   957:   output_char oc '\n' ; flush oc
   958: 
   959: 
   960: (* Topological sort << a la louche >> *)
   961: let sort_mvs mvs =
   962:   let rec do_rec r mvs = match mvs with
   963:   | [] -> r
   964:   | _  ->
   965:       let dests =
   966:         List.fold_left
   967:           (fun r mv -> Ints.add (dest mv) r)
   968:           Ints.empty mvs in
   969:       let rem,here =
   970:         List.partition
   971:           (fun mv -> Ints.mem (orig mv) dests)
   972:           mvs in
   973:       match here with
   974:       | [] ->
   975:           begin match rem with
   976:           | Copy (d,_)::_ ->
   977:               let d' = do_alloc_temp () in
   978:               Copy (d',d)::
   979:               do_rec r
   980:                 (List.map
   981:                    (fun mv ->
   982:                      if orig mv = d then
   983:                        Copy (dest mv,d')
   984:                      else
   985:                        mv)
   986:                    rem)
   987:           | _ -> assert false
   988:           end
   989:       | _  -> do_rec (here@r) rem  in
   990:   do_rec [] mvs
   991: 
   992: let move_to mem_key src tgt =
   993:   let mvs =
   994:     MemKey.fold
   995:       (fun {tag=tag ; equiv=m} r ->
   996:         StateSetSet.fold
   997:           (fun s r ->
   998:             try
   999:               let t = StateSet.choose s  in
  1000:               let src = TagMap.find tag (get_map t src)
  1001:               and tgt = TagMap.find tag (get_map t tgt) in
  1002:               if src <> tgt then begin
  1003:                 if is_new_addr src then
  1004:                   Set tgt::r
  1005:                 else
  1006:                   Copy (tgt, src)::r
  1007:               end else
  1008:                 r
  1009:             with
  1010:             | Not_found -> assert false)
  1011:           m r)
  1012:       mem_key [] in
  1013: (* Moves are topologically sorted *)
  1014:   sort_mvs mvs
  1015: 
  1016: 
  1017: let get_state st =
  1018:   let key = get_key st in
  1019:   try
  1020:     let num = StateMap.find key !state_map in
  1021:     num,move_to key.kmem st (Inria_table.get state_table num)
  1022:   with Not_found ->
  1023:     let num = !next_state_num in
  1024:     incr next_state_num;
  1025:     let st,mvs = create_new_state st in
  1026:     Inria_table.emit state_table st ;
  1027:     state_map := StateMap.add key num !state_map;
  1028:     Stack.push (st, num) todo;
  1029:     num,mvs
  1030: 
  1031: let map_on_all_states f old_res =
  1032:   let res = ref old_res in
  1033:   begin try
  1034:     while true do
  1035:       let (st, i) = Stack.pop todo in
  1036:       let r = f st in
  1037:       res := (r, i) :: !res
  1038:     done
  1039:   with Stack.Empty -> ()
  1040:   end;
  1041:   !res
  1042: 
  1043: let goto_state st =
  1044:   if
  1045:     dfa_state_is_empty st
  1046:   then
  1047:     Backtrack,[]
  1048:   else
  1049:     let n,moves = get_state st in
  1050:     Goto n,moves
  1051: 
  1052: (****************************)
  1053: (* compute reachable states *)
  1054: (****************************)
  1055: 
  1056: let add_tags_to_map gen tags m =
  1057:   Tags.fold
  1058:     (fun tag m ->
  1059:       let m = TagMap.remove tag m in
  1060:       TagMap.add tag (alloc_new_addr tag gen) m)
  1061:     tags m
  1062: 
  1063: let apply_transition gen r pri m = function
  1064:   | ToAction n,tags ->
  1065:       let on,(opri,_) = r.final in
  1066:       if n < on || (on=n && pri < opri) then
  1067:         let m = add_tags_to_map gen tags m in
  1068:         {r with final=n,(pri,m)}
  1069:       else r
  1070:   |  OnChars n,tags ->
  1071:       try
  1072:         let (opri,_) = MemMap.find n r.others in
  1073:         if pri < opri then
  1074:           let m = add_tags_to_map gen tags m in
  1075:           {r with others=MemMap.add n (pri,m) (MemMap.remove n r.others)}
  1076:         else
  1077:           r
  1078:       with
  1079:       | Not_found ->
  1080:           let m = add_tags_to_map gen tags m in
  1081:           {r with others=MemMap.add n (pri,m) r.others}
  1082: 
  1083: (* add transitions ts to new state r
  1084:    transitions in ts start from state pri and memory map m
  1085: *)
  1086: let apply_transitions gen r pri m ts =
  1087:   TransSet.fold
  1088:     (fun t r -> apply_transition gen r pri m t)
  1089:     ts r
  1090: 
  1091: 
  1092: (* For a given nfa_state pos, refine char partition *)
  1093: let rec split_env gen follow pos m s = function
  1094:   | [] -> (* Can occur ! because of non-matching regexp ([^'\000'-'\255']) *)
  1095:       []
  1096:   | (s1,st1) as p::rem ->
  1097:       let here = Inria_cset.inter s s1 in
  1098:       if Inria_cset.is_empty here then
  1099:         p::split_env gen follow pos m s rem
  1100:       else
  1101:         let rest = Inria_cset.diff s here in
  1102:         let rem =
  1103:           if Inria_cset.is_empty rest then
  1104:             rem
  1105:           else
  1106:             split_env gen follow pos m rest rem
  1107:         and new_st = apply_transitions gen st1 pos m follow in
  1108:         let stay = Inria_cset.diff s1 here in
  1109:         if Inria_cset.is_empty stay then
  1110:           (here, new_st)::rem
  1111:         else
  1112:           (stay, st1)::(here, new_st)::rem
  1113: 
  1114: 
  1115: (* For all nfa_state pos in a dfa state st *)
  1116: let comp_shift gen chars follow st =
  1117:   MemMap.fold
  1118:     (fun pos (_,m) env -> split_env gen follow.(pos) pos m chars.(pos) env)
  1119:     st [Inria_cset.all_chars_eof,dfa_state_empty]
  1120: 
  1121: 
  1122: let reachs chars follow st =
  1123:   let gen = create_new_addr_gen () in
  1124: (* build a association list (char set -> new state) *)
  1125:   let env = comp_shift gen chars follow st in
  1126: (* change it into (char set -> new state_num) *)
  1127:   let env =
  1128:     List.map
  1129:       (fun (s,dfa_state) -> s,goto_state dfa_state) env in
  1130: (* finally build the char indexed array -> new state num *)
  1131:   let shift = Inria_cset.env_to_array env in
  1132:   shift
  1133: 
  1134: 
  1135: let get_tag_mem n env t =
  1136:   try
  1137:     TagMap.find t env.(n)
  1138:   with
  1139:   | Not_found -> assert false
  1140: 
  1141: let do_tag_actions n env  m =
  1142: 
  1143:   let used,r =
  1144:     TagMap.fold (fun t m (used,r) ->
  1145:       let a = get_tag_mem n env t in
  1146:       Ints.add a used,SetTag (a,m)::r) m (Ints.empty,[]) in
  1147:   let _,r =
  1148:     TagMap.fold
  1149:       (fun tag m (used,r) ->
  1150:         if not (Ints.mem m used) && tag.start then
  1151:           Ints.add m used, EraseTag m::r
  1152:         else
  1153:           used,r)
  1154:       env.(n) (used,r) in
  1155:   r
  1156: 
  1157: 
  1158: let translate_state shortest_match tags chars follow st =
  1159:   let (n,(_,m)) = st.final in
  1160:   if MemMap.empty = st.others then
  1161:     Perform (n,do_tag_actions n tags m)
  1162:   else if shortest_match then begin
  1163:     if n=no_action then
  1164:       Shift (No_remember,reachs chars follow st.others)
  1165:     else
  1166:       Perform(n, do_tag_actions n tags m)
  1167:   end else begin
  1168:     Shift (
  1169:     (if n = no_action then
  1170:       No_remember
  1171:     else
  1172:       Remember (n,do_tag_actions n tags m)),
  1173:     reachs chars follow st.others)
  1174:   end
  1175: 
  1176: (* *)
  1177: let dtags chan tags =
  1178:   Tags.iter
  1179:     (fun t -> fprintf chan " %a" dtag t)
  1180:     tags
  1181: 
  1182: let dtransset s =
  1183:   TransSet.iter
  1184:     (fun trans -> match trans with
  1185:     | OnChars i,tags ->
  1186:         eprintf " (-> %d,%a)" i dtags tags
  1187:     | ToAction i,tags ->
  1188:         eprintf " ([%d],%a)" i dtags tags)
  1189:     s
  1190: 
  1191: let dfollow t =
  1192:   eprintf "follow=[" ;
  1193:   for i = 0 to Array.length t-1 do
  1194:     eprintf "\n%d:" i ;
  1195:     dtransset t.(i)
  1196:   done ;
  1197:   prerr_endline "]"
  1198: (* *)
  1199: 
  1200: let make_tag_entry id start act a r = match a with
  1201:   | Sum (Mem m,0) ->
  1202:       TagMap.add {id=id ; start=start ; action=act} m r
  1203:   | _ -> r
  1204: 
  1205: let extract_tags l =
  1206:   let envs = Array.create (List.length l) TagMap.empty in
  1207:   List.iter
  1208:     (fun (act,m,_) ->
  1209:       envs.(act) <-
  1210:          List.fold_right
  1211:            (fun (x,v) r -> match v with
  1212:            | Ident_char (_,t) -> make_tag_entry x true act t r
  1213:            | Ident_string (_,t1,t2) ->
  1214:                make_tag_entry x true act t1
  1215:                (make_tag_entry x false act t2 r))
  1216:            m TagMap.empty)
  1217:     l ;
  1218:   envs
  1219: 
  1220: 
  1221: let make_dfa lexdef =
  1222:   let (chars, entry_list) = encode_lexdef lexdef in
  1223:   let follow = followpos (Array.length chars) entry_list in
  1224: (* *)
  1225:   dfollow follow ;
  1226: (* *)
  1227:   reset_state_mem () ;
  1228:   let r_states = ref [] in
  1229:   let initial_states =
  1230:     List.map
  1231:       (fun (le,args,shortest) ->
  1232:         let tags = extract_tags le.lex_actions in
  1233:         reset_cell_mem le.lex_mem_tags ;
  1234:         let pos_set = firstpos le.lex_regexp in
  1235: (* *)
  1236:         prerr_string "trans={" ; dtransset pos_set ; prerr_endline "}" ;
  1237: (* *)
  1238:         let init_state = create_init_state pos_set in
  1239:         let init_num = get_state init_state in
  1240:         r_states :=
  1241:            map_on_all_states
  1242:              (translate_state shortest tags chars follow) !r_states ;
  1243:         { auto_name = le.lex_name;
  1244:           auto_args = args ;
  1245:           auto_mem_size =
  1246:             (if !temp_pending then !next_mem_cell+1 else !next_mem_cell) ;
  1247:           auto_initial_state = init_num ;
  1248:           auto_actions = le.lex_actions })
  1249:       entry_list in
  1250:   let states = !r_states in
  1251: (* *)
  1252:   prerr_endline "** states **" ;
  1253:   for i = 0 to !next_state_num-1 do
  1254:     eprintf "+++ %d +++\n" i ;
  1255:     dstate (Inria_table.get state_table i) ;
  1256:     prerr_endline ""
  1257:   done ;
  1258:   eprintf "%d states\n" !next_state_num ;
  1259: (* *)
  1260:   let actions = Array.create !next_state_num (Perform (0,[])) in
  1261:   List.iter (fun (act, i) -> actions.(i) <- act ) states;
  1262: 
  1263:   print_endline "Actions ..";
  1264:   for i = 0 to !next_state_num - 1 do
  1265:     let act = actions.(i) in
  1266:     print_endline
  1267:     ("State " ^ string_of_int i ^ " has action " ^ string_of_automata act)
  1268:   done;
  1269:   reset_state_mem () ;
  1270:   reset_cell_mem  0 ;
  1271: 
  1272:   print_endline "Initial states ..";
  1273:   List.iter
  1274:   (fun ae ->
  1275:     print_endline "--- automata entry -- ";
  1276:     print_endline (string_of_automata_entry ae);
  1277:   )
  1278:   initial_states
  1279:   ;
  1280:   (initial_states, actions)
End ocaml section to src/inria_lexgen.ml[1]