5.62. DFA

Start ocaml section to src/flx_dfa.mli[1 /1 ]
     1: # 3 "./lpsrc/flx_regexp.ipk"
     2: open Flx_ast
     3: open Flx_mtypes2
     4: 
     5: val process_regexp:
     6:   regexp_t ->
     7:   CharSet.t * (* alphabet*)
     8:   int                                * (* state count *)
     9:   (int, expr_t) Hashtbl.t            * (* term_codes *)
    10:   (int * int, int) Hashtbl.t             (* transition matrix *)
    11: 
End ocaml section to src/flx_dfa.mli[1]
Start ocaml section to src/flx_dfa.ml[1 /1 ]
     1: # 14 "./lpsrc/flx_regexp.ipk"
     2: (* build a DFA from a regular expression
     3: *)
     4: open Flx_ast
     5: open Flx_print
     6: open Flx_mtypes2
     7: 
     8: let hashtbl_length h =
     9:   let n = ref 0 in
    10:   Hashtbl.iter (fun _ _ -> incr n) h;
    11:   !n
    12: 
    13: let augment re = `REGEXP_seq (re,`REGEXP_sentinel)
    14: 
    15: type annotation_t =
    16:  {
    17:    nullable: bool;
    18:    firstpos: PosSet.t;
    19:    lastpos: PosSet.t
    20:  }
    21: 
    22: let fp_get followpos i =
    23:   try Hashtbl.find followpos i
    24:   with Not_found -> PosSet.empty
    25: 
    26: let fp_add followpos i j =
    27:   Hashtbl.replace followpos i (PosSet.add j (fp_get followpos i))
    28: 
    29: let fp_union followpos i x =
    30:   Hashtbl.replace followpos i (PosSet.union (fp_get followpos i) x)
    31: 
    32: let rec annotate counter followpos posmap codemap re =
    33:   let a r = annotate counter followpos posmap codemap r in
    34:   match re with
    35:   | `REGEXP_group _ -> failwith "Can't handle groups yet"
    36:   | `REGEXP_name _ -> failwith "Unbound regular expresion name"
    37:   | `REGEXP_seq (r1,r2) ->
    38:     let a1 = a r1 and a2 = a r2 in
    39:     let au =
    40:     {
    41:       nullable = a1.nullable && a2.nullable;
    42:       firstpos =
    43:         if a1.nullable
    44:         then PosSet.union a1.firstpos a2.firstpos
    45:         else a1.firstpos;
    46:       lastpos =
    47:         if a2.nullable
    48:         then PosSet.union a1.lastpos a2.lastpos
    49:         else a2.lastpos;
    50:     }
    51:     in
    52:       PosSet.iter
    53:       (fun i -> fp_union followpos i a2.firstpos)
    54:       a1.lastpos
    55:       ;
    56:       au
    57: 
    58:   | `REGEXP_alt (r1,r2) ->
    59:     let a1 = a r1 and a2 = a r2 in
    60:     {
    61:       nullable = a1.nullable || a2.nullable;
    62:       firstpos =
    63:         PosSet.union a1.firstpos a2.firstpos;
    64:       lastpos =
    65:         PosSet.union a1.lastpos a2.lastpos
    66:     }
    67: 
    68:   | `REGEXP_aster r1 ->
    69:     let a1 = a r1 in
    70:     let au =
    71:     {
    72:       nullable = true;
    73:       firstpos = a1.firstpos;
    74:       lastpos = a1.lastpos
    75:     }
    76:     in
    77:       PosSet.iter
    78:       (fun i -> fp_union followpos i a1.firstpos)
    79:       a1.lastpos
    80:       ;
    81:       au
    82: 
    83:   | `REGEXP_string s ->
    84:     let n = String.length s in
    85:     if n = 0
    86:     then (a `REGEXP_epsilon)
    87:     else
    88:       begin
    89:         let start = !counter in
    90:         counter := start + n;
    91:         let last = !counter - 1 in
    92:         let au =
    93:         {
    94:           nullable = false;
    95:           firstpos = PosSet.singleton start;
    96:           lastpos = PosSet.singleton last
    97:         }
    98:         in
    99:           for i = start to last-1 do
   100:             fp_add followpos i (i+1);
   101:             Hashtbl.add posmap i (Char.code s.[i-start])
   102:           done
   103:           ;
   104:           Hashtbl.add posmap last (Char.code s.[last-start])
   105:           ;
   106:           au
   107:       end
   108: 
   109:   | `REGEXP_epsilon ->
   110:     {
   111:       nullable = true;
   112:       firstpos = PosSet.empty;
   113:       lastpos = PosSet.empty
   114:     }
   115: 
   116:   | `REGEXP_code s ->
   117:     Hashtbl.add codemap !counter s;
   118:     let u =
   119:     {
   120:       nullable = false;
   121:       firstpos = PosSet.singleton !counter;
   122:       lastpos = PosSet.singleton !counter
   123:     }
   124:     in
   125:       incr counter;
   126:       u
   127: 
   128:   | `REGEXP_sentinel ->
   129:     let u =
   130:     {
   131:       nullable = false;
   132:       firstpos = PosSet.singleton !counter;
   133:       lastpos = PosSet.singleton !counter;
   134:     }
   135:     in
   136:       Hashtbl.add followpos !counter PosSet.empty;
   137:       u
   138: 
   139: let list_of_set x =
   140:   let lst = ref [] in
   141:   PosSet.iter
   142:   (fun i -> lst := i :: !lst)
   143:   x
   144:   ;
   145:   !lst
   146: 
   147: let string_of_set x =
   148:   "{" ^
   149:   String.concat ", " (List.map string_of_int (list_of_set x)) ^
   150:   "}"
   151: 
   152: let print_followpos followpos =
   153:   Hashtbl.iter
   154:   (fun i fp ->
   155:     print_endline (
   156:       (string_of_int i) ^
   157:       " -> " ^
   158:       string_of_set fp
   159:     )
   160:   )
   161:   followpos
   162: 
   163: let print_int_set s =
   164:   print_string "{";
   165:   PosSet.iter
   166:   (fun i -> print_string (string_of_int i ^ ", "))
   167:   s
   168:   ;
   169:   print_string "}"
   170: 
   171: exception Found of int
   172: ;;
   173: 
   174: let process_regexp re =
   175:     (*
   176:     print_endline ("  | " ^ Lex_print.string_of_re re);
   177:     *)
   178:     let are = augment re in
   179:     let followpos = Hashtbl.create 97 in
   180:     let codemap = Hashtbl.create 97 in
   181:     let posmap = Hashtbl.create 97 in
   182:     let counter = ref 1 in
   183:     let root = annotate counter followpos posmap codemap are in
   184:     let posarray = Array.make !counter 0 in
   185:     let alphabet = ref CharSet.empty in
   186:     Hashtbl.iter
   187:     (fun i c ->
   188:       posarray.(i-1) <- c;
   189:       alphabet := CharSet.add c !alphabet
   190:     )
   191:     posmap;
   192:     (*
   193:     print_endline "Followpos:";
   194:     print_followpos followpos;
   195:     print_endline ("Charpos '" ^ posarray ^ "'");
   196:     print_endline ("Codepos: ");
   197:     Hashtbl.iter
   198:     (fun i c ->
   199:       print_endline ((string_of_int i) ^ " -> " ^ c)
   200:     )
   201:     codemap
   202:     ;
   203:     print_string "alphabet '";
   204:     CharSet.iter
   205:     (fun c -> print_char c)
   206:     !alphabet;
   207:     print_endline "'";
   208:     *)
   209:     let marked_dstates = ref PosSetSet.empty in
   210:     let unmarked_dstates = ref (PosSetSet.singleton root.firstpos) in
   211:     let find_char c t =
   212:       try
   213:         PosSet.iter
   214:         (fun i -> if posarray.(i-1) = c then raise (Found i))
   215:         t
   216:         ;
   217:         print_endline ("Can't find char '" ^ String.make 1 (Char.chr c) ^ "'")
   218:         ;
   219:         raise Not_found
   220:       with Found p -> p
   221:     in
   222:     let state_counter = ref 1 in
   223:     let state_map = Hashtbl.create 97 in
   224:     let inv_state_map = Hashtbl.create 97 in
   225: 
   226:     let dtran = Hashtbl.create 97 in
   227:     Hashtbl.add state_map 0 root.firstpos;
   228:     Hashtbl.add inv_state_map root.firstpos 0;
   229:     (*
   230:     print_endline "Root is";
   231:     print_int_set root.firstpos;
   232:     print_endline "";
   233:     *)
   234:     while not (PosSetSet.is_empty !unmarked_dstates) do
   235:       let t = PosSetSet.choose !unmarked_dstates in
   236:       unmarked_dstates := PosSetSet.remove t !unmarked_dstates;
   237:       marked_dstates := PosSetSet.add t !marked_dstates;
   238:       let src_state_index =
   239:         try
   240:           let state_index = Hashtbl.find inv_state_map t in
   241:           (*
   242:           print_endline ("src_state = " ^ string_of_int state_index);
   243:           *)
   244:           state_index
   245:         with Not_found ->
   246:           print_endline "Can't find "; print_int_set t;
   247:           print_endline "";
   248:           raise Not_found
   249:       in
   250: 
   251:       CharSet.iter
   252:       (fun c ->
   253:         let u = ref (PosSet.empty) in
   254:         PosSet.iter
   255:         (fun i ->
   256:           if posarray.(i-1) = c
   257:           then begin
   258:             u := PosSet.union !u (try Hashtbl.find followpos i with
   259:             Not_found -> failwith ("Can't find followpos of index " ^ string_of_int i))
   260:           end
   261:         )
   262:         t
   263:         ;
   264:         if not (PosSet.is_empty !u)
   265:         then
   266:           let dst_state_index =
   267:             if not (PosSetSet.mem !u !marked_dstates)
   268:             && not (PosSetSet.mem !u !unmarked_dstates)
   269:             then begin
   270:               let state_index = !state_counter in
   271:               incr state_counter
   272:               ;
   273:               (*
   274:               print_string ("Adding new state " ^ string_of_int state_index ^ " = ");
   275:               print_int_set !u;
   276:               print_endline "";
   277:               *)
   278:               Hashtbl.add state_map state_index !u;
   279:               Hashtbl.add inv_state_map !u state_index;
   280:               let n1 = PosSetSet.cardinal !unmarked_dstates in
   281:               unmarked_dstates := PosSetSet.add !u !unmarked_dstates;
   282:               assert(n1 <> PosSetSet.cardinal !unmarked_dstates);
   283:               state_index
   284:             end
   285:             else
   286:               try Hashtbl.find inv_state_map !u with Not_found -> failwith "ERROR 2"
   287:           in
   288:           Hashtbl.add dtran (c,src_state_index) dst_state_index
   289:       )
   290:       !alphabet
   291:     done;
   292:     (*
   293:     print_endline "states:";
   294:     PosSetSet.iter
   295:     (fun s -> print_int_set s; print_endline "")
   296:     !marked_dstates
   297:     ;
   298:     print_endline "";
   299: 
   300:     print_endline "states:";
   301:     Hashtbl.iter
   302:     (fun idx state ->
   303:       print_string (string_of_int idx ^ " -> ");
   304:       print_int_set state;
   305:       print_endline ""
   306:     )
   307:     state_map
   308:     ;
   309:     *)
   310: 
   311:     let term_codes = Hashtbl.create 97 in
   312:     Hashtbl.iter
   313:     (fun idx state ->
   314:       try
   315:         PosSet.iter
   316:         (fun i ->
   317:           if Hashtbl.mem codemap i
   318:           then raise (Found i)
   319:         )
   320:         state
   321:         ;
   322:         raise Not_found
   323:       with
   324:       | Found i ->
   325:         let code = Hashtbl.find codemap i in
   326:         Hashtbl.add term_codes idx code
   327:       |  Not_found -> ()
   328:     )
   329:     state_map
   330:     ;
   331: 
   332:     !alphabet,!state_counter, term_codes, dtran
   333: 
   334: 
End ocaml section to src/flx_dfa.ml[1]