5.25. Tokeniser

Start ocaml section to src/flx_tok.ml[1/1]
# 179 "./lpsrc/flx_tokeniser.ipk"
open Flx_ast
open Flx_exceptions
open List
open Flx_srcref
open Flx_parse

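(* Debug printers.  print_pre_tokens dumps the raw pre-token stream
   produced by the lexer; print_tokens pretty-prints a token stream
   with output line numbers and brace-driven indentation. *)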
let print_pre_token t =
  let emit t = print_string (Flx_prelex.string_of_token t) in
    begin match t with
    | COMMENT_NEWLINE s ->
      print_endline ("//" ^ s)

    | NEWLINE ->
      print_endline ""

    | ENDMARKER -> print_endline "<<EOF>>"
    | _ -> emit t
    end;
    flush stdout

let print_pre_tokens ts =
  if (length ts) = 0
  then print_string "<Empty pretoken list>";
  print_string "   1: ";
  iter print_pre_token ts

let print_tokens ts =
  let lineno = ref 0 in
  let indent = ref 0 in
  let emit t =
    print_string ((Flx_prelex.string_of_token t) ^ " ")
  and emit_eol t =
    print_endline t;
    incr lineno; (* count output lines so the margin reads 1, 2, 3, ... *)
    let s' = "    " ^ (string_of_int !lineno) in
    let n = String.length s' in
    print_string ((String.sub s' (n-4) 4) ^ ": ");
    for i = 0 to !indent - 1 do print_string "  " done
  in
  let print_token t =
    begin match t with
    | NEWLINE ->
      emit_eol "//"
    | LBRACE _ ->
      incr indent;
      emit_eol "  {"
    | RBRACE _ ->
      decr indent;
      emit_eol "}"
    | ENDMARKER -> emit_eol "#<<EOF>>"
    | _ -> emit t
    end;
    flush stdout
  in
    iter print_token ts
;;

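(* The tokeniser class adapts a pre-lexed token list to the
   (Lexing.lexbuf -> token) interface expected by ocamlyacc-generated
   parsers; the lexbuf argument is a dummy.  tokens holds the
   remaining input, while tokens_copy keeps the whole list so that
   get_loc and report_syntax_error can index it by
   current_token_index. *)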
class tokeniser t =
object(self)
  val mutable tokens = []
  val mutable tokens_copy = []
  val mutable current_token_index = 0
  initializer tokens <- t; tokens_copy <- t

  method token_peek (dummy : Lexing.lexbuf) =
    hd tokens

  method token_src (dummy : Lexing.lexbuf) =
    let tmp = hd tokens in
    tokens <- tl tokens;
    current_token_index <- current_token_index + 1;
    match tmp with
    | USER_STATEMENT_KEYWORD (sr,s,tkss,nonterminals) ->
      (*
      print_endline ("TRANSLATING USER STATEMENT KEYWORD " ^ s);
      *)
      let f = fun () -> self#parse_user_statement s (slift sr) tkss nonterminals in
      USER_STATEMENT_DRIVER (sr,s,f)
    | _ -> tmp

  method put_back (x:token) =
    tokens <- x :: tokens;
    current_token_index <- current_token_index - 1

  method get_loc =
    let token = nth tokens_copy current_token_index in
    slift (Flx_prelex.src_of_token token)

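  (* On failure, print a window of up to 20 tokens on either side of
     the failure point, together with the source location of the
     offending token (or of the previous token when the recorded line
     number is null). *)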
  method report_syntax_error =
    print_endline "";
    print_endline "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!";
    let n = length tokens_copy in
    let first = max 0 (current_token_index - 20)
    and last = min (n-1) (current_token_index + 20)
    and slist = ref [] in
    for i = first to current_token_index-1 do
      slist := concat [!slist; [nth tokens_copy i]]
    done;
    print_tokens !slist;
    print_endline "";

    let j =
      begin
        if length tokens_copy = current_token_index
        then begin
          print_string "Unexpected End Of File";
          current_token_index - 1
        end else begin
          print_string "Syntax Error before token ";
          print_string (string_of_int current_token_index);
          current_token_index
        end
      end
    in
    let token = nth tokens_copy j in
    let sr = ref (Flx_prelex.src_of_token token) in
    let file,line,scol,_ecol = !sr in
    if line <> 0 || j = 0 then
      print_endline
      (
        " in " ^ file ^
        ", line " ^ string_of_int line ^
        " col " ^ string_of_int scol
      )
    else begin
      let token = nth tokens_copy (j-1) in
      sr := Flx_prelex.src_of_token token;
      let file,line,scol,_ecol = !sr in
      print_endline
      (
        " in " ^ file ^
        ", after line " ^ string_of_int line ^
        " col " ^ string_of_int scol
      )
    end
    ;

    slist := [];
    for i = current_token_index to last do
      slist := concat [!slist; [nth tokens_copy i]]
    done;
    print_tokens !slist;
    print_endline "";
    print_endline "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!";
    flush stdout;
    (*
    clierr (slift (!sr)) "Syntax Error";
    ()
    *)

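  (* User-defined statements are parsed by recursive descent.
     parse_alternatives tries each production for the keyword in
     order, saving the token state first and restoring it on
     RDP_match_fail so the next alternative starts from the same
     input position. *)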
  method parse_user_statement
    (name:string)
    (sr:range_srcref)
    (tokss: (token list * ast_term_t) list)
    (nonterminals: (string,(token list * ast_term_t) list) Hashtbl.t)
  : statement_t =
    let term = self#parse_alternatives name sr tokss nonterminals in
    `AST_user_statement (sr,name,term)

  method private parse_alternatives
    (name:string)
    (sr:range_srcref)
    (tokss:(token list * ast_term_t) list)
    (nonterminals: (string,(token list * ast_term_t) list) Hashtbl.t)
  : ast_term_t =

    (* save state for backtracking *)
    let saved_tokens = tokens in
    let saved_token_index = current_token_index in
    let rec aux tokss = match tokss with
      | (toks,term) :: tail ->
        begin
          try
            `Apply_term (term,self#parse_production name sr toks nonterminals)
          with RDP_match_fail _ ->
            (* backtrack for next try *)
            (*
            print_endline "DEBUG: match fail caught (backtrack exception)";
            *)
            tokens <- saved_tokens;
            current_token_index <- saved_token_index;
            aux tail
          | x ->
            print_endline ("RDP: unexpected error: " ^ Printexc.to_string x);
            failwith "RDP: unexpected error!"
        end
      | [] ->
          (*
          print_endline "DEBUG: raise alternatives exhausted";
          *)
          rdp_alternatives_exhausted sr
            ("Syntax error matching user statement " ^ name)

    in
    match tokss with
    | [] -> clierr sr "Whoops, no alternatives??"
    | [toks,term] ->
       begin try
         `Apply_term (term,self#parse_production name sr toks nonterminals)
       with RDP_match_fail (sr1,sr2,s) ->
         (*
         print_endline "RD match failure on sole alternative";
         *)
         rdp_match_fail sr1 sr2 s
       end
    | xs -> aux xs

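  (* parse_production matches one production left to right.  The
     meta-tokens EXPRESSION, STATEMENT, STATEMENTS, IDENT and the
     literal meta-tokens recurse into the ordinary parser entry
     points; any other token is first looked up as a nonterminal and
     otherwise must match the next input token verbatim. *)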
  method private parse_production
    (name:string)
    (sr:range_srcref)
    (toks:token list)
    (nonterminals: (string,(token list * ast_term_t) list) Hashtbl.t)
  : ast_term_t list =

    let dummy_lexbuf = Lexing.from_string "blah" in
    let rec aux toks res = match toks with
    | h :: t ->
      begin match h with
      | EXPRESSION _ ->
        (*
        print_endline "Matching expression ..";
        *)
        let e,tk =
          try exprx self#token_src dummy_lexbuf
          with ParseError _ -> rdp_match_fail sr self#get_loc "parsing exprx"
        in
        (*
        print_endline (
          "Expression matched, stopped by " ^
          Flx_prelex.string_of_token tk
        );
        *)
        self#put_back tk;
        aux t (`Expression_term e :: res)

      | STATEMENT _ ->
        (*
        print_endline "Matching statement ..";
        *)
        let s =
          try statement self#token_src dummy_lexbuf
          with ParseError _ -> rdp_match_fail sr self#get_loc "parsing statement"
        in
        aux t (`Statement_term s :: res)

      | STATEMENTS _ ->
        (*
        print_endline "Matching statements ..";
        *)
        let s,tk =
          try statementsx self#token_src dummy_lexbuf
          with ParseError _ -> rdp_match_fail sr self#get_loc "parsing statementsx"
        in
        self#put_back tk;
        aux t (`Statements_term s :: res)

      | IDENT _ ->
        (*
        print_endline "Matching ident ..";
        *)
        let tok' = self#token_src dummy_lexbuf in
        begin match tok' with
        | NAME (_,s) ->
          aux t (`Identifier_term s :: res)
        | _ ->
          rdp_match_fail sr self#get_loc
          (
            "User statement: identifier required, got " ^
            Flx_prelex.string_of_token tok'
          )
        end

      | INTEGER_LITERAL _ ->
        let tok' = self#token_src dummy_lexbuf in
        begin match tok' with
        | INTEGER (sr,kind,vl) ->
          let j = `AST_literal (slift sr,`AST_int (kind,vl)) in
          aux t (`Expression_term j :: res)
        | _ ->
          rdp_match_fail sr self#get_loc
          (
            "User statement: integer required, got " ^
            Flx_prelex.string_of_token tok'
          )
        end

      | FLOAT_LITERAL _ ->
        let tok' = self#token_src dummy_lexbuf in
        begin match tok' with
        | FLOAT (sr,kind,vl) ->
          let j = `AST_literal (slift sr,`AST_float (kind,vl)) in
          aux t (`Expression_term j :: res)
        | _ ->
          rdp_match_fail sr self#get_loc
          (
            "User statement: float required, got " ^
            Flx_prelex.string_of_token tok'
          )
        end

      | STRING_LITERAL _ ->
        let tok' = self#token_src dummy_lexbuf in
        begin match tok' with
        | STRING (sr,s) ->
          let j = `AST_literal (slift sr,`AST_string s) in
          aux t (`Expression_term j :: res)
        | CSTRING (sr,s) ->
          let j = `AST_literal (slift sr,`AST_cstring s) in
          aux t (`Expression_term j :: res)
        | WSTRING (sr,s) ->
          let j = `AST_literal (slift sr,`AST_wstring s) in
          aux t (`Expression_term j :: res)
        | USTRING (sr,s) ->
          let j = `AST_literal (slift sr,`AST_ustring s) in
          aux t (`Expression_term j :: res)
        | _ ->
          rdp_match_fail sr self#get_loc
          (
            "User statement: string required, got " ^
            Flx_prelex.string_of_token tok'
          )
        end

      | tok ->
        let s = Flx_prelex.string_of_token tok in
        (*
        print_endline ("Checking if " ^ s ^ " is a nonterminal");
        *)
        let alts =
          try Some (Hashtbl.find nonterminals s)
          with Not_found -> None
        in
        begin match alts with
        | Some productions ->
          (*
          print_endline ("FOUND NONTERMINAL " ^ s);
          *)
          let result =
            try self#parse_alternatives s sr productions nonterminals
            with RDP_alternatives_exhausted (sr2,s) ->
              rdp_match_fail sr sr2 s
          in
          aux t (result :: res)

        | None ->
          (*
          print_endline "Nope, not a non-terminal";
          *)
          let tok' = self#token_src dummy_lexbuf in
          let s' = Flx_prelex.string_of_token tok' in
          (*
          print_endline ("Matching other token " ^ s ^ " with " ^ s');
          *)
          if s = s' then
            aux t (`Keyword_term s :: res)
          else rdp_match_fail sr self#get_loc
          (
            "Syntax Error in user statement: " ^
            "Failed to match keyword or symbol " ^
            s ^ ", got " ^ s' ^ " instead"
          )
        end
      end

    | [] -> rev res
    in aux toks []

end
;;

type 'a parser_t =
  (Lexing.lexbuf -> Flx_parse.token) ->
  Lexing.lexbuf ->
  'a

let parse_tokens (parser:'a parser_t) (tokens: Flx_parse.token list) =
  let toker = (new tokeniser tokens) in
  try
    parser (toker#token_src) (Lexing.from_string "dummy")
  with
  | Flx_exceptions.ClientError _
  | Flx_exceptions.ClientError2 _
  | Flx_exceptions.ClientErrorn _ as x ->
    (*
    print_endline ("got client error from parse..");
    *)
    toker#report_syntax_error;
    raise x

  | Flx_exceptions.ParseError _ as x ->
    (*
    print_endline ("got ParseError from parse..");
    *)
    toker#report_syntax_error;
    raise x

  | Flx_exceptions.RDP_match_fail _ as x ->
    (*
    print_endline ("got RDP_match_fail from parse..");
    *)
    toker#report_syntax_error;
    raise x

  | _ ->
    (*
    print_endline "Got unknown error from parse..";
    *)
    toker#report_syntax_error;
    raise (Flx_exceptions.ParseError "Parsing Tokens")

End ocaml section to src/flx_tok.ml[1]
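
For illustration, here is a minimal sketch of driving parse_tokens with an
ocamlyacc-generated entry point. The name Flx_parse.compilation_unit is a
hypothetical placeholder, not necessarily a real Flx_parse entry; any entry
point whose type matches 'a parser_t would do:

   (* Hypothetical driver: toks would come from running the Felix
      lexer over a source file; compilation_unit is an assumed entry
      point of type statement_t list parser_t. *)
   let parse_all (toks : Flx_parse.token list) : Flx_ast.statement_t list =
     Flx_tok.parse_tokens Flx_parse.compilation_unit toks

On any parse failure, parse_tokens prints a diagnostic token window via
report_syntax_error before re-raising the exception.
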
Start ocaml section to src/flx_tok.mli[1/1]
# 593 "./lpsrc/flx_tokeniser.ipk"
open Flx_parse
open Flx_ast

val print_pre_tokens : token list -> unit
val print_tokens : token list -> unit
class tokeniser :
  token list ->
  object
    val mutable current_token_index : int
    val mutable tokens : token list
    val mutable tokens_copy : token list
    method report_syntax_error : unit
    method put_back : token -> unit
    method get_loc : range_srcref
    method token_src : Lexing.lexbuf -> token
    method token_peek : Lexing.lexbuf -> token
    method parse_user_statement :
      string ->
      range_srcref ->
      (token list * ast_term_t) list ->
      (string, (token list * ast_term_t) list) Hashtbl.t ->
      statement_t
  end

type 'a parser_t =
  (Lexing.lexbuf -> token) ->
  Lexing.lexbuf ->
  'a

val parse_tokens :
  'a parser_t ->
  token list ->
  'a

End ocaml section to src/flx_tok.mli[1]