author | Xavier Leroy <xavier.leroy@inria.fr> | 1995-05-04 10:15:53 +0000
committer | Xavier Leroy <xavier.leroy@inria.fr> | 1995-05-04 10:15:53 +0000
commit | 61bd8ace6bdb2652f4d51d64e3239a7105f56c26 (patch)
tree | e8b957df0957c1b483d41d68973824e280445548 /test/Lex
parent | 8f9ea2a7b886e3e0a5cfd76b11fe79d083a7f20c (diff)
Switch to the bootstrapped version (crossing the Rubicon)
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@2 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
Diffstat (limited to 'test/Lex')
-rw-r--r-- | test/Lex/gram_aux.ml | 33
-rw-r--r-- | test/Lex/grammar.mly | 100
-rw-r--r-- | test/Lex/lexgen.ml | 252
-rw-r--r-- | test/Lex/main.ml | 104
-rw-r--r-- | test/Lex/output.ml | 155
-rw-r--r-- | test/Lex/scan_aux.ml | 46
-rw-r--r-- | test/Lex/scanner.mll | 118
-rw-r--r-- | test/Lex/syntax.ml | 26
-rw-r--r-- | test/Lex/testmain.ml | 34
-rw-r--r-- | test/Lex/testscanner.mll | 121
10 files changed, 989 insertions, 0 deletions
diff --git a/test/Lex/gram_aux.ml b/test/Lex/gram_aux.ml new file mode 100644 index 000000000..525ee69b5 --- /dev/null +++ b/test/Lex/gram_aux.ml @@ -0,0 +1,33 @@ +(* Auxiliaries for the parser. *) + +open Syntax + +let regexp_for_string s = + let l = String.length s in + if l = 0 then + Epsilon + else begin + let re = ref(Characters [String.get s (l - 1)]) in + for i = l - 2 downto 0 do + re := Sequence(Characters [String.get s i], !re) + done; + !re + end + + +let char_class c1 c2 = + let class = ref [] in + for i = Char.code c2 downto Char.code c1 do + class := Char.chr i :: !class + done; + !class + + +let all_chars = char_class '\001' '\255' + + +let rec subtract l1 l2 = + match l1 with + [] -> [] + | a::l -> if List.mem a l2 then subtract l l2 else a :: subtract l l2 + diff --git a/test/Lex/grammar.mly b/test/Lex/grammar.mly new file mode 100644 index 000000000..eb1c8cc24 --- /dev/null +++ b/test/Lex/grammar.mly @@ -0,0 +1,100 @@ +/* The grammar for lexer definitions */ + +%{ +open Syntax +open Gram_aux +%} + +%token <string> Tident +%token <char> Tchar +%token <string> Tstring +%token <Syntax.location> Taction +%token Trule Tparse Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket +%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash + +%left Tor +%left CONCAT +%nonassoc Tmaybe +%left Tstar +%left Tplus + +%start lexer_definition +%type <Syntax.lexer_definition> lexer_definition + +%% + +lexer_definition: + header Trule definition other_definitions Tend + { Lexdef($1, $3::(List.rev $4)) } +; +header: + Taction + { $1 } + | + { Location(0,0) } +; +other_definitions: + other_definitions Tand definition + { $3::$1 } + | + { [] } +; +definition: + Tident Tequal entry + { ($1,$3) } +; +entry: + Tparse case rest_of_entry + { $2 :: List.rev $3 } +; +rest_of_entry: + rest_of_entry Tor case + { $3::$1 } + | + { [] } +; +case: + regexp Taction + { ($1,$2) } +; +regexp: + Tunderscore + { Characters all_chars } + | Teof + { Characters ['\000'] } + | Tchar + { Characters [$1] } + | Tstring + { regexp_for_string $1 } + | Tlbracket char_class Trbracket + { Characters $2 } + | regexp Tstar + { Repetition $1 } + | regexp Tmaybe + { Alternative($1, Epsilon) } + | regexp Tplus + { Sequence($1, Repetition $1) } + | regexp Tor regexp + { Alternative($1,$3) } + | regexp regexp %prec CONCAT + { Sequence($1,$2) } + | Tlparen regexp Trparen + { $2 } +; +char_class: + Tcaret char_class1 + { subtract all_chars $2 } + | char_class1 + { $1 } +; +char_class1: + Tchar Tdash Tchar + { char_class $1 $3 } + | Tchar + { [$1] } + | char_class char_class %prec CONCAT + { $1 @ $2 } +; + +%% + diff --git a/test/Lex/lexgen.ml b/test/Lex/lexgen.ml new file mode 100644 index 000000000..73d011577 --- /dev/null +++ b/test/Lex/lexgen.ml @@ -0,0 +1,252 @@ +(* Compiling a lexer definition *) + +open Syntax + +(* Deep abstract syntax for regular expressions *) + +type regexp = + Empty + | Chars of int + | Action of int + | Seq of regexp * regexp + | Alt of regexp * regexp + | Star of regexp + +(* From shallow to deep syntax *) + +(*** + +let print_char_class c = + let print_interval low high = + prerr_int low; + if high - 1 > low then begin + prerr_char '-'; + prerr_int (high-1) + end; + prerr_char ' ' in + let rec print_class first next = function + [] -> print_interval first next + | c::l -> + if char.code c = next + then print_class first (next+1) l + else begin + print_interval first next; + print_class (char.code c) (char.code c + 1) l + end in + match c with + [] -> prerr_newline() + | c::l -> print_class 
(char.code c) (char.code c + 1) l; prerr_newline() + + +let rec print_regexp = function + Empty -> prerr_string "Empty" + | Chars n -> prerr_string "Chars "; prerr_int n + | Action n -> prerr_string "Action "; prerr_int n + | Seq(r1,r2) -> print_regexp r1; prerr_string "; "; print_regexp r2 + | Alt(r1,r2) -> prerr_string "("; print_regexp r1; prerr_string " | "; print_regexp r2; prerr_string ")" + | Star r -> prerr_string "("; print_regexp r; prerr_string ")*" + +***) + +let chars = ref ([] : char list list) +let chars_count = ref 0 +let actions = ref ([] : (int * location) list) +let actions_count = ref 0 + +let rec encode_regexp = function + Epsilon -> Empty + | Characters cl -> + let n = !chars_count in +(*** prerr_int n; prerr_char ' '; print_char_class cl; ***) + chars := cl :: !chars; + chars_count := !chars_count + 1; + Chars(n) + | Sequence(r1,r2) -> + Seq(encode_regexp r1, encode_regexp r2) + | Alternative(r1,r2) -> + Alt(encode_regexp r1, encode_regexp r2) + | Repetition r -> + Star (encode_regexp r) + + +let encode_casedef = + List.fold_left + (fun reg (expr,act) -> + let act_num = !actions_count in + actions_count := !actions_count + 1; + actions := (act_num, act) :: !actions; + Alt(reg, Seq(encode_regexp expr, Action act_num))) + Empty + + +let encode_lexdef (Lexdef(_, ld)) = + chars := []; + chars_count := 0; + actions := []; + actions_count := 0; + let name_regexp_list = + List.map (fun (name, casedef) -> (name, encode_casedef casedef)) ld in +(* List.iter print_char_class chars; *) + let chr = Array.of_list (List.rev !chars) + and act = !actions in + chars := []; + actions := []; + (chr, name_regexp_list, act) + + +(* To generate directly a NFA from a regular expression. + Confer Aho-Sethi-Ullman, dragon book, chap. 3 *) + +type transition = + OnChars of int + | ToAction of int + + +let rec merge_trans l1 l2 = + match (l1, l2) with + ([], s2) -> s2 + | (s1, []) -> s1 + | ((OnChars n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) -> + if n1 = n2 then t1 :: merge_trans r1 r2 else + if n1 < n2 then t1 :: merge_trans r1 s2 else + t2 :: merge_trans s1 r2 + | ((ToAction n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) -> + if n1 = n2 then t1 :: merge_trans r1 r2 else + if n1 < n2 then t1 :: merge_trans r1 s2 else + t2 :: merge_trans s1 r2 + | ((OnChars n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) -> + t1 :: merge_trans r1 s2 + | ((ToAction n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) -> + t2 :: merge_trans s1 r2 + + +let rec nullable = function + Empty -> true + | Chars _ -> false + | Action _ -> false + | Seq(r1,r2) -> nullable r1 & nullable r2 + | Alt(r1,r2) -> nullable r1 or nullable r2 + | Star r -> true + + +let rec firstpos = function + Empty -> [] + | Chars pos -> [OnChars pos] + | Action act -> [ToAction act] + | Seq(r1,r2) -> if nullable r1 + then merge_trans (firstpos r1) (firstpos r2) + else firstpos r1 + | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2) + | Star r -> firstpos r + + +let rec lastpos = function + Empty -> [] + | Chars pos -> [OnChars pos] + | Action act -> [ToAction act] + | Seq(r1,r2) -> if nullable r2 + then merge_trans (lastpos r1) (lastpos r2) + else lastpos r2 + | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2) + | Star r -> lastpos r + + +let followpos size name_regexp_list = + let v = Array.new size [] in + let fill_pos first = function + OnChars pos -> v.(pos) <- merge_trans first v.(pos); () + | ToAction _ -> () in + let rec fill = function + Seq(r1,r2) -> + fill r1; fill r2; + List.iter 
(fill_pos (firstpos r2)) (lastpos r1) + | Alt(r1,r2) -> + fill r1; fill r2 + | Star r -> + fill r; + List.iter (fill_pos (firstpos r)) (lastpos r) + | _ -> () in + List.iter (fun (name, regexp) -> fill regexp) name_regexp_list; + v + + +let no_action = 0x3FFFFFFF + +let split_trans_set = + List.fold_left + (fun (act, pos_set as act_pos_set) trans -> + match trans with + OnChars pos -> (act, pos :: pos_set) + | ToAction act1 -> if act1 < act then (act1, pos_set) + else act_pos_set) + (no_action, []) + + +let memory = (Hashtbl.new 131 : (transition list, int) Hashtbl.t) +let todo = ref ([] : (transition list * int) list) +let next = ref 0 + +let get_state st = + try + Hashtbl.find memory st + with Not_found -> + let nbr = !next in + next := !next + 1; + Hashtbl.add memory st nbr; + todo := (st, nbr) :: !todo; + nbr + +let rec map_on_states f = + match !todo with + [] -> [] + | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f + +let number_of_states () = !next + +let goto_state = function + [] -> Backtrack + | ps -> Goto (get_state ps) + + +let transition_from chars follow pos_set = + let tr = Array.new 256 [] + and shift = Array.new 256 Backtrack in + List.iter + (fun pos -> + List.iter + (fun c -> + tr.(Char.code c) <- + merge_trans tr.(Char.code c) follow.(pos)) + chars.(pos)) + pos_set; + for i = 0 to 255 do + shift.(i) <- goto_state tr.(i) + done; + shift + + +let translate_state chars follow state = + match split_trans_set state with + n, [] -> Perform n + | n, ps -> Shift( (if n = no_action then No_remember else Remember n), + transition_from chars follow ps) + + +let make_dfa lexdef = + let (chars, name_regexp_list, actions) = + encode_lexdef lexdef in +(** + List.iter (fun (name, regexp) -> prerr_string name; prerr_string " = "; print_regexp regexp; prerr_newline()) name_regexp_list; +**) + let follow = + followpos (Array.length chars) name_regexp_list in + let initial_states = + List.map (fun (name, regexp) -> (name, get_state(firstpos regexp))) + name_regexp_list in + let states = + map_on_states (translate_state chars follow) in + let v = + Array.new (number_of_states()) (Perform 0) in + List.iter (fun (auto, i) -> v.(i) <- auto) states; + (initial_states, v, actions) + diff --git a/test/Lex/main.ml b/test/Lex/main.ml new file mode 100644 index 000000000..94902ed21 --- /dev/null +++ b/test/Lex/main.ml @@ -0,0 +1,104 @@ +(* The lexer generator. Command-line parsing. 
*) + +open Syntax +open Scanner +open Grammar +open Lexgen +open Output + +let main () = + if Array.length Sys.argv <> 2 then begin + prerr_string "Usage: camllex <input file>\n"; + exit 2 + end; + let source_name = Sys.argv.(1) in + let dest_name = + if Filename.check_suffix source_name ".mll" then + Filename.chop_suffix source_name ".mll" ^ ".ml" + else + source_name ^ ".ml" in + ic := open_in source_name; + oc := open_out dest_name; + let lexbuf = Lexing.from_channel !ic in + let (Lexdef(header,_) as def) = + try + Grammar.lexer_definition Scanner.main lexbuf + with + Parsing.Parse_error -> + prerr_string "Syntax error around char "; + prerr_int (Lexing.lexeme_start lexbuf); + prerr_endline "."; + exit 2 + | Scan_aux.Lexical_error s -> + prerr_string "Lexical error around char "; + prerr_int (Lexing.lexeme_start lexbuf); + prerr_string ": "; + prerr_string s; + prerr_endline "."; + exit 2 in + let ((init, states, acts) as dfa) = make_dfa def in + output_lexdef header dfa; + close_in !ic; + close_out !oc + +let _ = main(); exit 0 + + +(***** +let main () = + ic := stdin; + oc := stdout; + let lexbuf = lexing.from_channel ic in + let (Lexdef(header,_) as def) = + try + grammar.lexer_definition scanner.main lexbuf + with + parsing.Parse_error x -> + prerr_string "Syntax error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_endline "."; + sys.exit 2 + | scan_aux.Lexical_error s -> + prerr_string "Lexical error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_string ": "; + prerr_string s; + prerr_endline "."; + sys.exit 2 in + let ((init, states, acts) as dfa) = make_dfa def in + output_lexdef header dfa + +****) + +(**** +let debug_scanner lexbuf = + let tok = scanner.main lexbuf in + begin match tok with + Tident s -> prerr_string "Tident "; prerr_string s + | Tchar c -> prerr_string "Tchar "; prerr_char c + | Tstring s -> prerr_string "Tstring "; prerr_string s + | Taction(Location(i1,i2)) -> + prerr_string "Taction "; prerr_int i1; prerr_string "-"; + prerr_int i2 + | Trule -> prerr_string "Trule" + | Tparse -> prerr_string "Tparse" + | Tand -> prerr_string "Tand" + | Tequal -> prerr_string "Tequal" + | Tend -> prerr_string "Tend" + | Tor -> prerr_string "Tor" + | Tunderscore -> prerr_string "Tunderscore" + | Teof -> prerr_string "Teof" + | Tlbracket -> prerr_string "Tlbracket" + | Trbracket -> prerr_string "Trbracket" + | Tstar -> prerr_string "Tstar" + | Tmaybe -> prerr_string "Tmaybe" + | Tplus -> prerr_string "Tplus" + | Tlparen -> prerr_string "Tlparen" + | Trparen -> prerr_string "Trparen" + | Tcaret -> prerr_string "Tcaret" + | Tdash -> prerr_string "Tdash" + end; + prerr_newline(); + tok + +****) diff --git a/test/Lex/output.ml b/test/Lex/output.ml new file mode 100644 index 000000000..301edcba3 --- /dev/null +++ b/test/Lex/output.ml @@ -0,0 +1,155 @@ +(* Generating a DFA as a set of mutually recursive functions *) + +open Syntax + +let ic = ref stdin +let oc = ref stdout + +(* 1- Generating the actions *) + +let copy_buffer = String.create 1024 + +let copy_chunk (Location(start,stop)) = + seek_in !ic start; + let tocopy = ref(stop - start) in + while !tocopy > 0 do + let m = + input !ic copy_buffer 0 (min !tocopy (String.length copy_buffer)) in + output !oc copy_buffer 0 m; + tocopy := !tocopy - m + done + + +let output_action (i,act) = + output_string !oc ("action_" ^ string_of_int i ^ " lexbuf = (\n"); + copy_chunk act; + output_string !oc ")\nand " + + +(* 2- Generating the states *) + +let states = ref ([||] : automata array) + +type 
occurrence = + { mutable pos: int list; + mutable freq: int } + +let enumerate_vect v = + let env = ref [] in + for pos = 0 to Array.length v - 1 do + try + let occ = List.assoc v.(pos) !env in + occ.pos <- pos :: occ.pos; + occ.freq <- occ.freq + 1 + with Not_found -> + env := (v.(pos), {pos = [pos]; freq = 1 }) :: !env + done; + Sort.list (fun (e1, occ1) (e2, occ2) -> occ1.freq >= occ2.freq) !env + + +let output_move = function + Backtrack -> + output_string !oc "lexing.backtrack lexbuf" + | Goto dest -> + match !states.(dest) with + Perform act_num -> + output_string !oc ("action_" ^ string_of_int act_num ^ " lexbuf") + | _ -> + output_string !oc ("state_" ^ string_of_int dest ^ " lexbuf") + + +(* Cannot use standard char_for_read because the characters to escape + are not the same in CL6 and CL1999. *) + +let output_char_lit oc = function + '\'' -> output_string oc "\\'" + | '\\' -> output_string oc "\\\\" + | '\n' -> output_string oc "\\n" + | '\t' -> output_string oc "\\t" + | c -> if Char.code c >= 32 & Char.code c < 128 then + output_char oc c + else begin + let n = Char.code c in + output_char oc '\\'; + output_char oc (Char.chr (48 + n / 100)); + output_char oc (Char.chr (48 + (n / 10) mod 10)); + output_char oc (Char.chr (48 + n mod 10)) + end + +let rec output_chars = function + [] -> + failwith "output_chars" + | [c] -> + output_string !oc "'"; + output_char_lit !oc (Char.chr c); + output_string !oc "'" + | c::cl -> + output_string !oc "'"; + output_char_lit !oc (Char.chr c); + output_string !oc "'|"; + output_chars cl + +let output_one_trans (dest, occ) = + output_chars occ.pos; + output_string !oc " -> "; + output_move dest; + output_string !oc "\n | " + +let output_all_trans trans = + output_string !oc " match lexing.next_char lexbuf with\n "; + match enumerate_vect trans with + [] -> + failwith "output_all_trans" + | (default, _) :: rest -> + List.iter output_one_trans rest; + output_string !oc "_ -> "; + output_move default; + output_string !oc "\nand " + +let output_state state_num = function + Perform i -> + () + | Shift(what_to_do, moves) -> + output_string !oc + ("state_" ^ string_of_int state_num ^ " lexbuf =\n"); + begin match what_to_do with + No_remember -> () + | Remember i -> + output_string !oc + (" Lexing.set_backtrack lexbuf action_" ^ + string_of_int i ^ ";\n") + end; + output_all_trans moves + + +(* 3- Generating the entry points *) + +let rec output_entries = function + [] -> failwith "output_entries" + | (name,state_num) :: rest -> + output_string !oc (name ^ " lexbuf =\n"); + output_string !oc " Lexing.init lexbuf;\n"; + output_string !oc (" state_" ^ string_of_int state_num ^ + " lexbuf\n"); + match rest with + [] -> output_string !oc "\n"; () + | _ -> output_string !oc "\nand "; output_entries rest + + +(* All together *) + +let output_lexdef header (initial_st, st, actions) = + prerr_int (Array.length st); prerr_string " states, "; + prerr_int (List.length actions); prerr_string " actions."; + prerr_newline(); + copy_chunk header; + output_string !oc "\nlet rec "; + states := st; + List.iter output_action actions; + for i = 0 to Array.length st - 1 do + output_state i st.(i) + done; + output_entries initial_st + + + diff --git a/test/Lex/scan_aux.ml b/test/Lex/scan_aux.ml new file mode 100644 index 000000000..8b01d6347 --- /dev/null +++ b/test/Lex/scan_aux.ml @@ -0,0 +1,46 @@ +(* Auxiliaries for the lexical analyzer *) + +let brace_depth = ref 0 +let comment_depth = ref 0 + +exception Lexical_error of string + +let initial_string_buffer = 
String.create 256 +let string_buff = ref initial_string_buffer +let string_index = ref 0 + +let reset_string_buffer () = + string_buff := initial_string_buffer; + string_index := 0 + + +let store_string_char c = + begin + if !string_index >= String.length !string_buff then begin + let new_buff = String.create (String.length !string_buff * 2) in + String.blit new_buff 0 !string_buff 0 (String.length !string_buff); + string_buff := new_buff + end + end; + String.unsafe_set !string_buff !string_index c; + incr string_index + +let get_stored_string () = + let s = String.sub !string_buff 0 !string_index in + string_buff := initial_string_buffer; + s + + +let char_for_backslash = function + 'n' -> '\010' (* '\n' when bootstrapped *) + | 't' -> '\009' (* '\t' *) + | 'b' -> '\008' (* '\b' *) + | 'r' -> '\013' (* '\r' *) + | c -> c + + +let char_for_decimal_code lexbuf i = + Char.chr(100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + + 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + + (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48)) + diff --git a/test/Lex/scanner.mll b/test/Lex/scanner.mll new file mode 100644 index 000000000..7cb13ba70 --- /dev/null +++ b/test/Lex/scanner.mll @@ -0,0 +1,118 @@ +(* The lexical analyzer for lexer definitions. *) + +{ +open Syntax +open Grammar +open Scan_aux +} + +rule main = parse + [' ' '\010' '\013' '\009' ] + + { main lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + main lexbuf } + | (['A'-'Z' 'a'-'z'] | '_' ['A'-'Z' 'a'-'z' '\'' '0'-'9']) + ( '_' ? ['A'-'Z' 'a'-'z' ''' '0'-'9'] ) * + { match Lexing.lexeme lexbuf with + "rule" -> Trule + | "parse" -> Tparse + | "and" -> Tand + | "eof" -> Teof + | s -> Tident s } + | '"' + { reset_string_buffer(); + string lexbuf; + Tstring(get_stored_string()) } + | "'" + { Tchar(char lexbuf) } + | '{' + { let n1 = Lexing.lexeme_end lexbuf in + brace_depth := 1; + let n2 = action lexbuf in + Taction(Location(n1, n2)) } + | '=' { Tequal } + | ";;" { Tend } + | '|' { Tor } + | '_' { Tunderscore } + | "eof" { Teof } + | '[' { Tlbracket } + | ']' { Trbracket } + | '*' { Tstar } + | '?' 
{ Tmaybe } + | '+' { Tplus } + | '(' { Tlparen } + | ')' { Trparen } + | '^' { Tcaret } + | '-' { Tdash } + | eof + { raise(Lexical_error "unterminated lexer definition") } + | _ + { raise(Lexical_error("illegal character " ^ Lexing.lexeme lexbuf)) } + +and action = parse + '{' + { incr brace_depth; + action lexbuf } + | '}' + { decr brace_depth; + if !brace_depth = 0 then Lexing.lexeme_start lexbuf else action lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + action lexbuf } + | '\'' + { char lexbuf; action lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + action lexbuf } + | eof + { raise (Lexical_error "unterminated action") } + | _ + { action lexbuf } + +and string = parse + '"' + { () } + | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] + + { string lexbuf } + | '\\' ['\\' '"' 'n' 't' 'b' 'r'] + { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)); + string lexbuf } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] + { store_string_char(char_for_decimal_code lexbuf 1); + string lexbuf } + | eof + { raise(Lexical_error "unterminated string") } + | _ + { store_string_char(Lexing.lexeme_char lexbuf 0); + string lexbuf } + +and char = parse + [^ '\\'] "'" + { Lexing.lexeme_char lexbuf 0 } + | '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" + { char_for_backslash (Lexing.lexeme_char lexbuf 1) } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" + { char_for_decimal_code lexbuf 1 } + | _ + { raise(Lexical_error "bad character constant") } + +and comment = parse + "(*" + { incr comment_depth; comment lexbuf } + | "*)" + { decr comment_depth; + if !comment_depth = 0 then () else comment lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + comment lexbuf } + | eof + { raise(Lexical_error "unterminated comment") } + | _ + { comment lexbuf } diff --git a/test/Lex/syntax.ml b/test/Lex/syntax.ml new file mode 100644 index 000000000..f692e6f62 --- /dev/null +++ b/test/Lex/syntax.ml @@ -0,0 +1,26 @@ +(* The shallow abstract syntax *) + +type location = + Location of int * int + +type regular_expression = + Epsilon + | Characters of char list + | Sequence of regular_expression * regular_expression + | Alternative of regular_expression * regular_expression + | Repetition of regular_expression + +type lexer_definition = + Lexdef of location * (string * (regular_expression * location) list) list + +(* Representation of automata *) + +type automata = + Perform of int + | Shift of automata_trans * automata_move array +and automata_trans = + No_remember + | Remember of int +and automata_move = + Backtrack + | Goto of int diff --git a/test/Lex/testmain.ml b/test/Lex/testmain.ml new file mode 100644 index 000000000..e0a914ee0 --- /dev/null +++ b/test/Lex/testmain.ml @@ -0,0 +1,34 @@ +(* The lexer generator. Command-line parsing. 
*) + +#open "syntax";; +#open "testscanner";; +#open "grammar";; +#open "lexgen";; +#open "output";; + +let main () = + ic := stdin; + oc := stdout; + let lexbuf = lexing.from_channel ic in + let (Lexdef(header,_) as def) = + try + grammar.lexer_definition testscanner.main lexbuf + with + parsing.Parse_error x -> + prerr_string "Syntax error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_endline "."; + sys.exit 2 + | scan_aux.Lexical_error s -> + prerr_string "Lexical error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_string ": "; + prerr_string s; + prerr_endline "."; + sys.exit 2 in + let ((init, states, acts) as dfa) = make_dfa def in + output_lexdef header dfa +;; + +main(); sys.exit 0 +;; diff --git a/test/Lex/testscanner.mll b/test/Lex/testscanner.mll new file mode 100644 index 000000000..91ada299f --- /dev/null +++ b/test/Lex/testscanner.mll @@ -0,0 +1,121 @@ +(* The lexical analyzer for lexer definitions. *) + +{ +#open "syntax";; +#open "grammar";; +#open "scan_aux";; +} + +rule main = parse + _ * "qwertyuiopasdfghjklzxcvbnm0123456789!@#$%^&*()" + { main lexbuf } + | [' ' '\010' '\013' '\009' ] + + { main lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + main lexbuf } + | (['A'-'Z' 'a'-'z'] | '_' ['A'-'Z' 'a'-'z' '\'' '0'-'9']) + ( '_' ? ['A'-'Z' 'a'-'z' ''' '0'-'9'] ) * + { match lexing.lexeme lexbuf with + "rule" -> Trule + | "parse" -> Tparse + | "and" -> Tand + | "eof" -> Teof + | s -> Tident s } + | '"' + { reset_string_buffer(); + string lexbuf; + Tstring(get_stored_string()) } + | "'" + { Tchar(char lexbuf) } + | '{' + { let n1 = lexing.lexeme_end lexbuf in + brace_depth := 1; + let n2 = action lexbuf in + Taction(Location(n1, n2)) } + | '=' { Tequal } + | ";;" { Tend } + | '|' { Tor } + | '_' { Tunderscore } + | "eof" { Teof } + | '[' { Tlbracket } + | ']' { Trbracket } + | '*' { Tstar } + | '?' 
{ Tmaybe } + | '+' { Tplus } + | '(' { Tlparen } + | ')' { Trparen } + | '^' { Tcaret } + | '-' { Tdash } + | eof + { raise(Lexical_error "unterminated lexer definition") } + | _ + { raise(Lexical_error("illegal character " ^ lexing.lexeme lexbuf)) } + +and action = parse + '{' + { brace_depth := brace_depth + 1; + action lexbuf } + | '}' + { brace_depth := brace_depth - 1; + if brace_depth = 0 then lexing.lexeme_start lexbuf else action lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + action lexbuf } + | '\'' + { char lexbuf; action lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + action lexbuf } + | eof + { raise (Lexical_error "unterminated action") } + | _ + { action lexbuf } + +and string = parse + '"' + { () } + | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] + + { string lexbuf } + | '\\' ['\\' '"' 'n' 't' 'b' 'r'] + { store_string_char(char_for_backslash(lexing.lexeme_char lexbuf 1)); + string lexbuf } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] + { store_string_char(char_for_decimal_code lexbuf 1); + string lexbuf } + | eof + { raise(Lexical_error "unterminated string") } + | _ + { store_string_char(lexing.lexeme_char lexbuf 0); + string lexbuf } + +and char = parse + [^ '\\'] "'" + { lexing.lexeme_char lexbuf 0 } + | '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" + { char_for_backslash (lexing.lexeme_char lexbuf 1) } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" + { char_for_decimal_code lexbuf 1 } + | _ + { raise(Lexical_error "bad character constant") } + +and comment = parse + "(*" + { comment_depth := comment_depth + 1; comment lexbuf } + | "*)" + { comment_depth := comment_depth - 1; + if comment_depth = 0 then () else comment lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + comment lexbuf } + | eof + { raise(Lexical_error "unterminated comment") } + | _ + { comment lexbuf } +;; |
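
The helper functions in gram_aux.ml are written in the 1995 dialect of Caml and no longer compile as-is (for example, `class` has since become a keyword). A minimal re-expression in current OCaml, reusing the `regular_expression` type that syntax.ml declares and renaming the accumulator, might look like this; it is a sketch, not part of the commit:

```ocaml
(* Sketch only: same logic as gram_aux.ml, adapted to current OCaml.
   The type is the one declared in syntax.ml; the accumulator in
   char_class is renamed because `class` is now a reserved word. *)
type regular_expression =
    Epsilon
  | Characters of char list
  | Sequence of regular_expression * regular_expression
  | Alternative of regular_expression * regular_expression
  | Repetition of regular_expression

(* "abc" becomes
   Sequence (Characters ['a'], Sequence (Characters ['b'], Characters ['c'])),
   built right to left. *)
let regexp_for_string s =
  let l = String.length s in
  if l = 0 then Epsilon
  else begin
    let re = ref (Characters [s.[l - 1]]) in
    for i = l - 2 downto 0 do
      re := Sequence (Characters [s.[i]], !re)
    done;
    !re
  end

(* All characters from c1 to c2 inclusive, e.g. char_class 'a' 'z'. *)
let char_class c1 c2 =
  let cl = ref [] in
  for i = Char.code c2 downto Char.code c1 do
    cl := Char.chr i :: !cl
  done;
  !cl
```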
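lexgen.ml keeps its transition sets as sorted, duplicate-free lists and combines them with merge_trans, a two-pointer merge. The same idea on plain integers, as a hedged sketch (the real function additionally distinguishes OnChars from ToAction entries):

```ocaml
(* Merge two sorted lists, dropping duplicates, as merge_trans does
   for transition sets. *)
let rec merge_sorted l1 l2 =
  match l1, l2 with
  | [], s | s, [] -> s
  | x :: r1, y :: r2 ->
      if x = y then x :: merge_sorted r1 r2
      else if x < y then x :: merge_sorted r1 l2
      else y :: merge_sorted l1 r2

let _ = assert (merge_sorted [1; 3; 5] [2; 3; 6] = [1; 2; 3; 5; 6])
```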
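The comment in lexgen.ml points at the Aho-Sethi-Ullman construction: each Chars or Action leaf carries a position, nullable says whether an expression matches the empty word, and firstpos/lastpos/followpos computed over those positions drive the automaton directly. A simplified sketch, using plain integer positions and list append instead of the sorted merge above, with the obsolete `&`/`or` operators written `&&`/`||`:

```ocaml
(* Deep syntax as in lexgen.ml: leaves are numbered positions. *)
type regexp =
    Empty
  | Chars of int
  | Action of int
  | Seq of regexp * regexp
  | Alt of regexp * regexp
  | Star of regexp

let rec nullable = function
    Empty | Star _ -> true
  | Chars _ | Action _ -> false
  | Seq (r1, r2) -> nullable r1 && nullable r2
  | Alt (r1, r2) -> nullable r1 || nullable r2

(* Positions that can start a word matched by the expression. *)
let rec firstpos = function
    Empty -> []
  | Chars p -> [p]
  | Action a -> [a]
  | Seq (r1, r2) ->
      if nullable r1 then firstpos r1 @ firstpos r2 else firstpos r1
  | Alt (r1, r2) -> firstpos r1 @ firstpos r2
  | Star r -> firstpos r

(* For (a|b)* c with positions a=0, b=1, c=2, firstpos is [0; 1; 2]:
   the starred group is nullable, so the first character may be c. *)
let _ = firstpos (Seq (Star (Alt (Chars 0, Chars 1)), Chars 2))
```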
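The subset construction in lexgen.ml then numbers each DFA state (a set of positions) on first sight and queues it for later translation; map_on_states drains that worklist, which get_state may grow while translation is in progress. A sketch of the bookkeeping, with states abstracted as sorted int lists and the old `Hashtbl.new` replaced by today's `Hashtbl.create`:

```ocaml
(* State numbering: structural hashing plus a worklist of states still
   to be translated. *)
let memory : (int list, int) Hashtbl.t = Hashtbl.create 131
let todo : (int list * int) list ref = ref []
let next = ref 0

let get_state st =
  try Hashtbl.find memory st
  with Not_found ->
    let n = !next in
    incr next;
    Hashtbl.add memory st n;
    todo := (st, n) :: !todo;
    n

(* Apply f to every pending state, including states f itself creates
   by calling get_state. *)
let rec map_on_states f =
  match !todo with
  | [] -> []
  | (st, i) :: rest ->
      todo := rest;
      let r = f st in
      (r, i) :: map_on_states f
```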
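When output.ml prints a state, enumerate_vect groups the 256 possible input characters by destination, and the most frequent destination becomes the final `_ ->` branch, which keeps the generated `match` short. A sketch of that grouping with destinations as plain ints, using List.assoc_opt and List.sort in place of the old Sort.list:

```ocaml
(* Group array indices by value; most frequent value first, so the
   caller can print it as the default branch. *)
let enumerate_vect v =
  let env = ref [] in
  Array.iteri
    (fun pos dest ->
      match List.assoc_opt dest !env with
      | Some occ -> occ := pos :: !occ
      | None -> env := (dest, ref [pos]) :: !env)
    v;
  List.sort
    (fun (_, o1) (_, o2) -> compare (List.length !o2) (List.length !o1))
    !env

(* enumerate_vect [| 7; 7; 3; 7 |] puts destination 7 first:
   it owns positions 0, 1 and 3. *)
let _ = enumerate_vect [| 7; 7; 3; 7 |]
```

The head of the returned list supplies the `_ ->` default; the remaining groups are printed as explicit character alternatives.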
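scan_aux.ml accumulates string literals in a hand-grown mutable string (String.create, String.blit, doubling on overflow), which predates both immutable strings and the Buffer module. Today the same three operations reduce to a Buffer; a minimal sketch:

```ocaml
(* Buffer-based equivalent of the string accumulation in scan_aux.ml. *)
let string_buff = Buffer.create 256

let reset_string_buffer () = Buffer.clear string_buff

let store_string_char c = Buffer.add_char string_buff c

let get_stored_string () =
  let s = Buffer.contents string_buff in
  Buffer.clear string_buff;
  s
```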
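Both scanner.mll and scan_aux.ml decode the `\ddd` escape by reading three decimal digits out of the lexeme. Stripped of the lexbuf plumbing, the arithmetic is just the following (char_of_decimal_digits is a name invented for this sketch):

```ocaml
(* '\065' -> 'A': three ASCII digits to the character they denote. *)
let char_of_decimal_digits d1 d2 d3 =
  Char.chr
    (100 * (Char.code d1 - 48)
     + 10 * (Char.code d2 - 48)
     + (Char.code d3 - 48))

let _ = assert (char_of_decimal_digits '0' '6' '5' = 'A')
```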