author     Xavier Leroy <xavier.leroy@inria.fr>  1995-05-04 10:15:53 +0000
committer  Xavier Leroy <xavier.leroy@inria.fr>  1995-05-04 10:15:53 +0000
commit     61bd8ace6bdb2652f4d51d64e3239a7105f56c26 (patch)
tree       e8b957df0957c1b483d41d68973824e280445548 /test/Lex
parent     8f9ea2a7b886e3e0a5cfd76b11fe79d083a7f20c (diff)
Switch to the bootstrapped version (crossing the Rubicon)
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@2 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
Diffstat (limited to 'test/Lex')
-rw-r--r--  test/Lex/gram_aux.ml       33
-rw-r--r--  test/Lex/grammar.mly      100
-rw-r--r--  test/Lex/lexgen.ml        252
-rw-r--r--  test/Lex/main.ml          104
-rw-r--r--  test/Lex/output.ml        155
-rw-r--r--  test/Lex/scan_aux.ml       46
-rw-r--r--  test/Lex/scanner.mll      118
-rw-r--r--  test/Lex/syntax.ml         26
-rw-r--r--  test/Lex/testmain.ml       34
-rw-r--r--  test/Lex/testscanner.mll  121
10 files changed, 989 insertions, 0 deletions
diff --git a/test/Lex/gram_aux.ml b/test/Lex/gram_aux.ml
new file mode 100644
index 000000000..525ee69b5
--- /dev/null
+++ b/test/Lex/gram_aux.ml
@@ -0,0 +1,33 @@
+(* Auxiliaries for the parser. *)
+
+open Syntax
+
+let regexp_for_string s =
+ let l = String.length s in
+ if l = 0 then
+ Epsilon
+ else begin
+ let re = ref(Characters [String.get s (l - 1)]) in
+ for i = l - 2 downto 0 do
+ re := Sequence(Characters [String.get s i], !re)
+ done;
+ !re
+ end
+
+
+let char_class c1 c2 =
+ let class = ref [] in
+ for i = Char.code c2 downto Char.code c1 do
+ class := Char.chr i :: !class
+ done;
+ !class
+
+
+let all_chars = char_class '\001' '\255'
+
+
+let rec subtract l1 l2 =
+ match l1 with
+ [] -> []
+ | a::l -> if List.mem a l2 then subtract l l2 else a :: subtract l l2
+
diff --git a/test/Lex/grammar.mly b/test/Lex/grammar.mly
new file mode 100644
index 000000000..eb1c8cc24
--- /dev/null
+++ b/test/Lex/grammar.mly
@@ -0,0 +1,100 @@
+/* The grammar for lexer definitions */
+
+%{
+open Syntax
+open Gram_aux
+%}
+
+%token <string> Tident
+%token <char> Tchar
+%token <string> Tstring
+%token <Syntax.location> Taction
+%token Trule Tparse Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket
+%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash
+
+%left Tor
+%left CONCAT
+%nonassoc Tmaybe
+%left Tstar
+%left Tplus
+
+%start lexer_definition
+%type <Syntax.lexer_definition> lexer_definition
+
+%%
+
+lexer_definition:
+ header Trule definition other_definitions Tend
+ { Lexdef($1, $3::(List.rev $4)) }
+;
+header:
+ Taction
+ { $1 }
+ |
+ { Location(0,0) }
+;
+other_definitions:
+ other_definitions Tand definition
+ { $3::$1 }
+ |
+ { [] }
+;
+definition:
+ Tident Tequal entry
+ { ($1,$3) }
+;
+entry:
+ Tparse case rest_of_entry
+ { $2 :: List.rev $3 }
+;
+rest_of_entry:
+ rest_of_entry Tor case
+ { $3::$1 }
+ |
+ { [] }
+;
+case:
+ regexp Taction
+ { ($1,$2) }
+;
+regexp:
+ Tunderscore
+ { Characters all_chars }
+ | Teof
+ { Characters ['\000'] }
+ | Tchar
+ { Characters [$1] }
+ | Tstring
+ { regexp_for_string $1 }
+ | Tlbracket char_class Trbracket
+ { Characters $2 }
+ | regexp Tstar
+ { Repetition $1 }
+ | regexp Tmaybe
+ { Alternative($1, Epsilon) }
+ | regexp Tplus
+ { Sequence($1, Repetition $1) }
+ | regexp Tor regexp
+ { Alternative($1,$3) }
+ | regexp regexp %prec CONCAT
+ { Sequence($1,$2) }
+ | Tlparen regexp Trparen
+ { $2 }
+;
+char_class:
+ Tcaret char_class1
+ { subtract all_chars $2 }
+ | char_class1
+ { $1 }
+;
+char_class1:
+ Tchar Tdash Tchar
+ { char_class $1 $3 }
+ | Tchar
+ { [$1] }
+ | char_class char_class %prec CONCAT
+ { $1 @ $2 }
+;
+
+%%
+
diff --git a/test/Lex/lexgen.ml b/test/Lex/lexgen.ml
new file mode 100644
index 000000000..73d011577
--- /dev/null
+++ b/test/Lex/lexgen.ml
@@ -0,0 +1,252 @@
+(* Compiling a lexer definition *)
+
+open Syntax
+
+(* Deep abstract syntax for regular expressions *)
+
+type regexp =
+ Empty
+ | Chars of int
+ | Action of int
+ | Seq of regexp * regexp
+ | Alt of regexp * regexp
+ | Star of regexp
+
+(* From shallow to deep syntax *)
+
+(***
+
+let print_char_class c =
+ let print_interval low high =
+ prerr_int low;
+ if high - 1 > low then begin
+ prerr_char '-';
+ prerr_int (high-1)
+ end;
+ prerr_char ' ' in
+ let rec print_class first next = function
+ [] -> print_interval first next
+ | c::l ->
+ if char.code c = next
+ then print_class first (next+1) l
+ else begin
+ print_interval first next;
+ print_class (char.code c) (char.code c + 1) l
+ end in
+ match c with
+ [] -> prerr_newline()
+ | c::l -> print_class (char.code c) (char.code c + 1) l; prerr_newline()
+
+
+let rec print_regexp = function
+ Empty -> prerr_string "Empty"
+ | Chars n -> prerr_string "Chars "; prerr_int n
+ | Action n -> prerr_string "Action "; prerr_int n
+ | Seq(r1,r2) -> print_regexp r1; prerr_string "; "; print_regexp r2
+ | Alt(r1,r2) -> prerr_string "("; print_regexp r1; prerr_string " | "; print_regexp r2; prerr_string ")"
+ | Star r -> prerr_string "("; print_regexp r; prerr_string ")*"
+
+***)
+
+let chars = ref ([] : char list list)
+let chars_count = ref 0
+let actions = ref ([] : (int * location) list)
+let actions_count = ref 0
+
+let rec encode_regexp = function
+ Epsilon -> Empty
+ | Characters cl ->
+ let n = !chars_count in
+(*** prerr_int n; prerr_char ' '; print_char_class cl; ***)
+ chars := cl :: !chars;
+ chars_count := !chars_count + 1;
+ Chars(n)
+ | Sequence(r1,r2) ->
+ Seq(encode_regexp r1, encode_regexp r2)
+ | Alternative(r1,r2) ->
+ Alt(encode_regexp r1, encode_regexp r2)
+ | Repetition r ->
+ Star (encode_regexp r)
+
+
+let encode_casedef =
+ List.fold_left
+ (fun reg (expr,act) ->
+ let act_num = !actions_count in
+ actions_count := !actions_count + 1;
+ actions := (act_num, act) :: !actions;
+ Alt(reg, Seq(encode_regexp expr, Action act_num)))
+ Empty
+
+
+let encode_lexdef (Lexdef(_, ld)) =
+ chars := [];
+ chars_count := 0;
+ actions := [];
+ actions_count := 0;
+ let name_regexp_list =
+ List.map (fun (name, casedef) -> (name, encode_casedef casedef)) ld in
+(* List.iter print_char_class chars; *)
+ let chr = Array.of_list (List.rev !chars)
+ and act = !actions in
+ chars := [];
+ actions := [];
+ (chr, name_regexp_list, act)
+
+
+(* To generate directly a NFA from a regular expression.
+ Confer Aho-Sethi-Ullman, dragon book, chap. 3 *)
+
+type transition =
+ OnChars of int
+ | ToAction of int
+
+
+let rec merge_trans l1 l2 =
+ match (l1, l2) with
+ ([], s2) -> s2
+ | (s1, []) -> s1
+ | ((OnChars n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) ->
+ if n1 = n2 then t1 :: merge_trans r1 r2 else
+ if n1 < n2 then t1 :: merge_trans r1 s2 else
+ t2 :: merge_trans s1 r2
+ | ((ToAction n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) ->
+ if n1 = n2 then t1 :: merge_trans r1 r2 else
+ if n1 < n2 then t1 :: merge_trans r1 s2 else
+ t2 :: merge_trans s1 r2
+ | ((OnChars n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) ->
+ t1 :: merge_trans r1 s2
+ | ((ToAction n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) ->
+ t2 :: merge_trans s1 r2
+
+
+let rec nullable = function
+ Empty -> true
+ | Chars _ -> false
+ | Action _ -> false
+ | Seq(r1,r2) -> nullable r1 & nullable r2
+ | Alt(r1,r2) -> nullable r1 or nullable r2
+ | Star r -> true
+
+
+let rec firstpos = function
+ Empty -> []
+ | Chars pos -> [OnChars pos]
+ | Action act -> [ToAction act]
+ | Seq(r1,r2) -> if nullable r1
+ then merge_trans (firstpos r1) (firstpos r2)
+ else firstpos r1
+ | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2)
+ | Star r -> firstpos r
+
+
+let rec lastpos = function
+ Empty -> []
+ | Chars pos -> [OnChars pos]
+ | Action act -> [ToAction act]
+ | Seq(r1,r2) -> if nullable r2
+ then merge_trans (lastpos r1) (lastpos r2)
+ else lastpos r2
+ | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2)
+ | Star r -> lastpos r
+
+
+let followpos size name_regexp_list =
+ let v = Array.new size [] in
+ let fill_pos first = function
+ OnChars pos -> v.(pos) <- merge_trans first v.(pos); ()
+ | ToAction _ -> () in
+ let rec fill = function
+ Seq(r1,r2) ->
+ fill r1; fill r2;
+ List.iter (fill_pos (firstpos r2)) (lastpos r1)
+ | Alt(r1,r2) ->
+ fill r1; fill r2
+ | Star r ->
+ fill r;
+ List.iter (fill_pos (firstpos r)) (lastpos r)
+ | _ -> () in
+ List.iter (fun (name, regexp) -> fill regexp) name_regexp_list;
+ v
+
+
+let no_action = 0x3FFFFFFF
+
+let split_trans_set =
+ List.fold_left
+ (fun (act, pos_set as act_pos_set) trans ->
+ match trans with
+ OnChars pos -> (act, pos :: pos_set)
+ | ToAction act1 -> if act1 < act then (act1, pos_set)
+ else act_pos_set)
+ (no_action, [])
+
+
+let memory = (Hashtbl.new 131 : (transition list, int) Hashtbl.t)
+let todo = ref ([] : (transition list * int) list)
+let next = ref 0
+
+let get_state st =
+ try
+ Hashtbl.find memory st
+ with Not_found ->
+ let nbr = !next in
+ next := !next + 1;
+ Hashtbl.add memory st nbr;
+ todo := (st, nbr) :: !todo;
+ nbr
+
+let rec map_on_states f =
+ match !todo with
+ [] -> []
+ | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f
+
+let number_of_states () = !next
+
+let goto_state = function
+ [] -> Backtrack
+ | ps -> Goto (get_state ps)
+
+
+let transition_from chars follow pos_set =
+ let tr = Array.new 256 []
+ and shift = Array.new 256 Backtrack in
+ List.iter
+ (fun pos ->
+ List.iter
+ (fun c ->
+ tr.(Char.code c) <-
+ merge_trans tr.(Char.code c) follow.(pos))
+ chars.(pos))
+ pos_set;
+ for i = 0 to 255 do
+ shift.(i) <- goto_state tr.(i)
+ done;
+ shift
+
+
+let translate_state chars follow state =
+ match split_trans_set state with
+ n, [] -> Perform n
+ | n, ps -> Shift( (if n = no_action then No_remember else Remember n),
+ transition_from chars follow ps)
+
+
+let make_dfa lexdef =
+ let (chars, name_regexp_list, actions) =
+ encode_lexdef lexdef in
+(**
+ List.iter (fun (name, regexp) -> prerr_string name; prerr_string " = "; print_regexp regexp; prerr_newline()) name_regexp_list;
+**)
+ let follow =
+ followpos (Array.length chars) name_regexp_list in
+ let initial_states =
+ List.map (fun (name, regexp) -> (name, get_state(firstpos regexp)))
+ name_regexp_list in
+ let states =
+ map_on_states (translate_state chars follow) in
+ let v =
+ Array.new (number_of_states()) (Perform 0) in
+ List.iter (fun (auto, i) -> v.(i) <- auto) states;
+ (initial_states, v, actions)
+
diff --git a/test/Lex/main.ml b/test/Lex/main.ml
new file mode 100644
index 000000000..94902ed21
--- /dev/null
+++ b/test/Lex/main.ml
@@ -0,0 +1,104 @@
+(* The lexer generator. Command-line parsing. *)
+
+open Syntax
+open Scanner
+open Grammar
+open Lexgen
+open Output
+
+let main () =
+ if Array.length Sys.argv <> 2 then begin
+ prerr_string "Usage: camllex <input file>\n";
+ exit 2
+ end;
+ let source_name = Sys.argv.(1) in
+ let dest_name =
+ if Filename.check_suffix source_name ".mll" then
+ Filename.chop_suffix source_name ".mll" ^ ".ml"
+ else
+ source_name ^ ".ml" in
+ ic := open_in source_name;
+ oc := open_out dest_name;
+ let lexbuf = Lexing.from_channel !ic in
+ let (Lexdef(header,_) as def) =
+ try
+ Grammar.lexer_definition Scanner.main lexbuf
+ with
+ Parsing.Parse_error ->
+ prerr_string "Syntax error around char ";
+ prerr_int (Lexing.lexeme_start lexbuf);
+ prerr_endline ".";
+ exit 2
+ | Scan_aux.Lexical_error s ->
+ prerr_string "Lexical error around char ";
+ prerr_int (Lexing.lexeme_start lexbuf);
+ prerr_string ": ";
+ prerr_string s;
+ prerr_endline ".";
+ exit 2 in
+ let ((init, states, acts) as dfa) = make_dfa def in
+ output_lexdef header dfa;
+ close_in !ic;
+ close_out !oc
+
+let _ = main(); exit 0
+
+
+(*****
+let main () =
+ ic := stdin;
+ oc := stdout;
+ let lexbuf = lexing.from_channel ic in
+ let (Lexdef(header,_) as def) =
+ try
+ grammar.lexer_definition scanner.main lexbuf
+ with
+ parsing.Parse_error x ->
+ prerr_string "Syntax error around char ";
+ prerr_int (lexing.lexeme_start lexbuf);
+ prerr_endline ".";
+ sys.exit 2
+ | scan_aux.Lexical_error s ->
+ prerr_string "Lexical error around char ";
+ prerr_int (lexing.lexeme_start lexbuf);
+ prerr_string ": ";
+ prerr_string s;
+ prerr_endline ".";
+ sys.exit 2 in
+ let ((init, states, acts) as dfa) = make_dfa def in
+ output_lexdef header dfa
+
+****)
+
+(****
+let debug_scanner lexbuf =
+ let tok = scanner.main lexbuf in
+ begin match tok with
+ Tident s -> prerr_string "Tident "; prerr_string s
+ | Tchar c -> prerr_string "Tchar "; prerr_char c
+ | Tstring s -> prerr_string "Tstring "; prerr_string s
+ | Taction(Location(i1,i2)) ->
+ prerr_string "Taction "; prerr_int i1; prerr_string "-";
+ prerr_int i2
+ | Trule -> prerr_string "Trule"
+ | Tparse -> prerr_string "Tparse"
+ | Tand -> prerr_string "Tand"
+ | Tequal -> prerr_string "Tequal"
+ | Tend -> prerr_string "Tend"
+ | Tor -> prerr_string "Tor"
+ | Tunderscore -> prerr_string "Tunderscore"
+ | Teof -> prerr_string "Teof"
+ | Tlbracket -> prerr_string "Tlbracket"
+ | Trbracket -> prerr_string "Trbracket"
+ | Tstar -> prerr_string "Tstar"
+ | Tmaybe -> prerr_string "Tmaybe"
+ | Tplus -> prerr_string "Tplus"
+ | Tlparen -> prerr_string "Tlparen"
+ | Trparen -> prerr_string "Trparen"
+ | Tcaret -> prerr_string "Tcaret"
+ | Tdash -> prerr_string "Tdash"
+ end;
+ prerr_newline();
+ tok
+
+****)
diff --git a/test/Lex/output.ml b/test/Lex/output.ml
new file mode 100644
index 000000000..301edcba3
--- /dev/null
+++ b/test/Lex/output.ml
@@ -0,0 +1,155 @@
+(* Generating a DFA as a set of mutually recursive functions *)
+
+open Syntax
+
+let ic = ref stdin
+let oc = ref stdout
+
+(* 1- Generating the actions *)
+
+let copy_buffer = String.create 1024
+
+let copy_chunk (Location(start,stop)) =
+ seek_in !ic start;
+ let tocopy = ref(stop - start) in
+ while !tocopy > 0 do
+ let m =
+ input !ic copy_buffer 0 (min !tocopy (String.length copy_buffer)) in
+ output !oc copy_buffer 0 m;
+ tocopy := !tocopy - m
+ done
+
+
+let output_action (i,act) =
+ output_string !oc ("action_" ^ string_of_int i ^ " lexbuf = (\n");
+ copy_chunk act;
+ output_string !oc ")\nand "
+
+
+(* 2- Generating the states *)
+
+let states = ref ([||] : automata array)
+
+type occurrence =
+ { mutable pos: int list;
+ mutable freq: int }
+
+let enumerate_vect v =
+ let env = ref [] in
+ for pos = 0 to Array.length v - 1 do
+ try
+ let occ = List.assoc v.(pos) !env in
+ occ.pos <- pos :: occ.pos;
+ occ.freq <- occ.freq + 1
+ with Not_found ->
+ env := (v.(pos), {pos = [pos]; freq = 1 }) :: !env
+ done;
+ Sort.list (fun (e1, occ1) (e2, occ2) -> occ1.freq >= occ2.freq) !env
+
+
+let output_move = function
+ Backtrack ->
+ output_string !oc "lexing.backtrack lexbuf"
+ | Goto dest ->
+ match !states.(dest) with
+ Perform act_num ->
+ output_string !oc ("action_" ^ string_of_int act_num ^ " lexbuf")
+ | _ ->
+ output_string !oc ("state_" ^ string_of_int dest ^ " lexbuf")
+
+
+(* Cannot use standard char_for_read because the characters to escape
+ are not the same in CL6 and CL1999. *)
+
+let output_char_lit oc = function
+ '\'' -> output_string oc "\\'"
+ | '\\' -> output_string oc "\\\\"
+ | '\n' -> output_string oc "\\n"
+ | '\t' -> output_string oc "\\t"
+ | c -> if Char.code c >= 32 & Char.code c < 128 then
+ output_char oc c
+ else begin
+ let n = Char.code c in
+ output_char oc '\\';
+ output_char oc (Char.chr (48 + n / 100));
+ output_char oc (Char.chr (48 + (n / 10) mod 10));
+ output_char oc (Char.chr (48 + n mod 10))
+ end
+
+let rec output_chars = function
+ [] ->
+ failwith "output_chars"
+ | [c] ->
+ output_string !oc "'";
+ output_char_lit !oc (Char.chr c);
+ output_string !oc "'"
+ | c::cl ->
+ output_string !oc "'";
+ output_char_lit !oc (Char.chr c);
+ output_string !oc "'|";
+ output_chars cl
+
+let output_one_trans (dest, occ) =
+ output_chars occ.pos;
+ output_string !oc " -> ";
+ output_move dest;
+ output_string !oc "\n | "
+
+let output_all_trans trans =
+ output_string !oc " match lexing.next_char lexbuf with\n ";
+ match enumerate_vect trans with
+ [] ->
+ failwith "output_all_trans"
+ | (default, _) :: rest ->
+ List.iter output_one_trans rest;
+ output_string !oc "_ -> ";
+ output_move default;
+ output_string !oc "\nand "
+
+let output_state state_num = function
+ Perform i ->
+ ()
+ | Shift(what_to_do, moves) ->
+ output_string !oc
+ ("state_" ^ string_of_int state_num ^ " lexbuf =\n");
+ begin match what_to_do with
+ No_remember -> ()
+ | Remember i ->
+ output_string !oc
+ (" Lexing.set_backtrack lexbuf action_" ^
+ string_of_int i ^ ";\n")
+ end;
+ output_all_trans moves
+
+
+(* 3- Generating the entry points *)
+
+let rec output_entries = function
+ [] -> failwith "output_entries"
+ | (name,state_num) :: rest ->
+ output_string !oc (name ^ " lexbuf =\n");
+ output_string !oc " Lexing.init lexbuf;\n";
+ output_string !oc (" state_" ^ string_of_int state_num ^
+ " lexbuf\n");
+ match rest with
+ [] -> output_string !oc "\n"; ()
+ | _ -> output_string !oc "\nand "; output_entries rest
+
+
+(* All together *)
+
+let output_lexdef header (initial_st, st, actions) =
+ prerr_int (Array.length st); prerr_string " states, ";
+ prerr_int (List.length actions); prerr_string " actions.";
+ prerr_newline();
+ copy_chunk header;
+ output_string !oc "\nlet rec ";
+ states := st;
+ List.iter output_action actions;
+ for i = 0 to Array.length st - 1 do
+ output_state i st.(i)
+ done;
+ output_entries initial_st
+
+
+
diff --git a/test/Lex/scan_aux.ml b/test/Lex/scan_aux.ml
new file mode 100644
index 000000000..8b01d6347
--- /dev/null
+++ b/test/Lex/scan_aux.ml
@@ -0,0 +1,46 @@
+(* Auxiliaries for the lexical analyzer *)
+
+let brace_depth = ref 0
+let comment_depth = ref 0
+
+exception Lexical_error of string
+
+let initial_string_buffer = String.create 256
+let string_buff = ref initial_string_buffer
+let string_index = ref 0
+
+let reset_string_buffer () =
+ string_buff := initial_string_buffer;
+ string_index := 0
+
+
+let store_string_char c =
+ begin
+ if !string_index >= String.length !string_buff then begin
+ let new_buff = String.create (String.length !string_buff * 2) in
+ String.blit new_buff 0 !string_buff 0 (String.length !string_buff);
+ string_buff := new_buff
+ end
+ end;
+ String.unsafe_set !string_buff !string_index c;
+ incr string_index
+
+let get_stored_string () =
+ let s = String.sub !string_buff 0 !string_index in
+ string_buff := initial_string_buffer;
+ s
+
+
+let char_for_backslash = function
+ 'n' -> '\010' (* '\n' when bootstrapped *)
+ | 't' -> '\009' (* '\t' *)
+ | 'b' -> '\008' (* '\b' *)
+ | 'r' -> '\013' (* '\r' *)
+ | c -> c
+
+
+let char_for_decimal_code lexbuf i =
+ Char.chr(100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
+ 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
+ (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48))
+
diff --git a/test/Lex/scanner.mll b/test/Lex/scanner.mll
new file mode 100644
index 000000000..7cb13ba70
--- /dev/null
+++ b/test/Lex/scanner.mll
@@ -0,0 +1,118 @@
+(* The lexical analyzer for lexer definitions. *)
+
+{
+open Syntax
+open Grammar
+open Scan_aux
+}
+
+rule main = parse
+ [' ' '\010' '\013' '\009' ] +
+ { main lexbuf }
+ | "(*"
+ { comment_depth := 1;
+ comment lexbuf;
+ main lexbuf }
+ | (['A'-'Z' 'a'-'z'] | '_' ['A'-'Z' 'a'-'z' '\'' '0'-'9'])
+ ( '_' ? ['A'-'Z' 'a'-'z' ''' '0'-'9'] ) *
+ { match Lexing.lexeme lexbuf with
+ "rule" -> Trule
+ | "parse" -> Tparse
+ | "and" -> Tand
+ | "eof" -> Teof
+ | s -> Tident s }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ Tstring(get_stored_string()) }
+ | "'"
+ { Tchar(char lexbuf) }
+ | '{'
+ { let n1 = Lexing.lexeme_end lexbuf in
+ brace_depth := 1;
+ let n2 = action lexbuf in
+ Taction(Location(n1, n2)) }
+ | '=' { Tequal }
+ | ";;" { Tend }
+ | '|' { Tor }
+ | '_' { Tunderscore }
+ | "eof" { Teof }
+ | '[' { Tlbracket }
+ | ']' { Trbracket }
+ | '*' { Tstar }
+ | '?' { Tmaybe }
+ | '+' { Tplus }
+ | '(' { Tlparen }
+ | ')' { Trparen }
+ | '^' { Tcaret }
+ | '-' { Tdash }
+ | eof
+ { raise(Lexical_error "unterminated lexer definition") }
+ | _
+ { raise(Lexical_error("illegal character " ^ Lexing.lexeme lexbuf)) }
+
+and action = parse
+ '{'
+ { incr brace_depth;
+ action lexbuf }
+ | '}'
+ { decr brace_depth;
+ if !brace_depth = 0 then Lexing.lexeme_start lexbuf else action lexbuf }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ reset_string_buffer();
+ action lexbuf }
+ | '\''
+ { char lexbuf; action lexbuf }
+ | "(*"
+ { comment_depth := 1;
+ comment lexbuf;
+ action lexbuf }
+ | eof
+ { raise (Lexical_error "unterminated action") }
+ | _
+ { action lexbuf }
+
+and string = parse
+ '"'
+ { () }
+ | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] +
+ { string lexbuf }
+ | '\\' ['\\' '"' 'n' 't' 'b' 'r']
+ { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
+ string lexbuf }
+ | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
+ { store_string_char(char_for_decimal_code lexbuf 1);
+ string lexbuf }
+ | eof
+ { raise(Lexical_error "unterminated string") }
+ | _
+ { store_string_char(Lexing.lexeme_char lexbuf 0);
+ string lexbuf }
+
+and char = parse
+ [^ '\\'] "'"
+ { Lexing.lexeme_char lexbuf 0 }
+ | '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
+ { char_for_backslash (Lexing.lexeme_char lexbuf 1) }
+ | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
+ { char_for_decimal_code lexbuf 1 }
+ | _
+ { raise(Lexical_error "bad character constant") }
+
+and comment = parse
+ "(*"
+ { incr comment_depth; comment lexbuf }
+ | "*)"
+ { decr comment_depth;
+ if !comment_depth = 0 then () else comment lexbuf }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ reset_string_buffer();
+ comment lexbuf }
+ | eof
+ { raise(Lexical_error "unterminated comment") }
+ | _
+ { comment lexbuf }
diff --git a/test/Lex/syntax.ml b/test/Lex/syntax.ml
new file mode 100644
index 000000000..f692e6f62
--- /dev/null
+++ b/test/Lex/syntax.ml
@@ -0,0 +1,26 @@
+(* The shallow abstract syntax *)
+
+type location =
+ Location of int * int
+
+type regular_expression =
+ Epsilon
+ | Characters of char list
+ | Sequence of regular_expression * regular_expression
+ | Alternative of regular_expression * regular_expression
+ | Repetition of regular_expression
+
+type lexer_definition =
+ Lexdef of location * (string * (regular_expression * location) list) list
+
+(* Representation of automata *)
+
+type automata =
+ Perform of int
+ | Shift of automata_trans * automata_move array
+and automata_trans =
+ No_remember
+ | Remember of int
+and automata_move =
+ Backtrack
+ | Goto of int
diff --git a/test/Lex/testmain.ml b/test/Lex/testmain.ml
new file mode 100644
index 000000000..e0a914ee0
--- /dev/null
+++ b/test/Lex/testmain.ml
@@ -0,0 +1,34 @@
+(* The lexer generator. Command-line parsing. *)
+
+#open "syntax";;
+#open "testscanner";;
+#open "grammar";;
+#open "lexgen";;
+#open "output";;
+
+let main () =
+ ic := stdin;
+ oc := stdout;
+ let lexbuf = lexing.from_channel ic in
+ let (Lexdef(header,_) as def) =
+ try
+ grammar.lexer_definition testscanner.main lexbuf
+ with
+ parsing.Parse_error x ->
+ prerr_string "Syntax error around char ";
+ prerr_int (lexing.lexeme_start lexbuf);
+ prerr_endline ".";
+ sys.exit 2
+ | scan_aux.Lexical_error s ->
+ prerr_string "Lexical error around char ";
+ prerr_int (lexing.lexeme_start lexbuf);
+ prerr_string ": ";
+ prerr_string s;
+ prerr_endline ".";
+ sys.exit 2 in
+ let ((init, states, acts) as dfa) = make_dfa def in
+ output_lexdef header dfa
+;;
+
+main(); sys.exit 0
+;;
diff --git a/test/Lex/testscanner.mll b/test/Lex/testscanner.mll
new file mode 100644
index 000000000..91ada299f
--- /dev/null
+++ b/test/Lex/testscanner.mll
@@ -0,0 +1,121 @@
+(* The lexical analyzer for lexer definitions. *)
+
+{
+#open "syntax";;
+#open "grammar";;
+#open "scan_aux";;
+}
+
+rule main = parse
+ _ * "qwertyuiopasdfghjklzxcvbnm0123456789!@#$%^&*()"
+ { main lexbuf }
+ | [' ' '\010' '\013' '\009' ] +
+ { main lexbuf }
+ | "(*"
+ { comment_depth := 1;
+ comment lexbuf;
+ main lexbuf }
+ | (['A'-'Z' 'a'-'z'] | '_' ['A'-'Z' 'a'-'z' '\'' '0'-'9'])
+ ( '_' ? ['A'-'Z' 'a'-'z' ''' '0'-'9'] ) *
+ { match lexing.lexeme lexbuf with
+ "rule" -> Trule
+ | "parse" -> Tparse
+ | "and" -> Tand
+ | "eof" -> Teof
+ | s -> Tident s }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ Tstring(get_stored_string()) }
+ | "'"
+ { Tchar(char lexbuf) }
+ | '{'
+ { let n1 = lexing.lexeme_end lexbuf in
+ brace_depth := 1;
+ let n2 = action lexbuf in
+ Taction(Location(n1, n2)) }
+ | '=' { Tequal }
+ | ";;" { Tend }
+ | '|' { Tor }
+ | '_' { Tunderscore }
+ | "eof" { Teof }
+ | '[' { Tlbracket }
+ | ']' { Trbracket }
+ | '*' { Tstar }
+ | '?' { Tmaybe }
+ | '+' { Tplus }
+ | '(' { Tlparen }
+ | ')' { Trparen }
+ | '^' { Tcaret }
+ | '-' { Tdash }
+ | eof
+ { raise(Lexical_error "unterminated lexer definition") }
+ | _
+ { raise(Lexical_error("illegal character " ^ lexing.lexeme lexbuf)) }
+
+and action = parse
+ '{'
+ { brace_depth := brace_depth + 1;
+ action lexbuf }
+ | '}'
+ { brace_depth := brace_depth - 1;
+ if brace_depth = 0 then lexing.lexeme_start lexbuf else action lexbuf }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ reset_string_buffer();
+ action lexbuf }
+ | '\''
+ { char lexbuf; action lexbuf }
+ | "(*"
+ { comment_depth := 1;
+ comment lexbuf;
+ action lexbuf }
+ | eof
+ { raise (Lexical_error "unterminated action") }
+ | _
+ { action lexbuf }
+
+and string = parse
+ '"'
+ { () }
+ | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] +
+ { string lexbuf }
+ | '\\' ['\\' '"' 'n' 't' 'b' 'r']
+ { store_string_char(char_for_backslash(lexing.lexeme_char lexbuf 1));
+ string lexbuf }
+ | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
+ { store_string_char(char_for_decimal_code lexbuf 1);
+ string lexbuf }
+ | eof
+ { raise(Lexical_error "unterminated string") }
+ | _
+ { store_string_char(lexing.lexeme_char lexbuf 0);
+ string lexbuf }
+
+and char = parse
+ [^ '\\'] "'"
+ { lexing.lexeme_char lexbuf 0 }
+ | '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
+ { char_for_backslash (lexing.lexeme_char lexbuf 1) }
+ | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
+ { char_for_decimal_code lexbuf 1 }
+ | _
+ { raise(Lexical_error "bad character constant") }
+
+and comment = parse
+ "(*"
+ { comment_depth := comment_depth + 1; comment lexbuf }
+ | "*)"
+ { comment_depth := comment_depth - 1;
+ if comment_depth = 0 then () else comment lexbuf }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ reset_string_buffer();
+ comment lexbuf }
+ | eof
+ { raise(Lexical_error "unterminated comment") }
+ | _
+ { comment lexbuf }
+;;