author | Xavier Leroy <xavier.leroy@inria.fr> | 1995-05-04 10:15:53 +0000
committer | Xavier Leroy <xavier.leroy@inria.fr> | 1995-05-04 10:15:53 +0000
commit | 61bd8ace6bdb2652f4d51d64e3239a7105f56c26 (patch)
tree | e8b957df0957c1b483d41d68973824e280445548 /test/Lex
parent | 8f9ea2a7b886e3e0a5cfd76b11fe79d083a7f20c (diff)
Switch to the bootstrapped version (crossing the Rubicon)
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@2 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
Diffstat (limited to 'test/Lex')
-rw-r--r-- | test/Lex/gram_aux.ml | 33
-rw-r--r-- | test/Lex/grammar.mly | 100
-rw-r--r-- | test/Lex/lexgen.ml | 252
-rw-r--r-- | test/Lex/main.ml | 104
-rw-r--r-- | test/Lex/output.ml | 155
-rw-r--r-- | test/Lex/scan_aux.ml | 46
-rw-r--r-- | test/Lex/scanner.mll | 118
-rw-r--r-- | test/Lex/syntax.ml | 26
-rw-r--r-- | test/Lex/testmain.ml | 34
-rw-r--r-- | test/Lex/testscanner.mll | 121
10 files changed, 989 insertions, 0 deletions
diff --git a/test/Lex/gram_aux.ml b/test/Lex/gram_aux.ml new file mode 100644 index 000000000..525ee69b5 --- /dev/null +++ b/test/Lex/gram_aux.ml @@ -0,0 +1,33 @@ +(* Auxiliaries for the parser. *) + +open Syntax + +let regexp_for_string s = + let l = String.length s in + if l = 0 then + Epsilon + else begin + let re = ref(Characters [String.get s (l - 1)]) in + for i = l - 2 downto 0 do + re := Sequence(Characters [String.get s i], !re) + done; + !re + end + + +let char_class c1 c2 = + let class = ref [] in + for i = Char.code c2 downto Char.code c1 do + class := Char.chr i :: !class + done; + !class + + +let all_chars = char_class '\001' '\255' + + +let rec subtract l1 l2 = + match l1 with + [] -> [] + | a::l -> if List.mem a l2 then subtract l l2 else a :: subtract l l2 + diff --git a/test/Lex/grammar.mly b/test/Lex/grammar.mly new file mode 100644 index 000000000..eb1c8cc24 --- /dev/null +++ b/test/Lex/grammar.mly @@ -0,0 +1,100 @@ +/* The grammar for lexer definitions */ + +%{ +open Syntax +open Gram_aux +%} + +%token <string> Tident +%token <char> Tchar +%token <string> Tstring +%token <Syntax.location> Taction +%token Trule Tparse Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket +%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash + +%left Tor +%left CONCAT +%nonassoc Tmaybe +%left Tstar +%left Tplus + +%start lexer_definition +%type <Syntax.lexer_definition> lexer_definition + +%% + +lexer_definition: + header Trule definition other_definitions Tend + { Lexdef($1, $3::(List.rev $4)) } +; +header: + Taction + { $1 } + | + { Location(0,0) } +; +other_definitions: + other_definitions Tand definition + { $3::$1 } + | + { [] } +; +definition: + Tident Tequal entry + { ($1,$3) } +; +entry: + Tparse case rest_of_entry + { $2 :: List.rev $3 } +; +rest_of_entry: + rest_of_entry Tor case + { $3::$1 } + | + { [] } +; +case: + regexp Taction + { ($1,$2) } +; +regexp: + Tunderscore + { Characters all_chars } + | Teof + { Characters ['\000'] } + | Tchar + { Characters [$1] } + | Tstring + { regexp_for_string $1 } + | Tlbracket char_class Trbracket + { Characters $2 } + | regexp Tstar + { Repetition $1 } + | regexp Tmaybe + { Alternative($1, Epsilon) } + | regexp Tplus + { Sequence($1, Repetition $1) } + | regexp Tor regexp + { Alternative($1,$3) } + | regexp regexp %prec CONCAT + { Sequence($1,$2) } + | Tlparen regexp Trparen + { $2 } +; +char_class: + Tcaret char_class1 + { subtract all_chars $2 } + | char_class1 + { $1 } +; +char_class1: + Tchar Tdash Tchar + { char_class $1 $3 } + | Tchar + { [$1] } + | char_class char_class %prec CONCAT + { $1 @ $2 } +; + +%% + diff --git a/test/Lex/lexgen.ml b/test/Lex/lexgen.ml new file mode 100644 index 000000000..73d011577 --- /dev/null +++ b/test/Lex/lexgen.ml @@ -0,0 +1,252 @@ +(* Compiling a lexer definition *) + +open Syntax + +(* Deep abstract syntax for regular expressions *) + +type regexp = + Empty + | Chars of int + | Action of int + | Seq of regexp * regexp + | Alt of regexp * regexp + | Star of regexp + +(* From shallow to deep syntax *) + +(*** + +let print_char_class c = + let print_interval low high = + prerr_int low; + if high - 1 > low then begin + prerr_char '-'; + prerr_int (high-1) + end; + prerr_char ' ' in + let rec print_class first next = function + [] -> print_interval first next + | c::l -> + if char.code c = next + then print_class first (next+1) l + else begin + print_interval first next; + print_class (char.code c) (char.code c + 1) l + end in + match c with + [] -> prerr_newline() + | c::l -> print_class 
(char.code c) (char.code c + 1) l; prerr_newline() + + +let rec print_regexp = function + Empty -> prerr_string "Empty" + | Chars n -> prerr_string "Chars "; prerr_int n + | Action n -> prerr_string "Action "; prerr_int n + | Seq(r1,r2) -> print_regexp r1; prerr_string "; "; print_regexp r2 + | Alt(r1,r2) -> prerr_string "("; print_regexp r1; prerr_string " | "; print_regexp r2; prerr_string ")" + | Star r -> prerr_string "("; print_regexp r; prerr_string ")*" + +***) + +let chars = ref ([] : char list list) +let chars_count = ref 0 +let actions = ref ([] : (int * location) list) +let actions_count = ref 0 + +let rec encode_regexp = function + Epsilon -> Empty + | Characters cl -> + let n = !chars_count in +(*** prerr_int n; prerr_char ' '; print_char_class cl; ***) + chars := cl :: !chars; + chars_count := !chars_count + 1; + Chars(n) + | Sequence(r1,r2) -> + Seq(encode_regexp r1, encode_regexp r2) + | Alternative(r1,r2) -> + Alt(encode_regexp r1, encode_regexp r2) + | Repetition r -> + Star (encode_regexp r) + + +let encode_casedef = + List.fold_left + (fun reg (expr,act) -> + let act_num = !actions_count in + actions_count := !actions_count + 1; + actions := (act_num, act) :: !actions; + Alt(reg, Seq(encode_regexp expr, Action act_num))) + Empty + + +let encode_lexdef (Lexdef(_, ld)) = + chars := []; + chars_count := 0; + actions := []; + actions_count := 0; + let name_regexp_list = + List.map (fun (name, casedef) -> (name, encode_casedef casedef)) ld in +(* List.iter print_char_class chars; *) + let chr = Array.of_list (List.rev !chars) + and act = !actions in + chars := []; + actions := []; + (chr, name_regexp_list, act) + + +(* To generate directly a NFA from a regular expression. + Confer Aho-Sethi-Ullman, dragon book, chap. 3 *) + +type transition = + OnChars of int + | ToAction of int + + +let rec merge_trans l1 l2 = + match (l1, l2) with + ([], s2) -> s2 + | (s1, []) -> s1 + | ((OnChars n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) -> + if n1 = n2 then t1 :: merge_trans r1 r2 else + if n1 < n2 then t1 :: merge_trans r1 s2 else + t2 :: merge_trans s1 r2 + | ((ToAction n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) -> + if n1 = n2 then t1 :: merge_trans r1 r2 else + if n1 < n2 then t1 :: merge_trans r1 s2 else + t2 :: merge_trans s1 r2 + | ((OnChars n1 as t1) :: r1 as s1), ((ToAction n2 as t2) :: r2 as s2) -> + t1 :: merge_trans r1 s2 + | ((ToAction n1 as t1) :: r1 as s1), ((OnChars n2 as t2) :: r2 as s2) -> + t2 :: merge_trans s1 r2 + + +let rec nullable = function + Empty -> true + | Chars _ -> false + | Action _ -> false + | Seq(r1,r2) -> nullable r1 & nullable r2 + | Alt(r1,r2) -> nullable r1 or nullable r2 + | Star r -> true + + +let rec firstpos = function + Empty -> [] + | Chars pos -> [OnChars pos] + | Action act -> [ToAction act] + | Seq(r1,r2) -> if nullable r1 + then merge_trans (firstpos r1) (firstpos r2) + else firstpos r1 + | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2) + | Star r -> firstpos r + + +let rec lastpos = function + Empty -> [] + | Chars pos -> [OnChars pos] + | Action act -> [ToAction act] + | Seq(r1,r2) -> if nullable r2 + then merge_trans (lastpos r1) (lastpos r2) + else lastpos r2 + | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2) + | Star r -> lastpos r + + +let followpos size name_regexp_list = + let v = Array.new size [] in + let fill_pos first = function + OnChars pos -> v.(pos) <- merge_trans first v.(pos); () + | ToAction _ -> () in + let rec fill = function + Seq(r1,r2) -> + fill r1; fill r2; + List.iter 
(fill_pos (firstpos r2)) (lastpos r1) + | Alt(r1,r2) -> + fill r1; fill r2 + | Star r -> + fill r; + List.iter (fill_pos (firstpos r)) (lastpos r) + | _ -> () in + List.iter (fun (name, regexp) -> fill regexp) name_regexp_list; + v + + +let no_action = 0x3FFFFFFF + +let split_trans_set = + List.fold_left + (fun (act, pos_set as act_pos_set) trans -> + match trans with + OnChars pos -> (act, pos :: pos_set) + | ToAction act1 -> if act1 < act then (act1, pos_set) + else act_pos_set) + (no_action, []) + + +let memory = (Hashtbl.new 131 : (transition list, int) Hashtbl.t) +let todo = ref ([] : (transition list * int) list) +let next = ref 0 + +let get_state st = + try + Hashtbl.find memory st + with Not_found -> + let nbr = !next in + next := !next + 1; + Hashtbl.add memory st nbr; + todo := (st, nbr) :: !todo; + nbr + +let rec map_on_states f = + match !todo with + [] -> [] + | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f + +let number_of_states () = !next + +let goto_state = function + [] -> Backtrack + | ps -> Goto (get_state ps) + + +let transition_from chars follow pos_set = + let tr = Array.new 256 [] + and shift = Array.new 256 Backtrack in + List.iter + (fun pos -> + List.iter + (fun c -> + tr.(Char.code c) <- + merge_trans tr.(Char.code c) follow.(pos)) + chars.(pos)) + pos_set; + for i = 0 to 255 do + shift.(i) <- goto_state tr.(i) + done; + shift + + +let translate_state chars follow state = + match split_trans_set state with + n, [] -> Perform n + | n, ps -> Shift( (if n = no_action then No_remember else Remember n), + transition_from chars follow ps) + + +let make_dfa lexdef = + let (chars, name_regexp_list, actions) = + encode_lexdef lexdef in +(** + List.iter (fun (name, regexp) -> prerr_string name; prerr_string " = "; print_regexp regexp; prerr_newline()) name_regexp_list; +**) + let follow = + followpos (Array.length chars) name_regexp_list in + let initial_states = + List.map (fun (name, regexp) -> (name, get_state(firstpos regexp))) + name_regexp_list in + let states = + map_on_states (translate_state chars follow) in + let v = + Array.new (number_of_states()) (Perform 0) in + List.iter (fun (auto, i) -> v.(i) <- auto) states; + (initial_states, v, actions) + diff --git a/test/Lex/main.ml b/test/Lex/main.ml new file mode 100644 index 000000000..94902ed21 --- /dev/null +++ b/test/Lex/main.ml @@ -0,0 +1,104 @@ +(* The lexer generator. Command-line parsing. 
*) + +open Syntax +open Scanner +open Grammar +open Lexgen +open Output + +let main () = + if Array.length Sys.argv <> 2 then begin + prerr_string "Usage: camllex <input file>\n"; + exit 2 + end; + let source_name = Sys.argv.(1) in + let dest_name = + if Filename.check_suffix source_name ".mll" then + Filename.chop_suffix source_name ".mll" ^ ".ml" + else + source_name ^ ".ml" in + ic := open_in source_name; + oc := open_out dest_name; + let lexbuf = Lexing.from_channel !ic in + let (Lexdef(header,_) as def) = + try + Grammar.lexer_definition Scanner.main lexbuf + with + Parsing.Parse_error -> + prerr_string "Syntax error around char "; + prerr_int (Lexing.lexeme_start lexbuf); + prerr_endline "."; + exit 2 + | Scan_aux.Lexical_error s -> + prerr_string "Lexical error around char "; + prerr_int (Lexing.lexeme_start lexbuf); + prerr_string ": "; + prerr_string s; + prerr_endline "."; + exit 2 in + let ((init, states, acts) as dfa) = make_dfa def in + output_lexdef header dfa; + close_in !ic; + close_out !oc + +let _ = main(); exit 0 + + +(***** +let main () = + ic := stdin; + oc := stdout; + let lexbuf = lexing.from_channel ic in + let (Lexdef(header,_) as def) = + try + grammar.lexer_definition scanner.main lexbuf + with + parsing.Parse_error x -> + prerr_string "Syntax error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_endline "."; + sys.exit 2 + | scan_aux.Lexical_error s -> + prerr_string "Lexical error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_string ": "; + prerr_string s; + prerr_endline "."; + sys.exit 2 in + let ((init, states, acts) as dfa) = make_dfa def in + output_lexdef header dfa + +****) + +(**** +let debug_scanner lexbuf = + let tok = scanner.main lexbuf in + begin match tok with + Tident s -> prerr_string "Tident "; prerr_string s + | Tchar c -> prerr_string "Tchar "; prerr_char c + | Tstring s -> prerr_string "Tstring "; prerr_string s + | Taction(Location(i1,i2)) -> + prerr_string "Taction "; prerr_int i1; prerr_string "-"; + prerr_int i2 + | Trule -> prerr_string "Trule" + | Tparse -> prerr_string "Tparse" + | Tand -> prerr_string "Tand" + | Tequal -> prerr_string "Tequal" + | Tend -> prerr_string "Tend" + | Tor -> prerr_string "Tor" + | Tunderscore -> prerr_string "Tunderscore" + | Teof -> prerr_string "Teof" + | Tlbracket -> prerr_string "Tlbracket" + | Trbracket -> prerr_string "Trbracket" + | Tstar -> prerr_string "Tstar" + | Tmaybe -> prerr_string "Tmaybe" + | Tplus -> prerr_string "Tplus" + | Tlparen -> prerr_string "Tlparen" + | Trparen -> prerr_string "Trparen" + | Tcaret -> prerr_string "Tcaret" + | Tdash -> prerr_string "Tdash" + end; + prerr_newline(); + tok + +****) diff --git a/test/Lex/output.ml b/test/Lex/output.ml new file mode 100644 index 000000000..301edcba3 --- /dev/null +++ b/test/Lex/output.ml @@ -0,0 +1,155 @@ +(* Generating a DFA as a set of mutually recursive functions *) + +open Syntax + +let ic = ref stdin +let oc = ref stdout + +(* 1- Generating the actions *) + +let copy_buffer = String.create 1024 + +let copy_chunk (Location(start,stop)) = + seek_in !ic start; + let tocopy = ref(stop - start) in + while !tocopy > 0 do + let m = + input !ic copy_buffer 0 (min !tocopy (String.length copy_buffer)) in + output !oc copy_buffer 0 m; + tocopy := !tocopy - m + done + + +let output_action (i,act) = + output_string !oc ("action_" ^ string_of_int i ^ " lexbuf = (\n"); + copy_chunk act; + output_string !oc ")\nand " + + +(* 2- Generating the states *) + +let states = ref ([||] : automata array) + +type 
occurrence = + { mutable pos: int list; + mutable freq: int } + +let enumerate_vect v = + let env = ref [] in + for pos = 0 to Array.length v - 1 do + try + let occ = List.assoc v.(pos) !env in + occ.pos <- pos :: occ.pos; + occ.freq <- occ.freq + 1 + with Not_found -> + env := (v.(pos), {pos = [pos]; freq = 1 }) :: !env + done; + Sort.list (fun (e1, occ1) (e2, occ2) -> occ1.freq >= occ2.freq) !env + + +let output_move = function + Backtrack -> + output_string !oc "lexing.backtrack lexbuf" + | Goto dest -> + match !states.(dest) with + Perform act_num -> + output_string !oc ("action_" ^ string_of_int act_num ^ " lexbuf") + | _ -> + output_string !oc ("state_" ^ string_of_int dest ^ " lexbuf") + + +(* Cannot use standard char_for_read because the characters to escape + are not the same in CL6 and CL1999. *) + +let output_char_lit oc = function + '\'' -> output_string oc "\\'" + | '\\' -> output_string oc "\\\\" + | '\n' -> output_string oc "\\n" + | '\t' -> output_string oc "\\t" + | c -> if Char.code c >= 32 & Char.code c < 128 then + output_char oc c + else begin + let n = Char.code c in + output_char oc '\\'; + output_char oc (Char.chr (48 + n / 100)); + output_char oc (Char.chr (48 + (n / 10) mod 10)); + output_char oc (Char.chr (48 + n mod 10)) + end + +let rec output_chars = function + [] -> + failwith "output_chars" + | [c] -> + output_string !oc "'"; + output_char_lit !oc (Char.chr c); + output_string !oc "'" + | c::cl -> + output_string !oc "'"; + output_char_lit !oc (Char.chr c); + output_string !oc "'|"; + output_chars cl + +let output_one_trans (dest, occ) = + output_chars occ.pos; + output_string !oc " -> "; + output_move dest; + output_string !oc "\n | " + +let output_all_trans trans = + output_string !oc " match lexing.next_char lexbuf with\n "; + match enumerate_vect trans with + [] -> + failwith "output_all_trans" + | (default, _) :: rest -> + List.iter output_one_trans rest; + output_string !oc "_ -> "; + output_move default; + output_string !oc "\nand " + +let output_state state_num = function + Perform i -> + () + | Shift(what_to_do, moves) -> + output_string !oc + ("state_" ^ string_of_int state_num ^ " lexbuf =\n"); + begin match what_to_do with + No_remember -> () + | Remember i -> + output_string !oc + (" Lexing.set_backtrack lexbuf action_" ^ + string_of_int i ^ ";\n") + end; + output_all_trans moves + + +(* 3- Generating the entry points *) + +let rec output_entries = function + [] -> failwith "output_entries" + | (name,state_num) :: rest -> + output_string !oc (name ^ " lexbuf =\n"); + output_string !oc " Lexing.init lexbuf;\n"; + output_string !oc (" state_" ^ string_of_int state_num ^ + " lexbuf\n"); + match rest with + [] -> output_string !oc "\n"; () + | _ -> output_string !oc "\nand "; output_entries rest + + +(* All together *) + +let output_lexdef header (initial_st, st, actions) = + prerr_int (Array.length st); prerr_string " states, "; + prerr_int (List.length actions); prerr_string " actions."; + prerr_newline(); + copy_chunk header; + output_string !oc "\nlet rec "; + states := st; + List.iter output_action actions; + for i = 0 to Array.length st - 1 do + output_state i st.(i) + done; + output_entries initial_st + + + diff --git a/test/Lex/scan_aux.ml b/test/Lex/scan_aux.ml new file mode 100644 index 000000000..8b01d6347 --- /dev/null +++ b/test/Lex/scan_aux.ml @@ -0,0 +1,46 @@ +(* Auxiliaries for the lexical analyzer *) + +let brace_depth = ref 0 +let comment_depth = ref 0 + +exception Lexical_error of string + +let initial_string_buffer = 
String.create 256 +let string_buff = ref initial_string_buffer +let string_index = ref 0 + +let reset_string_buffer () = + string_buff := initial_string_buffer; + string_index := 0 + + +let store_string_char c = + begin + if !string_index >= String.length !string_buff then begin + let new_buff = String.create (String.length !string_buff * 2) in + String.blit new_buff 0 !string_buff 0 (String.length !string_buff); + string_buff := new_buff + end + end; + String.unsafe_set !string_buff !string_index c; + incr string_index + +let get_stored_string () = + let s = String.sub !string_buff 0 !string_index in + string_buff := initial_string_buffer; + s + + +let char_for_backslash = function + 'n' -> '\010' (* '\n' when bootstrapped *) + | 't' -> '\009' (* '\t' *) + | 'b' -> '\008' (* '\b' *) + | 'r' -> '\013' (* '\r' *) + | c -> c + + +let char_for_decimal_code lexbuf i = + Char.chr(100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + + 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + + (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48)) + diff --git a/test/Lex/scanner.mll b/test/Lex/scanner.mll new file mode 100644 index 000000000..7cb13ba70 --- /dev/null +++ b/test/Lex/scanner.mll @@ -0,0 +1,118 @@ +(* The lexical analyzer for lexer definitions. *) + +{ +open Syntax +open Grammar +open Scan_aux +} + +rule main = parse + [' ' '\010' '\013' '\009' ] + + { main lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + main lexbuf } + | (['A'-'Z' 'a'-'z'] | '_' ['A'-'Z' 'a'-'z' '\'' '0'-'9']) + ( '_' ? ['A'-'Z' 'a'-'z' ''' '0'-'9'] ) * + { match Lexing.lexeme lexbuf with + "rule" -> Trule + | "parse" -> Tparse + | "and" -> Tand + | "eof" -> Teof + | s -> Tident s } + | '"' + { reset_string_buffer(); + string lexbuf; + Tstring(get_stored_string()) } + | "'" + { Tchar(char lexbuf) } + | '{' + { let n1 = Lexing.lexeme_end lexbuf in + brace_depth := 1; + let n2 = action lexbuf in + Taction(Location(n1, n2)) } + | '=' { Tequal } + | ";;" { Tend } + | '|' { Tor } + | '_' { Tunderscore } + | "eof" { Teof } + | '[' { Tlbracket } + | ']' { Trbracket } + | '*' { Tstar } + | '?' 
{ Tmaybe } + | '+' { Tplus } + | '(' { Tlparen } + | ')' { Trparen } + | '^' { Tcaret } + | '-' { Tdash } + | eof + { raise(Lexical_error "unterminated lexer definition") } + | _ + { raise(Lexical_error("illegal character " ^ Lexing.lexeme lexbuf)) } + +and action = parse + '{' + { incr brace_depth; + action lexbuf } + | '}' + { decr brace_depth; + if !brace_depth = 0 then Lexing.lexeme_start lexbuf else action lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + action lexbuf } + | '\'' + { char lexbuf; action lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + action lexbuf } + | eof + { raise (Lexical_error "unterminated action") } + | _ + { action lexbuf } + +and string = parse + '"' + { () } + | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] + + { string lexbuf } + | '\\' ['\\' '"' 'n' 't' 'b' 'r'] + { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)); + string lexbuf } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] + { store_string_char(char_for_decimal_code lexbuf 1); + string lexbuf } + | eof + { raise(Lexical_error "unterminated string") } + | _ + { store_string_char(Lexing.lexeme_char lexbuf 0); + string lexbuf } + +and char = parse + [^ '\\'] "'" + { Lexing.lexeme_char lexbuf 0 } + | '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" + { char_for_backslash (Lexing.lexeme_char lexbuf 1) } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" + { char_for_decimal_code lexbuf 1 } + | _ + { raise(Lexical_error "bad character constant") } + +and comment = parse + "(*" + { incr comment_depth; comment lexbuf } + | "*)" + { decr comment_depth; + if !comment_depth = 0 then () else comment lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + comment lexbuf } + | eof + { raise(Lexical_error "unterminated comment") } + | _ + { comment lexbuf } diff --git a/test/Lex/syntax.ml b/test/Lex/syntax.ml new file mode 100644 index 000000000..f692e6f62 --- /dev/null +++ b/test/Lex/syntax.ml @@ -0,0 +1,26 @@ +(* The shallow abstract syntax *) + +type location = + Location of int * int + +type regular_expression = + Epsilon + | Characters of char list + | Sequence of regular_expression * regular_expression + | Alternative of regular_expression * regular_expression + | Repetition of regular_expression + +type lexer_definition = + Lexdef of location * (string * (regular_expression * location) list) list + +(* Representation of automata *) + +type automata = + Perform of int + | Shift of automata_trans * automata_move array +and automata_trans = + No_remember + | Remember of int +and automata_move = + Backtrack + | Goto of int diff --git a/test/Lex/testmain.ml b/test/Lex/testmain.ml new file mode 100644 index 000000000..e0a914ee0 --- /dev/null +++ b/test/Lex/testmain.ml @@ -0,0 +1,34 @@ +(* The lexer generator. Command-line parsing. 
*) + +#open "syntax";; +#open "testscanner";; +#open "grammar";; +#open "lexgen";; +#open "output";; + +let main () = + ic := stdin; + oc := stdout; + let lexbuf = lexing.from_channel ic in + let (Lexdef(header,_) as def) = + try + grammar.lexer_definition testscanner.main lexbuf + with + parsing.Parse_error x -> + prerr_string "Syntax error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_endline "."; + sys.exit 2 + | scan_aux.Lexical_error s -> + prerr_string "Lexical error around char "; + prerr_int (lexing.lexeme_start lexbuf); + prerr_string ": "; + prerr_string s; + prerr_endline "."; + sys.exit 2 in + let ((init, states, acts) as dfa) = make_dfa def in + output_lexdef header dfa +;; + +main(); sys.exit 0 +;; diff --git a/test/Lex/testscanner.mll b/test/Lex/testscanner.mll new file mode 100644 index 000000000..91ada299f --- /dev/null +++ b/test/Lex/testscanner.mll @@ -0,0 +1,121 @@ +(* The lexical analyzer for lexer definitions. *) + +{ +#open "syntax";; +#open "grammar";; +#open "scan_aux";; +} + +rule main = parse + _ * "qwertyuiopasdfghjklzxcvbnm0123456789!@#$%^&*()" + { main lexbuf } + | [' ' '\010' '\013' '\009' ] + + { main lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + main lexbuf } + | (['A'-'Z' 'a'-'z'] | '_' ['A'-'Z' 'a'-'z' '\'' '0'-'9']) + ( '_' ? ['A'-'Z' 'a'-'z' ''' '0'-'9'] ) * + { match lexing.lexeme lexbuf with + "rule" -> Trule + | "parse" -> Tparse + | "and" -> Tand + | "eof" -> Teof + | s -> Tident s } + | '"' + { reset_string_buffer(); + string lexbuf; + Tstring(get_stored_string()) } + | "'" + { Tchar(char lexbuf) } + | '{' + { let n1 = lexing.lexeme_end lexbuf in + brace_depth := 1; + let n2 = action lexbuf in + Taction(Location(n1, n2)) } + | '=' { Tequal } + | ";;" { Tend } + | '|' { Tor } + | '_' { Tunderscore } + | "eof" { Teof } + | '[' { Tlbracket } + | ']' { Trbracket } + | '*' { Tstar } + | '?' 
{ Tmaybe } + | '+' { Tplus } + | '(' { Tlparen } + | ')' { Trparen } + | '^' { Tcaret } + | '-' { Tdash } + | eof + { raise(Lexical_error "unterminated lexer definition") } + | _ + { raise(Lexical_error("illegal character " ^ lexing.lexeme lexbuf)) } + +and action = parse + '{' + { brace_depth := brace_depth + 1; + action lexbuf } + | '}' + { brace_depth := brace_depth - 1; + if brace_depth = 0 then lexing.lexeme_start lexbuf else action lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + action lexbuf } + | '\'' + { char lexbuf; action lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + action lexbuf } + | eof + { raise (Lexical_error "unterminated action") } + | _ + { action lexbuf } + +and string = parse + '"' + { () } + | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] + + { string lexbuf } + | '\\' ['\\' '"' 'n' 't' 'b' 'r'] + { store_string_char(char_for_backslash(lexing.lexeme_char lexbuf 1)); + string lexbuf } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] + { store_string_char(char_for_decimal_code lexbuf 1); + string lexbuf } + | eof + { raise(Lexical_error "unterminated string") } + | _ + { store_string_char(lexing.lexeme_char lexbuf 0); + string lexbuf } + +and char = parse + [^ '\\'] "'" + { lexing.lexeme_char lexbuf 0 } + | '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" + { char_for_backslash (lexing.lexeme_char lexbuf 1) } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" + { char_for_decimal_code lexbuf 1 } + | _ + { raise(Lexical_error "bad character constant") } + +and comment = parse + "(*" + { comment_depth := comment_depth + 1; comment lexbuf } + | "*)" + { comment_depth := comment_depth - 1; + if comment_depth = 0 then () else comment lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + comment lexbuf } + | eof + { raise(Lexical_error "unterminated comment") } + | _ + { comment lexbuf } +;; |
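
The helper functions in gram_aux.ml are written in the 1995 dialect of Caml and no longer compile as-is (for example, `class` has since become a keyword). A minimal re-expression in current OCaml, reusing the `regular_expression` type that syntax.ml declares and renaming the accumulator, might look like this; it is a sketch, not part of the commit:

```ocaml
(* Sketch only: same logic as gram_aux.ml, adapted to current OCaml.
   The type is the one declared in syntax.ml; the accumulator in
   char_class is renamed because `class` is now a reserved word. *)
type regular_expression =
    Epsilon
  | Characters of char list
  | Sequence of regular_expression * regular_expression
  | Alternative of regular_expression * regular_expression
  | Repetition of regular_expression

(* "abc" becomes
   Sequence (Characters ['a'], Sequence (Characters ['b'], Characters ['c'])),
   built right to left. *)
let regexp_for_string s =
  let l = String.length s in
  if l = 0 then Epsilon
  else begin
    let re = ref (Characters [s.[l - 1]]) in
    for i = l - 2 downto 0 do
      re := Sequence (Characters [s.[i]], !re)
    done;
    !re
  end

(* All characters from c1 to c2 inclusive, e.g. char_class 'a' 'z'. *)
let char_class c1 c2 =
  let cl = ref [] in
  for i = Char.code c2 downto Char.code c1 do
    cl := Char.chr i :: !cl
  done;
  !cl
```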
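lexgen.ml keeps its transition sets as sorted, duplicate-free lists and combines them with merge_trans, a two-pointer merge. The same idea on plain integers, as a hedged sketch (the real function additionally distinguishes OnChars from ToAction entries):

```ocaml
(* Merge two sorted lists, dropping duplicates, as merge_trans does
   for transition sets. *)
let rec merge_sorted l1 l2 =
  match l1, l2 with
  | [], s | s, [] -> s
  | x :: r1, y :: r2 ->
      if x = y then x :: merge_sorted r1 r2
      else if x < y then x :: merge_sorted r1 l2
      else y :: merge_sorted l1 r2

let _ = assert (merge_sorted [1; 3; 5] [2; 3; 6] = [1; 2; 3; 5; 6])
```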
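The comment in lexgen.ml points at the Aho-Sethi-Ullman construction: each Chars or Action leaf carries a position, nullable says whether an expression matches the empty word, and firstpos/lastpos/followpos computed over those positions drive the automaton directly. A simplified sketch, using plain integer positions and list append instead of the sorted merge above, with the obsolete `&`/`or` operators written `&&`/`||`:

```ocaml
(* Deep syntax as in lexgen.ml: leaves are numbered positions. *)
type regexp =
    Empty
  | Chars of int
  | Action of int
  | Seq of regexp * regexp
  | Alt of regexp * regexp
  | Star of regexp

let rec nullable = function
    Empty | Star _ -> true
  | Chars _ | Action _ -> false
  | Seq (r1, r2) -> nullable r1 && nullable r2
  | Alt (r1, r2) -> nullable r1 || nullable r2

(* Positions that can start a word matched by the expression. *)
let rec firstpos = function
    Empty -> []
  | Chars p -> [p]
  | Action a -> [a]
  | Seq (r1, r2) ->
      if nullable r1 then firstpos r1 @ firstpos r2 else firstpos r1
  | Alt (r1, r2) -> firstpos r1 @ firstpos r2
  | Star r -> firstpos r

(* For (a|b)* c with positions a=0, b=1, c=2, firstpos is [0; 1; 2]:
   the starred group is nullable, so the first character may be c. *)
let _ = firstpos (Seq (Star (Alt (Chars 0, Chars 1)), Chars 2))
```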
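The subset construction in lexgen.ml then numbers each DFA state (a set of positions) on first sight and queues it for later translation; map_on_states drains that worklist, which get_state may grow while translation is in progress. A sketch of the bookkeeping, with states abstracted as sorted int lists and the old `Hashtbl.new` replaced by today's `Hashtbl.create`:

```ocaml
(* State numbering: structural hashing plus a worklist of states still
   to be translated. *)
let memory : (int list, int) Hashtbl.t = Hashtbl.create 131
let todo : (int list * int) list ref = ref []
let next = ref 0

let get_state st =
  try Hashtbl.find memory st
  with Not_found ->
    let n = !next in
    incr next;
    Hashtbl.add memory st n;
    todo := (st, n) :: !todo;
    n

(* Apply f to every pending state, including states f itself creates
   by calling get_state. *)
let rec map_on_states f =
  match !todo with
  | [] -> []
  | (st, i) :: rest ->
      todo := rest;
      let r = f st in
      (r, i) :: map_on_states f
```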
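When output.ml prints a state, enumerate_vect groups the 256 possible input characters by destination, and the most frequent destination becomes the final `_ ->` branch, which keeps the generated `match` short. A sketch of that grouping with destinations as plain ints, using List.assoc_opt and List.sort in place of the old Sort.list:

```ocaml
(* Group array indices by value; most frequent value first, so the
   caller can print it as the default branch. *)
let enumerate_vect v =
  let env = ref [] in
  Array.iteri
    (fun pos dest ->
      match List.assoc_opt dest !env with
      | Some occ -> occ := pos :: !occ
      | None -> env := (dest, ref [pos]) :: !env)
    v;
  List.sort
    (fun (_, o1) (_, o2) -> compare (List.length !o2) (List.length !o1))
    !env

(* enumerate_vect [| 7; 7; 3; 7 |] puts destination 7 first:
   it owns positions 0, 1 and 3. *)
let _ = enumerate_vect [| 7; 7; 3; 7 |]
```

The head of the returned list supplies the `_ ->` default; the remaining groups are printed as explicit character alternatives.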
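scan_aux.ml accumulates string literals in a hand-grown mutable string (String.create, String.blit, doubling on overflow), which predates both immutable strings and the Buffer module. Today the same three operations reduce to a Buffer; a minimal sketch:

```ocaml
(* Buffer-based equivalent of the string accumulation in scan_aux.ml. *)
let string_buff = Buffer.create 256

let reset_string_buffer () = Buffer.clear string_buff

let store_string_char c = Buffer.add_char string_buff c

let get_stored_string () =
  let s = Buffer.contents string_buff in
  Buffer.clear string_buff;
  s
```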
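Both scanner.mll and scan_aux.ml decode the `\ddd` escape by reading three decimal digits out of the lexeme. Stripped of the lexbuf plumbing, the arithmetic is just the following (char_of_decimal_digits is a name invented for this sketch):

```ocaml
(* '\065' -> 'A': three ASCII digits to the character they denote. *)
let char_of_decimal_digits d1 d2 d3 =
  Char.chr
    (100 * (Char.code d1 - 48)
     + 10 * (Char.code d2 - 48)
     + (Char.code d3 - 48))

let _ = assert (char_of_decimal_digits '0' '6' '5' = 'A')
```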