(***********************************************************************) (* *) (* Objective Caml *) (* *) (* Xavier Leroy, projet Cristal, *) (* Luc Maranget, projet Moscova, *) (* INRIA Rocquencourt *) (* *) (* Copyright 1996 Institut National de Recherche en Informatique et *) (* en Automatique. All rights reserved. This file is distributed *) (* under the terms of the Q Public License version 1.0. *) (* *) (***********************************************************************) (* $Id$ *) (* Compiling a lexer definition *) open Syntax open Printf exception Memory_overflow (* Deep abstract syntax for regular expressions *) type tag_info = {id : string ; start : bool ; action : int} type regexp = Empty | Chars of int * bool | Action of int | Tag of tag_info | Seq of regexp * regexp | Alt of regexp * regexp | Star of regexp type tag_base = Start | End | Mem of int type tag_addr = Sum of (tag_base * int) type ident_info = | Ident_string of bool * tag_addr * tag_addr | Ident_char of bool * tag_addr type t_env = (string * ident_info) list type ('args,'action) lexer_entry = { lex_name: string; lex_regexp: regexp; lex_mem_tags: int ; lex_actions: (int * t_env * 'action) list } type automata = Perform of int * tag_action list | Shift of automata_trans * (automata_move * memory_action list) array and automata_trans = No_remember | Remember of int * tag_action list and automata_move = Backtrack | Goto of int and memory_action = | Copy of int * int | Set of int and tag_action = SetTag of int * int | EraseTag of int (* Representation of entry points *) type ('args,'action) automata_entry = { auto_name: string; auto_args: 'args ; auto_mem_size : int ; auto_initial_state: int * memory_action list; auto_actions: (int * t_env * 'action) list } (* A lot of sets and map structures *) module Ints = Set.Make(struct type t = int let compare = compare end) module Tags = Set.Make(struct type t = tag_info let compare = compare end) module TagMap = Map.Make (struct type t = tag_info let compare = compare end) module StringSet = Set.Make (struct type t = string let compare = Pervasives.compare end) module StringMap = Map.Make (struct type t = string let compare = Pervasives.compare end) (*********************) (* Variable cleaning *) (*********************) (* Silently eliminate nested variables *) let rec do_remove_nested to_remove = function | Bind (e,x) -> if StringSet.mem x to_remove then do_remove_nested to_remove e else Bind (do_remove_nested (StringSet.add x to_remove) e, x) | Epsilon|Eof|Characters _ as e -> e | Sequence (e1, e2) -> Sequence (do_remove_nested to_remove e1, do_remove_nested to_remove e2) | Alternative (e1, e2) -> Alternative (do_remove_nested to_remove e1, do_remove_nested to_remove e2) | Repetition e -> Repetition (do_remove_nested to_remove e) let remove_nested_as e = do_remove_nested StringSet.empty e (*********************) (* Variable analysis *) (*********************) (* Optional variables. A variable is optional when matching of regexp does not implies it binds. The typical case is: ("" | 'a' as x) -> optional ("" as x | 'a' as x) -> non-optional *) let stringset_delta s1 s2 = StringSet.union (StringSet.diff s1 s2) (StringSet.diff s2 s1) let rec find_all_vars = function | Characters _|Epsilon|Eof -> StringSet.empty | Bind (e,x) -> StringSet.add x (find_all_vars e) | Sequence (e1,e2)|Alternative (e1,e2) -> StringSet.union (find_all_vars e1) (find_all_vars e2) | Repetition e -> find_all_vars e let rec do_find_opt = function | Characters _|Epsilon|Eof -> StringSet.empty, StringSet.empty | Bind (e,x) -> let opt,all = do_find_opt e in opt, StringSet.add x all | Sequence (e1,e2) -> let opt1,all1 = do_find_opt e1 and opt2,all2 = do_find_opt e2 in StringSet.union opt1 opt2, StringSet.union all1 all2 | Alternative (e1,e2) -> let opt1,all1 = do_find_opt e1 and opt2,all2 = do_find_opt e2 in StringSet.union (stringset_delta opt1 opt2) (stringset_delta all1 all2), StringSet.union all1 all2 | Repetition e -> let r = find_all_vars e in r,r let find_optional e = let r,_ = do_find_opt e in r (* Double variables A variable is double when it can be bound more than once in a single matching The typical case is: (e1 as x) (e2 as x) *) let rec do_find_double = function | Characters _|Epsilon|Eof -> StringSet.empty, StringSet.empty | Bind (e,x) -> let dbl,all = do_find_double e in (if StringSet.mem x all then StringSet.add x dbl else dbl), StringSet.add x all | Sequence (e1,e2) -> let dbl1, all1 = do_find_double e1 and dbl2, all2 = do_find_double e2 in StringSet.union (StringSet.inter all1 all2) (StringSet.union dbl1 dbl2), StringSet.union all1 all2 | Alternative (e1,e2) -> let dbl1, all1 = do_find_double e1 and dbl2, all2 = do_find_double e2 in StringSet.union dbl1 dbl2, StringSet.union all1 all2 | Repetition e -> let r = find_all_vars e in r,r let find_double e = do_find_double e (* Type of variables: A variable is bound to a char when all its occurences bind a pattern of length 1. The typical case is: (_ as x) -> char *) let add_some x = function | Some i -> Some (x+i) | None -> None let add_some_some x y = match x,y with | Some i, Some j -> Some (i+j) | _,_ -> None let rec do_find_chars sz = function | Epsilon|Eof -> StringSet.empty, StringSet.empty, sz | Characters _ -> StringSet.empty, StringSet.empty, add_some 1 sz | Bind (e,x) -> let c,s,e_sz = do_find_chars (Some 0) e in begin match e_sz with | Some 1 -> StringSet.add x c,s,add_some 1 sz | _ -> c, StringSet.add x s, add_some_some sz e_sz end | Sequence (e1,e2) -> let c1,s1,sz1 = do_find_chars sz e1 in let c2,s2,sz2 = do_find_chars sz1 e2 in StringSet.union c1 c2, StringSet.union s1 s2, sz2 | Alternative (e1,e2) -> let c1,s1,sz1 = do_find_chars sz e1 and c2,s2,sz2 = do_find_chars sz e2 in StringSet.union c1 c2, StringSet.union s1 s2, (if sz1 = sz2 then sz1 else None) | Repetition e -> do_find_chars None e let find_chars e = let c,s,_ = do_find_chars (Some 0) e in StringSet.diff c s (*******************************) (* From shallow to deep syntax *) (*******************************) let chars = ref ([] : Cset.t list) let chars_count = ref 0 let rec encode_regexp char_vars act = function Epsilon -> Empty | Characters cl -> let n = !chars_count in chars := cl :: !chars; incr chars_count; Chars(n,false) | Eof -> let n = !chars_count in chars := Cset.eof :: !chars; incr chars_count; Chars(n,true) | Sequence(r1,r2) -> let r1 = encode_regexp char_vars act r1 in let r2 = encode_regexp char_vars act r2 in Seq (r1, r2) | Alternative(r1,r2) -> let r1 = encode_regexp char_vars act r1 in let r2 = encode_regexp char_vars act r2 in Alt(r1, r2) | Repetition r -> let r = encode_regexp char_vars act r in Star r | Bind (r,x) -> let r = encode_regexp char_vars act r in if StringSet.mem x char_vars then Seq (Tag {id=x ; start=true ; action=act},r) else Seq (Tag {id=x ; start=true ; action=act}, Seq (r, Tag {id=x ; start=false ; action=act})) (* Optimisation, Static optimization : Replace tags by offsets relative to the beginning or end of matched string. Dynamic optimization: Replace some non-optional, non-double tags by offsets w.r.t a previous similar tag. *) let incr_pos = function | None -> None | Some i -> Some (i+1) let decr_pos = function | None -> None | Some i -> Some (i-1) let opt = true let mk_seq r1 r2 = match r1,r2 with | Empty,_ -> r2 | _,Empty -> r1 | _,_ -> Seq (r1,r2) let add_pos p i = match p with | Some (Sum (a,n)) -> Some (Sum (a,n+i)) | None -> None let opt_regexp all_vars char_vars optional_vars double_vars r = (* From removed tags to their addresses *) let env = Hashtbl.create 17 in (* First static optimizations, from start position *) let rec size_forward pos = function | Empty|Chars (_,true)|Tag _ -> Some pos | Chars (_,false) -> Some (pos+1) | Seq (r1,r2) -> begin match size_forward pos r1 with | None -> None | Some pos -> size_forward pos r2 end | Alt (r1,r2) -> let pos1 = size_forward pos r1 and pos2 = size_forward pos r2 in if pos1=pos2 then pos1 else None | Star _ -> None | Action _ -> assert false in let rec simple_forward pos r = match r with | Tag n -> if StringSet.mem n.id double_vars then r,Some pos else begin Hashtbl.add env (n.id,n.start) (Sum (Start, pos)) ; Empty,Some pos end | Empty -> r, Some pos | Chars (_,is_eof) -> r,Some (if is_eof then pos else pos+1) | Seq (r1,r2) -> let r1,pos = simple_forward pos r1 in begin match pos with | None -> mk_seq r1 r2,None | Some pos -> let r2,pos = simple_forward pos r2 in mk_seq r1 r2,pos end | Alt (r1,r2) -> let pos1 = size_forward pos r1 and pos2 = size_forward pos r2 in r,(if pos1=pos2 then pos1 else None) | Star _ -> r,None | Action _ -> assert false in (* Then static optimizations, from end position *) let rec size_backward pos = function | Empty|Chars (_,true)|Tag _ -> Some pos | Chars (_,false) -> Some (pos-1) | Seq (r1,r2) -> begin match size_backward pos r2 with | None -> None | Some pos -> size_backward pos r1 end | Alt (r1,r2) -> let pos1 = size_backward pos r1 and pos2 = size_backward pos r2 in if pos1=pos2 then pos1 else None | Star _ -> None | Action _ -> assert false in let rec simple_backward pos r = match r with | Tag n -> if StringSet.mem n.id double_vars then r,Some pos else begin Hashtbl.add env (n.id,n.start) (Sum (End, pos)) ; Empty,Some pos end | Empty -> r,Some pos | Chars (_,is_eof) -> r,Some (if is_eof then pos else pos-1) | Seq (r1,r2) -> let r2,pos = simple_backward pos r2 in begin match pos with | None -> mk_seq r1 r2,None | Some pos -> let r1,pos = simple_backward pos r1 in mk_seq r1 r2,pos end | Alt (r1,r2) -> let pos1 = size_backward pos r1 and pos2 = size_backward pos r2 in r,(if pos1=pos2 then pos1 else None) | Star _ -> r,None | Action _ -> assert false in let r = if opt then let r,_ = simple_forward 0 r in let r,_ = simple_backward 0 r in r else r in let loc_count = ref 0 in let get_tag_addr t = try Hashtbl.find env t with | Not_found -> let n = !loc_count in incr loc_count ; Hashtbl.add env t (Sum (Mem n,0)) ; Sum (Mem n,0) in let rec alloc_exp pos r = match r with | Tag n -> if StringSet.mem n.id double_vars then r,pos else begin match pos with | Some a -> Hashtbl.add env (n.id,n.start) a ; Empty,pos | None -> let a = get_tag_addr (n.id,n.start) in r,Some a end | Empty -> r,pos | Chars (_,is_eof) -> r,(if is_eof then pos else add_pos pos 1) | Seq (r1,r2) -> let r1,pos = alloc_exp pos r1 in let r2,pos = alloc_exp pos r2 in mk_seq r1 r2,pos | Alt (_,_) -> let off = size_forward 0 r in begin match off with | Some i -> r,add_pos pos i | None -> r,None end | Star _ -> r,None | Action _ -> assert false in let r,_ = alloc_exp None r in let m = StringSet.fold (fun x r -> let v = if StringSet.mem x char_vars then Ident_char (StringSet.mem x optional_vars, get_tag_addr (x,true)) else Ident_string (StringSet.mem x optional_vars, get_tag_addr (x,true), get_tag_addr (x,false)) in (x,v)::r) all_vars [] in m,r, !loc_count let encode_casedef casedef = let r = List.fold_left (fun (reg,actions,count,ntags) (expr, act) -> let expr = remove_nested_as expr in let char_vars = find_chars expr in let r = encode_regexp char_vars count expr and opt_vars = find_optional expr and double_vars,all_vars = find_double expr in let m,r,loc_ntags = opt_regexp all_vars char_vars opt_vars double_vars r in Alt(reg, Seq(r, Action count)), (count, m ,act) :: actions, (succ count), max loc_ntags ntags) (Empty, [], 0, 0) casedef in r let encode_lexdef def = chars := []; chars_count := 0; let entry_list = List.map (fun {name=entry_name ; args=args ; shortest=shortest ; clauses= casedef} -> let (re,actions,_,ntags) = encode_casedef casedef in { lex_name = entry_name; lex_regexp = re; lex_mem_tags = ntags ; lex_actions = List.rev actions },args,shortest) def in let chr = Array.of_list (List.rev !chars) in chars := []; (chr, entry_list) (* To generate directly a NFA from a regular expression. Confer Aho-Sethi-Ullman, dragon book, chap. 3 Extension to tagged automata. Confer Ville Larikari ``NFAs with Tagged Transitions, their Conversion to Deterministic Automata and Application to Regular Expressions''. Symposium on String Processing and Information Retrieval (SPIRE 2000), http://kouli.iki.fi/~vlaurika/spire2000-tnfa.ps (See also) http://kouli.iki.fi/~vlaurika/regex-submatch.ps.gz *) type t_transition = OnChars of int | ToAction of int type transition = t_transition * Tags.t let compare_trans (t1,tags1) (t2,tags2) = match Pervasives.compare t1 t2 with | 0 -> Tags.compare tags1 tags2 | r -> r module TransSet = Set.Make(struct type t = transition let compare = compare end) let rec nullable = function | Empty|Tag _ -> true | Chars (_,_)|Action _ -> false | Seq(r1,r2) -> nullable r1 && nullable r2 | Alt(r1,r2) -> nullable r1 || nullable r2 | Star r -> true let rec emptymatch = function | Empty | Chars (_,_) | Action _ -> Tags.empty | Tag t -> Tags.add t Tags.empty | Seq (r1,r2) -> Tags.union (emptymatch r1) (emptymatch r2) | Alt(r1,r2) -> if nullable r1 then emptymatch r1 else emptymatch r2 | Star r -> if nullable r then emptymatch r else Tags.empty let addtags transs tags = TransSet.fold (fun (t,tags_t) r -> TransSet.add (t, Tags.union tags tags_t) r) transs TransSet.empty let rec firstpos = function Empty|Tag _ -> TransSet.empty | Chars (pos,_) -> TransSet.add (OnChars pos,Tags.empty) TransSet.empty | Action act -> TransSet.add (ToAction act,Tags.empty) TransSet.empty | Seq(r1,r2) -> if nullable r1 then TransSet.union (firstpos r1) (addtags (firstpos r2) (emptymatch r1)) else firstpos r1 | Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2) | Star r -> firstpos r (* Berry-sethi followpos *) let followpos size entry_list = let v = Array.create size TransSet.empty in let rec fill s = function | Empty|Action _|Tag _ -> () | Chars (n,_) -> v.(n) <- s | Alt (r1,r2) -> fill s r1 ; fill s r2 | Seq (r1,r2) -> fill (if nullable r2 then TransSet.union (firstpos r2) (addtags s (emptymatch r2)) else (firstpos r2)) r1 ; fill s r2 | Star r -> fill (TransSet.union (firstpos r) s) r in List.iter (fun (entry,_,_) -> fill TransSet.empty entry.lex_regexp) entry_list ; v (************************) (* The algorithm itself *) (************************) let no_action = max_int module StateSet = Set.Make (struct type t = t_transition let compare = Pervasives.compare end) module MemMap = Map.Make (struct type t = int let compare = Pervasives.compare end) type 'a dfa_state = {final : int * ('a * int TagMap.t) ; others : ('a * int TagMap.t) MemMap.t} (* let dtag oc t = fprintf oc "%s<%s>" t.id (if t.start then "s" else "e") let dmem_map dp ds m = MemMap.iter (fun k x -> eprintf "%d -> " k ; dp x ; ds ()) m and dtag_map dp ds m = TagMap.iter (fun t x -> dtag stderr t ; eprintf " -> " ; dp x ; ds ()) m let dstate {final=(act,(_,m)) ; others=o} = if act <> no_action then begin eprintf "final=%d " act ; dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m ; prerr_endline "" end ; dmem_map (fun (_,m) -> dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m) (fun () -> prerr_endline "") o *) let dfa_state_empty = {final=(no_action, (max_int,TagMap.empty)) ; others=MemMap.empty} and dfa_state_is_empty {final=(act,_) ; others=o} = act = no_action && o = MemMap.empty (* A key is an abstraction on a dfa state, two states with the same key can be made the same by copying some memory cells into others *) module StateSetSet = Set.Make (struct type t = StateSet.t let compare = StateSet.compare end) type t_equiv = {tag:tag_info ; equiv:StateSetSet.t} module MemKey = Set.Make (struct type t = t_equiv let compare e1 e2 = match Pervasives.compare e1.tag e2.tag with | 0 -> StateSetSet.compare e1.equiv e2.equiv | r -> r end) type dfa_key = {kstate : StateSet.t ; kmem : MemKey.t} (* Map a state to its key *) let env_to_class m = let env1 = MemMap.fold (fun _ (tag,s) r -> try let ss = TagMap.find tag r in let r = TagMap.remove tag r in TagMap.add tag (StateSetSet.add s ss) r with | Not_found -> TagMap.add tag (StateSetSet.add s StateSetSet.empty) r) m TagMap.empty in TagMap.fold (fun tag ss r -> MemKey.add {tag=tag ; equiv=ss} r) env1 MemKey.empty (* trans is nfa_state, m is associated memory map *) let inverse_mem_map trans m r = TagMap.fold (fun tag addr r -> try let otag,s = MemMap.find addr r in assert (tag = otag) ; let r = MemMap.remove addr r in MemMap.add addr (tag,StateSet.add trans s) r with | Not_found -> MemMap.add addr (tag,StateSet.add trans StateSet.empty) r) m r let inverse_mem_map_other n (_,m) r = inverse_mem_map (OnChars n) m r let get_key {final=(act,(_,m_act)) ; others=o} = let env = MemMap.fold inverse_mem_map_other o (if act = no_action then MemMap.empty else inverse_mem_map (ToAction act) m_act MemMap.empty) in let state_key = MemMap.fold (fun n _ r -> StateSet.add (OnChars n) r) o (if act=no_action then StateSet.empty else StateSet.add (ToAction act) StateSet.empty) in let mem_key = env_to_class env in {kstate = state_key ; kmem = mem_key} let key_compare k1 k2 = match StateSet.compare k1.kstate k2.kstate with | 0 -> MemKey.compare k1.kmem k2.kmem | r -> r (* Association dfa_state -> state_num *) module StateMap = Map.Make(struct type t = dfa_key let compare = key_compare end) let state_map = ref (StateMap.empty : int StateMap.t) let todo = Stack.create() let next_state_num = ref 0 let next_mem_cell = ref 0 let temp_pending = ref false let tag_cells = Hashtbl.create 17 let state_table = Table.create dfa_state_empty let reset_state_mem () = state_map := StateMap.empty; Stack.clear todo; next_state_num := 0 ; let _ = Table.trim state_table in () (* Allocation of memory cells *) let reset_cell_mem ntags = next_mem_cell := ntags ; Hashtbl.clear tag_cells ; temp_pending := false let do_alloc_temp () = temp_pending := true ; let n = !next_mem_cell in n let do_alloc_cell used t = let available = try Hashtbl.find tag_cells t with Not_found -> Ints.empty in try Ints.choose (Ints.diff available used) with | Not_found -> temp_pending := false ; let n = !next_mem_cell in if n >= 255 then raise Memory_overflow ; Hashtbl.replace tag_cells t (Ints.add n available) ; incr next_mem_cell ; n let is_old_addr a = a >= 0 and is_new_addr a = a < 0 let old_in_map m r = TagMap.fold (fun _ addr r -> if is_old_addr addr then Ints.add addr r else r) m r let alloc_map used m mvs = TagMap.fold (fun tag a (r,mvs) -> let a,mvs = if is_new_addr a then let a = do_alloc_cell used tag in a,Ints.add a mvs else a,mvs in TagMap.add tag a r,mvs) m (TagMap.empty,mvs) let create_new_state {final=(act,(_,m_act)) ; others=o} = let used = MemMap.fold (fun _ (_,m) r -> old_in_map m r) o (old_in_map m_act Ints.empty) in let new_m_act,mvs = alloc_map used m_act Ints.empty in let new_o,mvs = MemMap.fold (fun k (x,m) (r,mvs) -> let m,mvs = alloc_map used m mvs in MemMap.add k (x,m) r,mvs) o (MemMap.empty,mvs) in {final=(act,(0,new_m_act)) ; others=new_o}, Ints.fold (fun x r -> Set x::r) mvs [] type new_addr_gen = {mutable count : int ; mutable env : int TagMap.t} let create_new_addr_gen () = {count = -1 ; env = TagMap.empty} let alloc_new_addr tag r = try TagMap.find tag r.env with | Not_found -> let a = r.count in r.count <- a-1 ; r.env <- TagMap.add tag a r.env ; a let create_mem_map tags gen = Tags.fold (fun tag r -> TagMap.add tag (alloc_new_addr tag gen) r) tags TagMap.empty let create_init_state pos = let gen = create_new_addr_gen () in let st = TransSet.fold (fun (t,tags) st -> match t with | ToAction n -> let on,otags = st.final in if n < on then {st with final = (n, (0,create_mem_map tags gen))} else st | OnChars n -> try let _ = MemMap.find n st.others in assert false with | Not_found -> {st with others = MemMap.add n (0,create_mem_map tags gen) st.others}) pos dfa_state_empty in st let get_map t st = match t with | ToAction _ -> let _,(_,m) = st.final in m | OnChars n -> let (_,m) = MemMap.find n st.others in m let dest = function | Copy (d,_) | Set d -> d and orig = function | Copy (_,o) -> o | Set _ -> -1 let pmv oc mv = fprintf oc "%d <- %d" (dest mv) (orig mv) let pmvs oc mvs = List.iter (fun mv -> fprintf oc "%a " pmv mv) mvs ; output_char oc '\n' ; flush oc (* Topological sort << a la louche >> *) let sort_mvs mvs = let rec do_rec r mvs = match mvs with | [] -> r | _ -> let dests = List.fold_left (fun r mv -> Ints.add (dest mv) r) Ints.empty mvs in let rem,here = List.partition (fun mv -> Ints.mem (orig mv) dests) mvs in match here with | [] -> begin match rem with | Copy (d,_)::_ -> let d' = do_alloc_temp () in Copy (d',d):: do_rec r (List.map (fun mv -> if orig mv = d then Copy (dest mv,d') else mv) rem) | _ -> assert false end | _ -> do_rec (here@r) rem in do_rec [] mvs let move_to mem_key src tgt = let mvs = MemKey.fold (fun {tag=tag ; equiv=m} r -> StateSetSet.fold (fun s r -> try let t = StateSet.choose s in let src = TagMap.find tag (get_map t src) and tgt = TagMap.find tag (get_map t tgt) in if src <> tgt then begin if is_new_addr src then Set tgt::r else Copy (tgt, src)::r end else r with | Not_found -> assert false) m r) mem_key [] in (* Moves are topologically sorted *) sort_mvs mvs let get_state st = let key = get_key st in try let num = StateMap.find key !state_map in num,move_to key.kmem st (Table.get state_table num) with Not_found -> let num = !next_state_num in incr next_state_num; let st,mvs = create_new_state st in Table.emit state_table st ; state_map := StateMap.add key num !state_map; Stack.push (st, num) todo; num,mvs let map_on_all_states f old_res = let res = ref old_res in begin try while true do let (st, i) = Stack.pop todo in let r = f st in res := (r, i) :: !res done with Stack.Empty -> () end; !res let goto_state st = if dfa_state_is_empty st then Backtrack,[] else let n,moves = get_state st in Goto n,moves (****************************) (* compute reachable states *) (****************************) let add_tags_to_map gen tags m = Tags.fold (fun tag m -> let m = TagMap.remove tag m in TagMap.add tag (alloc_new_addr tag gen) m) tags m let apply_transition gen r pri m = function | ToAction n,tags -> let on,(opri,_) = r.final in if n < on || (on=n && pri < opri) then let m = add_tags_to_map gen tags m in {r with final=n,(pri,m)} else r | OnChars n,tags -> try let (opri,_) = MemMap.find n r.others in if pri < opri then let m = add_tags_to_map gen tags m in {r with others=MemMap.add n (pri,m) (MemMap.remove n r.others)} else r with | Not_found -> let m = add_tags_to_map gen tags m in {r with others=MemMap.add n (pri,m) r.others} (* add transitions ts to new state r transitions in ts start from state pri and memory map m *) let apply_transitions gen r pri m ts = TransSet.fold (fun t r -> apply_transition gen r pri m t) ts r (* For a given nfa_state pos, refine char partition *) let rec split_env gen follow pos m s = function | [] -> assert false | (s1,st1) as p::rem -> let here = Cset.inter s s1 in if Cset.is_empty here then p::split_env gen follow pos m s rem else let rest = Cset.diff s here in let rem = if Cset.is_empty rest then rem else split_env gen follow pos m rest rem and new_st = apply_transitions gen st1 pos m follow in let stay = Cset.diff s1 here in if Cset.is_empty stay then (here, new_st)::rem else (stay, st1)::(here, new_st)::rem (* For all nfa_state pos in a dfa state st *) let comp_shift gen chars follow st = MemMap.fold (fun pos (_,m) env -> split_env gen follow.(pos) pos m chars.(pos) env) st [Cset.all_chars_eof,dfa_state_empty] let reachs chars follow st = let gen = create_new_addr_gen () in (* build a association list (char set -> new state) *) let env = comp_shift gen chars follow st in (* change it into (char set -> new state_num) *) let env = List.map (fun (s,dfa_state) -> s,goto_state dfa_state) env in (* finally build the char indexed array -> new state num *) let shift = Cset.env_to_array env in shift let get_tag_mem n env t = try TagMap.find t env.(n) with | Not_found -> assert false let do_tag_actions n env m = let used,r = TagMap.fold (fun t m (used,r) -> let a = get_tag_mem n env t in Ints.add a used,SetTag (a,m)::r) m (Ints.empty,[]) in let _,r = TagMap.fold (fun tag m (used,r) -> if not (Ints.mem m used) && tag.start then Ints.add m used, EraseTag m::r else used,r) env.(n) (used,r) in r let translate_state shortest_match tags chars follow st = let (n,(_,m)) = st.final in if MemMap.empty = st.others then Perform (n,do_tag_actions n tags m) else if shortest_match then begin if n=no_action then Shift (No_remember,reachs chars follow st.others) else Perform(n, do_tag_actions n tags m) end else begin Shift ( (if n = no_action then No_remember else Remember (n,do_tag_actions n tags m)), reachs chars follow st.others) end (* let dtags chan tags = Tags.iter (fun t -> fprintf chan " %a" dtag t) tags let dtransset s = TransSet.iter (fun trans -> match trans with | OnChars i,tags -> eprintf " (-> %d,%a)" i dtags tags | ToAction i,tags -> eprintf " ([%d],%a)" i dtags tags) s let dfollow t = eprintf "follow=[" ; for i = 0 to Array.length t-1 do eprintf "%d:" i ; dtransset t.(i) done ; prerr_endline "]" *) let make_tag_entry id start act a r = match a with | Sum (Mem m,0) -> TagMap.add {id=id ; start=start ; action=act} m r | _ -> r let extract_tags l = let envs = Array.create (List.length l) TagMap.empty in List.iter (fun (act,m,_) -> envs.(act) <- List.fold_right (fun (x,v) r -> match v with | Ident_char (_,t) -> make_tag_entry x true act t r | Ident_string (_,t1,t2) -> make_tag_entry x true act t1 (make_tag_entry x false act t2 r)) m TagMap.empty) l ; envs let make_dfa lexdef = let (chars, entry_list) = encode_lexdef lexdef in let follow = followpos (Array.length chars) entry_list in (* dfollow follow ; *) reset_state_mem () ; let r_states = ref [] in let initial_states = List.map (fun (le,args,shortest) -> let tags = extract_tags le.lex_actions in reset_cell_mem le.lex_mem_tags ; let pos_set = firstpos le.lex_regexp in (* prerr_string "trans={" ; dtransset pos_set ; prerr_endline "}" ; *) let init_state = create_init_state pos_set in let init_num = get_state init_state in r_states := map_on_all_states (translate_state shortest tags chars follow) !r_states ; { auto_name = le.lex_name; auto_args = args ; auto_mem_size = (if !temp_pending then !next_mem_cell+1 else !next_mem_cell) ; auto_initial_state = init_num ; auto_actions = le.lex_actions }) entry_list in let states = !r_states in (* prerr_endline "** states **" ; for i = 0 to !next_state_num-1 do eprintf "+++ %d +++\n" i ; dstate (Table.get state_table i) ; prerr_endline "" done ; eprintf "%d states\n" !next_state_num ; *) let actions = Array.create !next_state_num (Perform (0,[])) in List.iter (fun (act, i) -> actions.(i) <- act) states; reset_state_mem () ; reset_cell_mem 0 ; (initial_states, actions)