diff options
author | Xavier Leroy <xavier.leroy@inria.fr> | 2002-12-09 14:05:18 +0000 |
---|---|---|
committer | Xavier Leroy <xavier.leroy@inria.fr> | 2002-12-09 14:05:18 +0000 |
commit | 211fc51518273dc50deeb873dee7c353b203dc3e (patch) | |
tree | 40d5921351c6b6f7b83f2dac46ea0bd2d123c0c2 /otherlibs | |
parent | 11392c1894b4be951325ffb50cd308c1271c6cad (diff) |
Reimplementation complete de la bibliotheque Str en n'utilisant pas GNU regex
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@5324 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
Diffstat (limited to 'otherlibs')
-rw-r--r-- | otherlibs/str/.depend | 7 | ||||
-rw-r--r-- | otherlibs/str/Makefile | 9 | ||||
-rw-r--r-- | otherlibs/str/Makefile.Mac | 9 | ||||
-rw-r--r-- | otherlibs/str/Makefile.nt | 17 | ||||
-rw-r--r-- | otherlibs/str/str.ml | 572 | ||||
-rw-r--r-- | otherlibs/str/str.mli | 32 | ||||
-rw-r--r-- | otherlibs/str/strstubs.c | 510 |
7 files changed, 941 insertions, 215 deletions
diff --git a/otherlibs/str/.depend b/otherlibs/str/.depend index 4d370653d..70671c2f8 100644 --- a/otherlibs/str/.depend +++ b/otherlibs/str/.depend @@ -1,8 +1,5 @@ -strstubs.o: strstubs.c regex-0.12/regex.h ../../byterun/mlvalues.h \ - ../../byterun/config.h ../../config/m.h ../../config/s.h \ - ../../byterun/misc.h ../../byterun/alloc.h ../../byterun/custom.h \ - ../../byterun/fail.h ../../byterun/memory.h ../../byterun/gc.h \ - ../../byterun/major_gc.h ../../byterun/freelist.h \ +strstubs.o: strstubs.c ../../config/s.h ../../byterun/misc.h \ + ../../byterun/gc.h ../../byterun/major_gc.h ../../byterun/freelist.h \ ../../byterun/minor_gc.h str.cmo: str.cmi str.cmx: str.cmi diff --git a/otherlibs/str/Makefile b/otherlibs/str/Makefile index 22b440c19..97123ddaa 100644 --- a/otherlibs/str/Makefile +++ b/otherlibs/str/Makefile @@ -19,12 +19,11 @@ include ../../config/Makefile # Compilation options CC=$(BYTECC) -CFLAGS=-O -I$(REGEXLIB) -I../../byterun $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS) +CFLAGS=-O -I../../byterun $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS) CAMLC=../../ocamlcomp.sh CAMLOPT=../../ocamlcompopt.sh COMPFLAGS=-warn-error A -REGEXLIB=regex-0.12 -COBJS=strstubs.o $(REGEXLIB)/regex.o +COBJS=strstubs.o MKLIB=../../boot/ocamlrun ../../tools/ocamlmklib all: libstr.a str.cmi str.cma @@ -40,9 +39,6 @@ str.cma: str.cmo str.cmxa: str.cmx $(MKLIB) -ocamlopt '$(CAMLOPT)' -o str str.cmx -$(REGEXLIB)/regex.o: $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h - cd $(REGEXLIB); CC="$(CC) $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS)" sh configure; $(MAKE) - str.cmx: ../../ocamlopt partialclean: @@ -50,7 +46,6 @@ partialclean: clean: partialclean rm -f *.a *.so *.o - cd $(REGEXLIB); if test -f Makefile; then $(MAKE) distclean; else exit 0; fi install: if test -f dllstr.so; then cp dllstr.so $(STUBLIBDIR)/dllstr.so; fi diff --git a/otherlibs/str/Makefile.Mac b/otherlibs/str/Makefile.Mac index a4b65258b..c5345acd0 100644 --- a/otherlibs/str/Makefile.Mac +++ b/otherlibs/str/Makefile.Mac @@ -17,12 +17,11 @@ # Compilation options PPCC = mrc -PPCCOptions = -i {REGEXLIB},:::byterun:,:::config: -w 7 {cdbgflag} +PPCCOptions = -i :::byterun:,:::config: -w 7 {cdbgflag} CAMLC = :::boot:ocamlrun :::ocamlc -I :::stdlib: -REGEXLIB = :regex-0.12: -PPCCOBJS = strstubs.c.x {REGEXLIB}regex.c.x +PPCCOBJS = strstubs.c.x all � libstr.x str.cmi str.cma @@ -32,15 +31,11 @@ libstr.x � {PPCCOBJS} str.cma � str.cmo {CAMLC} -a -o str.cma str.cmo -{REGEXLIB}regex.c.x � {REGEXLIB}regex.c {REGEXLIB}regex.h - directory {REGEXLIB}; domake; directory :: - partialclean � delete -i �.cm[aio] || set status 0 clean � partialclean delete -i �.x || set status 0 - directory {REGEXLIB}; domake distclean; directory :: install � duplicate -y libstr.x str.cma str.cmi "{LIBDIR}" diff --git a/otherlibs/str/Makefile.nt b/otherlibs/str/Makefile.nt index ec7ac0559..3d65d19f0 100644 --- a/otherlibs/str/Makefile.nt +++ b/otherlibs/str/Makefile.nt @@ -19,13 +19,11 @@ include ../../config/Makefile # Compilation options CC=$(BYTECC) -CFLAGS=-I$(REGEXLIB) -I../../byterun +CFLAGS=-I../../byterun CAMLC=../../boot/ocamlrun ../../ocamlc -I ../../boot CAMLOPT=../../boot/ocamlrun ../../ocamlopt -I ../../stdlib -REGEXLIB=regex-0.12 -REGEXFLAGS=-DREGEX_MALLOC -DSTDC_HEADERS -DCOBJS=strstubs.$(DO) $(REGEXLIB)/regex.$(DO) -SCOBJS=strstubs.$(SO) $(REGEXLIB)/regex.$(SO) +DCOBJS=strstubs.$(DO) +SCOBJS=strstubs.$(SO) all: dllstr.dll libstr.$(A) str.cmi str.cma @@ -44,14 +42,6 @@ str.cma: str.cmo str.cmxa: str.cmx $(CAMLOPT) -a -o str.cmxa str.cmx -cclib -lstr -$(REGEXLIB)/regex.$(DO): $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h - cd $(REGEXLIB); $(CC) $(REGEXFLAGS) $(DLLCCCOMPOPTS) -c regex.c - mv $(REGEXLIB)/regex.$(O) $(REGEXLIB)/regex.$(DO) - -$(REGEXLIB)/regex.$(SO): $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h - cd $(REGEXLIB); $(CC) $(REGEXFLAGS) $(BYTECCCOMPOPTS) -c regex.c - mv $(REGEXLIB)/regex.$(O) $(REGEXLIB)/regex.$(SO) - str.cmx: ../../ocamlopt partialclean: @@ -59,7 +49,6 @@ partialclean: clean: partialclean rm -f *.$(A) *.dll *.$(O) *.$(SO) - rm -f $(REGEXLIB)/*.$(O) install: cp dllstr.dll $(STUBLIBDIR)/dllstr.dll diff --git a/otherlibs/str/str.ml b/otherlibs/str/str.ml index d20de2647..178d3ff2c 100644 --- a/otherlibs/str/str.ml +++ b/otherlibs/str/str.ml @@ -13,17 +13,485 @@ (* $Id$ *) -type regexp - -external compile_regexp: string -> bool -> regexp = "str_compile_regexp" -external string_match: regexp -> string -> int -> bool = "str_string_match" -external string_partial_match: regexp -> string -> int -> bool = - "str_string_partial_match" -external search_forward: regexp -> string -> int -> int = "str_search_forward" -external search_backward: regexp -> string -> int -> int = "str_search_backward" -external beginning_group: int -> int = "str_beginning_group" -external end_group: int -> int = "str_end_group" -external replacement_text: string -> string -> string = "str_replacement_text" +(** String utilities *) + +let string_before s n = String.sub s 0 n + +let string_after s n = String.sub s n (String.length s - n) + +let first_chars s n = String.sub s 0 n + +let last_chars s n = String.sub s (String.length s - n) n + +(** Representation of character sets **) + +module Charset = + struct + type t = string (* of length 32 *) + + let empty = String.make 32 '\000' + let full = String.make 32 '\255' + + let make_empty () = String.make 32 '\000' + + let add s c = + let i = Char.code c in + s.[i lsr 3] <- Char.chr(Char.code s.[i lsr 3] lor (1 lsl (i land 7))) + + let add_range s c1 c2 = + for i = Char.code c1 to Char.code c2 do add s (Char.chr i) done + + let singleton c = + let s = make_empty () in add s c; s + + let range c1 c2 = + let s = make_empty () in add_range s c1 c2; s + + let complement s = + let r = String.create 32 in + for i = 0 to 31 do + r.[i] <- Char.chr(Char.code s.[i] lxor 0xFF) + done; + r + + let union s1 s2 = + let r = String.create 32 in + for i = 0 to 31 do + r.[i] <- Char.chr(Char.code s1.[i] lor Char.code s2.[i]) + done; + r + + let disjoint s1 s2 = + try + for i = 0 to 31 do + if Char.code s1.[i] land Char.code s2.[i] <> 0 then raise Exit + done; + true + with Exit -> + false + + let iter fn s = + for i = 0 to 31 do + let c = Char.code s.[i] in + if c <> 0 then + for j = 0 to 7 do + if c land (1 lsl j) <> 0 then fn (Char.chr ((i lsl 3) + j)) + done + done + + let expand s = + let r = String.make 256 '\000' in + iter (fun c -> r.[Char.code c] <- '\001') s; + r + + let fold_case s = + let r = make_empty() in + iter (fun c -> add r (Char.lowercase c); add r (Char.uppercase c)) s; + r + + end + +(** Abstract syntax tree for regular expressions *) + +type re_syntax = + Char of char + | String of string + | CharClass of Charset.t + | Seq of re_syntax list + | Alt of re_syntax * re_syntax + | Star of re_syntax + | Plus of re_syntax + | Option of re_syntax + | Group of int * re_syntax + | Refgroup of int + | Bol + | Eol + | Wordboundary + +(** Representation of compiled regular expressions *) + +type regexp = { + prog: int array; (* bytecode instructions *) + cpool: string array; (* constant pool (string literals) *) + normtable: string; (* case folding table (if any) *) + numgroups: int; (* number of \(...\) groups *) + startchars: int (* index of set of starting chars, or -1 if none *) +} + +(** Opcodes for bytecode instructions; see strstubs.c for description *) + +let op_CHAR = 0 +let op_CHARNORM = 1 +let op_STRING = 2 +let op_STRINGNORM = 3 +let op_CHARCLASS = 4 +let op_BOL = 5 +let op_EOL = 6 +let op_WORDBOUNDARY = 7 +let op_BEGGROUP = 8 +let op_ENDGROUP = 9 +let op_REFGROUP = 10 +let op_ACCEPT = 11 +let op_SIMPLEOPT = 12 +let op_SIMPLESTAR = 13 +let op_SIMPLEPLUS = 14 +let op_GOTO = 15 +let op_PUSHBACK = 16 + +(* Encoding of bytecode instructions *) + +let instr opc arg = opc lor (arg lsl 8) + +(* Computing relative displacements for GOTO and PUSHBACK instructions *) + +let displ dest from = dest - from - 1 + +(** Compilation of a regular expression *) + +(* first r returns a set of characters C such that: + for all string s, s matches r => the first character of s is in C *) + +let rec first = function + Char c -> Charset.singleton c + | String s -> if s = "" then Charset.full else Charset.singleton s.[0] + | CharClass cl -> cl + | Seq rl -> first_seq rl + | Alt (r1, r2) -> Charset.union (first r1) (first r2) + | Star r -> Charset.full + | Plus r -> first r + | Option r -> Charset.full + | Group(n, r) -> first r + | Refgroup n -> Charset.full + | Bol -> Charset.full + | Eol -> Charset.full + | Wordboundary -> Charset.full + +and first_seq = function + [] -> Charset.full + | (Bol | Eol | Wordboundary) :: rl -> first_seq rl + | Star r :: rl -> Charset.union (first r) (first_seq rl) + | Option r :: rl -> Charset.union (first r) (first_seq rl) + | r :: rl -> first r + +(* Transform a Char or CharClass regexp into a character class *) + +let charclass_of_regexp fold_case re = + let cl = + match re with + Char c -> Charset.singleton c + | CharClass cl -> cl + | _ -> assert false in + if fold_case then Charset.fold_case cl else cl + +(* The case fold table: maps characters to their lowercase equivalent *) + +let fold_case_table = + let t = String.create 256 in + for i = 0 to 255 do t.[i] <- Char.lowercase(Char.chr i) done; + t + +module StringMap = Map.Make(struct type t = string let compare = compare end) + +(* Compilation of a regular expression *) + +let compile fold_case re = + + (* Instruction buffering *) + let prog = ref (Array.make 32 0) + and progpos = ref 0 + and cpool = ref StringMap.empty + and cpoolpos = ref 0 + and numgroups = ref 1 in + (* Add a new instruction *) + let emit_instr opc arg = + if !progpos >= Array.length !prog then begin + let nprog = Array.make (2 * Array.length !prog) 0 in + Array.blit !prog 0 nprog 0 (Array.length !prog); + prog := nprog + end; + (!prog).(!progpos) <- (instr opc arg); + incr progpos in + (* Reserve an instruction slot and return its position *) + let emit_hole () = + let p = !progpos in incr progpos; p in + (* Fill a reserved instruction slot with a GOTO or PUSHBACK instruction *) + let patch_instr pos opc dest = + (!prog).(pos) <- (instr opc (displ dest pos)) in + (* Return the cpool index for the given string, adding it if not + already there *) + let cpool_index s = + try + StringMap.find s !cpool + with Not_found -> + let p = !cpoolpos in + cpool := StringMap.add s p !cpool; + incr cpoolpos; + p in + (* Main recursive compilation function *) + let rec emit_code = function + Char c -> + if fold_case then + emit_instr op_CHARNORM (Char.code (Char.lowercase c)) + else + emit_instr op_CHAR (Char.code c) + | String s -> + begin match String.length s with + 0 -> () + | 1 -> + if fold_case then + emit_instr op_CHARNORM (Char.code (Char.lowercase s.[0])) + else + emit_instr op_CHAR (Char.code s.[0]) + | _ -> + try + (* null characters are not accepted by the STRING* instructions; + if one is found, split string at null character *) + let i = String.index s '\000' in + emit_code (String (string_before s i)); + emit_instr op_CHAR 0; + emit_code (String (string_after s (i+1))) + with Not_found -> + if fold_case then + emit_instr op_STRINGNORM (cpool_index (String.lowercase s)) + else + emit_instr op_STRING (cpool_index s) + end + | CharClass cl -> + let cl' = if fold_case then Charset.fold_case cl else cl in + emit_instr op_CHARCLASS (cpool_index cl') + | Seq rl -> + emit_seq_code rl + | Alt(r1, r2) -> + (* PUSHBACK lbl1 + <match r1> + GOTO lbl2 + lbl1: <match r2> + lbl2: ... *) + let pos_pushback = emit_hole() in + emit_code r1; + let pos_goto_end = emit_hole() in + let lbl1 = !progpos in + emit_code r2; + let lbl2 = !progpos in + patch_instr pos_pushback op_PUSHBACK lbl1; + patch_instr pos_goto_end op_GOTO lbl2 + | Star r -> + (* Implement longest match semantics for compatibility with old Str *) + (* lbl1: PUSHBACK lbl2 + <match r> + GOTO lbl1 + lbl2: + *) + let lbl1 = emit_hole() in + emit_code r; + emit_instr op_GOTO (displ lbl1 !progpos); + let lbl2 = !progpos in + patch_instr lbl1 op_PUSHBACK lbl2 + | Plus r -> + (* Implement longest match semantics for compatibility with old Str *) + (* lbl1: <match r> + PUSHBACK lbl2 + GOTO lbl1 + lbl2: + *) + let lbl1 = !progpos in + emit_code r; + let pos_pushback = emit_hole() in + emit_instr op_GOTO (displ lbl1 !progpos); + let lbl2 = !progpos in + patch_instr pos_pushback op_PUSHBACK lbl2 + | Option r -> + (* Implement longest match semantics for compatibility with old Str *) + (* PUSHBACK lbl + <match r> + lbl: + *) + let pos_pushback = emit_hole() in + emit_code r; + let lbl = !progpos in + patch_instr pos_pushback op_PUSHBACK lbl + | Group(n, r) -> + if n >= 32 then failwith "too many \\(...\\) groups"; + emit_instr op_BEGGROUP n; + emit_code r; + emit_instr op_ENDGROUP n; + numgroups := max !numgroups (n+1) + | Refgroup n -> + emit_instr op_REFGROUP n + | Bol -> + emit_instr op_BOL 0 + | Eol -> + emit_instr op_EOL 0 + | Wordboundary -> + emit_instr op_WORDBOUNDARY 0 + + and emit_seq_code = function + [] -> () + | Star(Char _ | CharClass _ as r) :: rl + when disjoint_modulo_case (first r) (first_seq rl) -> + emit_instr op_SIMPLESTAR (cpool_index (charclass_of_regexp fold_case r)); + emit_seq_code rl + | Plus(Char _ | CharClass _ as r) :: rl + when disjoint_modulo_case (first r) (first_seq rl) -> + emit_instr op_SIMPLEPLUS (cpool_index (charclass_of_regexp fold_case r)); + emit_seq_code rl + | Option(Char _ | CharClass _ as r) :: rl + when disjoint_modulo_case (first r) (first_seq rl) -> + emit_instr op_SIMPLEOPT (cpool_index (charclass_of_regexp fold_case r)); + emit_seq_code rl + | r :: rl -> + emit_code r; + emit_seq_code rl + + and disjoint_modulo_case c1 c2 = + if fold_case + then Charset.disjoint (Charset.fold_case c1) (Charset.fold_case c2) + else Charset.disjoint c1 c2 + in + + emit_code re; + emit_instr op_ACCEPT 0; + let start = first re in + let start' = if fold_case then Charset.fold_case start else start in + let start_pos = + if start = Charset.full + then -1 + else cpool_index (Charset.expand start') in + let constantpool = Array.make !cpoolpos "" in + StringMap.iter (fun str idx -> constantpool.(idx) <- str) !cpool; + { prog = Array.sub !prog 0 !progpos; + cpool = constantpool; + normtable = if fold_case then fold_case_table else ""; + numgroups = !numgroups; + startchars = start_pos } + +(** Parsing of a regular expression *) + +(* Efficient buffering of sequences *) + +module SeqBuffer = struct + + type t = { sb_chars: Buffer.t; mutable sb_next: re_syntax list } + + let create() = { sb_chars = Buffer.create 16; sb_next = [] } + + let flush buf = + let s = Buffer.contents buf.sb_chars in + Buffer.clear buf.sb_chars; + match String.length s with + 0 -> () + | 1 -> buf.sb_next <- Char s.[0] :: buf.sb_next + | _ -> buf.sb_next <- String s :: buf.sb_next + + let add buf re = + match re with + Char c -> Buffer.add_char buf.sb_chars c + | _ -> flush buf; buf.sb_next <- re :: buf.sb_next + + let extract buf = + flush buf; Seq(List.rev buf.sb_next) + +end + +(* The character class corresponding to `.' *) + +let dotclass = Charset.complement (Charset.singleton '\n') + +(* Parse a regular expression *) + +let parse s = + let len = String.length s in + let group_counter = ref 1 in + + let rec regexp0 i = + let (r, j) = regexp1 i in + regexp0cont r j + and regexp0cont r1 i = + if i + 2 <= len && s.[i] = '\\' && s.[i+1] = '|' then + let (r2, j) = regexp1 (i+2) in + regexp0cont (Alt(r1, r2)) j + else + (r1, i) + and regexp1 i = + regexp1cont (SeqBuffer.create()) i + and regexp1cont sb i = + if i >= len + || i + 2 <= len && s.[i] = '\\' && (let c = s.[i+1] in c = '|' || c = ')') + then + (SeqBuffer.extract sb, i) + else + let (r, j) = regexp2 i in + SeqBuffer.add sb r; + regexp1cont sb j + and regexp2 i = + let (r, j) = regexp3 i in + regexp2cont r j + and regexp2cont r i = + if i >= len then (r, i) else + match s.[i] with + '?' -> regexp2cont (Option r) (i+1) + | '*' -> regexp2cont (Star r) (i+1) + | '+' -> regexp2cont (Plus r) (i+1) + | _ -> (r, i) + and regexp3 i = + match s.[i] with + '\\' -> regexpbackslash (i+1) + | '[' -> let (c, j) = regexpclass0 (i+1) in (CharClass c, j) + | '^' -> (Bol, i+1) + | '$' -> (Eol, i+1) + | '.' -> (CharClass dotclass, i+1) + | c -> (Char c, i+1) + and regexpbackslash i = + if i >= len then (Char '\\', i) else + match s.[i] with + '|' | ')' -> + assert false + | '(' -> + let group_no = !group_counter in + if group_no < 32 then incr group_counter; + let (r, j) = regexp0 (i+1) in + if j + 1 < len && s.[j] = '\\' && s.[j+1] = ')' then + if group_no < 32 + then (Group(group_no, r), j + 2) + else (r, j + 2) + else + failwith "\\( group not closed by \\)" + | '1' .. '9' as c -> + (Refgroup(Char.code c - 48), i + 1) + | 'b' -> + (Wordboundary, i + 1) + | c -> + (Char c, i + 1) + and regexpclass0 i = + if i < len && s.[i] = '^' + then let (c, j) = regexpclass1 (i+1) in (Charset.complement c, j) + else regexpclass1 i + and regexpclass1 i = + let c = Charset.make_empty() in + let j = regexpclass2 c i i in + (c, j) + and regexpclass2 c start i = + if i >= len then failwith "[ class not closed by ]"; + if s.[i] = ']' && i > start then i+1 else begin + let c1 = s.[i] in + if i+2 < len && s.[i+1] = '-' then begin + let c2 = s.[i+2] in + Charset.add_range c c1 c2; + regexpclass2 c start (i+3) + end else begin + Charset.add c c1; + regexpclass2 c start (i+1) + end + end in + + let (r, j) = regexp0 0 in + if j = len then r else failwith "spurious \\) in regular expression" + +(** Parsing and compilation *) + +let regexp e = compile false (parse e) + +let regexp_case_fold e = compile true (parse e) let quote s = let len = String.length s in @@ -38,42 +506,80 @@ let quote s = done; String.sub buf 0 !pos -let string_before s n = String.sub s 0 n +let regexp_string s = compile false (String s) -let string_after s n = String.sub s n (String.length s - n) +let regexp_string_case_fold s = compile true (String s) -let first_chars s n = String.sub s 0 n +(** Matching functions **) -let last_chars s n = String.sub s (String.length s - n) n +external re_string_match: regexp -> string -> int -> int array + = "re_string_match" +external re_partial_match: regexp -> string -> int -> int array + = "re_partial_match" +external re_search_forward: regexp -> string -> int -> int array + = "re_search_forward" +external re_search_backward: regexp -> string -> int -> int array + = "re_search_backward" + +let last_search_result = ref [||] -let regexp e = compile_regexp e false +let string_match re s pos = + let res = re_string_match re s pos in + last_search_result := res; + Array.length res > 0 -let regexp_case_fold e = compile_regexp e true +let string_partial_match re s pos = + let res = re_partial_match re s pos in + last_search_result := res; + Array.length res > 0 -let regexp_string s = compile_regexp (quote s) false +let search_forward re s pos = + let res = re_search_forward re s pos in + last_search_result := res; + if Array.length res = 0 then raise Not_found else res.(0) -let regexp_string_case_fold s = compile_regexp (quote s) true +let search_backward re s pos = + let res = re_search_backward re s pos in + last_search_result := res; + if Array.length res = 0 then raise Not_found else res.(0) let group_beginning n = - if n < 0 || n >= 10 then invalid_arg "Str.group_beginning" else - let pos = beginning_group n in - if pos = -1 then raise Not_found else pos + let n2 = n + n in + if n < 0 || n2 >= Array.length !last_search_result then + invalid_arg "Str.group_beginning" + else + let pos = !last_search_result.(n2) in + if pos = -1 then raise Not_found else pos let group_end n = - if n < 0 || n >= 10 then invalid_arg "Str.group_end" else - let pos = end_group n in - if pos = -1 then raise Not_found else pos + let n2 = n + n in + if n < 0 || n2 >= Array.length !last_search_result then + invalid_arg "Str.group_end" + else + let pos = !last_search_result.(n2 + 1) in + if pos = -1 then raise Not_found else pos let matched_group n txt = - let b = group_beginning n and e = group_end n in String.sub txt b (e-b) + let n2 = n + n in + if n < 0 || n2 >= Array.length !last_search_result then + invalid_arg "Str.matched_group" + else + let b = !last_search_result.(n2) + and e = !last_search_result.(n2 + 1) in + if b = -1 then raise Not_found else String.sub txt b (e - b) -let replace_matched repl matched = - replacement_text repl matched - let match_beginning () = group_beginning 0 and match_end () = group_end 0 and matched_string txt = matched_group 0 txt +(** Replacement **) + +external re_replacement_text: string -> int array -> string -> string + = "re_replacement_text" + +let replace_matched repl matched = + re_replacement_text repl !last_search_result matched + let substitute_first expr repl_fun text = try let pos = search_forward expr text 0 in @@ -99,10 +605,12 @@ let global_substitute expr repl_fun text = String.concat "" (replace 0 false) let global_replace expr repl text = - global_substitute expr (replacement_text repl) text + global_substitute expr (replace_matched repl) text and replace_first expr repl text = - substitute_first expr (replacement_text repl) text - + substitute_first expr (replace_matched repl) text + +(** Splitting *) + let bounded_split expr text num = let start = if string_match expr text 0 then match_end() else 0 in diff --git a/otherlibs/str/str.mli b/otherlibs/str/str.mli index 4ef6a39dc..c7d9d1366 100644 --- a/otherlibs/str/str.mli +++ b/otherlibs/str/str.mli @@ -29,19 +29,19 @@ val regexp : string -> regexp [$^.*+?[]]. The following constructs are recognized: - [. ] matches any character except newline - [* ] (postfix) matches the previous expression zero, one or - several times + several times - [+ ] (postfix) matches the previous expression one or - several times + several times - [? ] (postfix) matches the previous expression once or - not at all + not at all - [[..] ] character set; ranges are denoted with [-], as in [[a-z]]; - an initial [^], as in [[^0-9]], complements the set + an initial [^], as in [[^0-9]], complements the set - [^ ] matches at beginning of line - [$ ] matches at end of line - [\| ] (infix) alternative between two expressions - [\(..\)] grouping and naming of the enclosed expression - [\1 ] the text matched by the first [\(...\)] expression - ([\2] for the second expression, etc) + ([\2] for the second expression, and so on up to [\9]) - [\b ] matches word boundaries - [\ ] quotes special characters. *) @@ -66,26 +66,23 @@ val regexp_string_case_fold : string -> regexp (** {6 String matching and searching} *) -external string_match : regexp -> string -> int -> bool = "str_string_match" +val string_match : regexp -> string -> int -> bool (** [string_match r s start] tests whether the characters in [s] starting at position [start] match the regular expression [r]. The first character of a string has position [0], as usual. *) -external search_forward : - regexp -> string -> int -> int = "str_search_forward" +val search_forward : regexp -> string -> int -> int (** [search_forward r s start] searchs the string [s] for a substring matching the regular expression [r]. The search starts at position [start] and proceeds towards the end of the string. Return the position of the first character of the matched substring, or raise [Not_found] if no substring matches. *) -external search_backward : - regexp -> string -> int -> int = "str_search_backward" +val search_backward : regexp -> string -> int -> int (** Same as {!Str.search_forward}, but the search proceeds towards the beginning of the string. *) -external string_partial_match : - regexp -> string -> int -> bool = "str_string_partial_match" +val string_partial_match : regexp -> string -> int -> bool (** Similar to {!Str.string_match}, but succeeds whenever the argument string is a prefix of a string that matches. This includes the case of a true complete match. *) @@ -162,12 +159,12 @@ val substitute_first : regexp -> (string -> string) -> string -> string (** Same as {!Str.global_substitute}, except that only the first substring matching the regular expression is replaced. *) +val replace_matched : string -> string -> string (** [replace_matched repl s] returns the replacement text [repl] in which [\1], [\2], etc. have been replaced by the text matched by the corresponding groups in the most recent matching operation. [s] must be the same string that was matched during this matching operation. *) -val replace_matched : string -> string -> string (** {6 Splitting} *) @@ -195,10 +192,7 @@ val split_delim : regexp -> string -> string list val bounded_split_delim : regexp -> string -> int -> string list (** Same as {!Str.bounded_split}, but occurrences of the delimiter at the beginning and at the end of the string are - recognized and returned as empty strings in the result. - For instance, [split_delim (regexp " ") " abc "] - returns [[""; "abc"; ""]], while [split] with the same - arguments returns [["abc"]]. *) + recognized and returned as empty strings in the result. *) type split_result = Text of string @@ -216,9 +210,7 @@ val bounded_full_split : regexp -> string -> int -> split_result list (** Same as {!Str.bounded_split_delim}, but returns the delimiters as well as the substrings contained between delimiters. The former are tagged [Delim] in the result list; - the latter are tagged [Text]. For instance, - [full_split (regexp "[{}]") "{ab}"] returns - [[Delim "{"; Text "ab"; Delim "}"]]. *) + the latter are tagged [Text]. *) (** {6 Extracting substrings} *) diff --git a/otherlibs/str/strstubs.c b/otherlibs/str/strstubs.c index 1e70a05f2..f64080f9b 100644 --- a/otherlibs/str/strstubs.c +++ b/otherlibs/str/strstubs.c @@ -1,170 +1,413 @@ -#if !macintosh -#include <sys/types.h> -#else -#include <SizeTDef.h> -#endif +/***********************************************************************/ +/* */ +/* Objective Caml */ +/* */ +/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ +/* */ +/* Copyright 1996 Institut National de Recherche en Informatique et */ +/* en Automatique. All rights reserved. This file is distributed */ +/* under the terms of the GNU Library General Public License, with */ +/* the special exception on linking described in file ../../LICENSE. */ +/* */ +/***********************************************************************/ + +/* $Id$ */ + +#include <assert.h> #include <string.h> -#include <regex.h> +#include <ctype.h> #include <mlvalues.h> #include <alloc.h> -#include <custom.h> -#include <fail.h> #include <memory.h> +#include <fail.h> -struct regexp_struct { - struct custom_operations * ops; - struct re_pattern_buffer re; +/* The backtracking NFA interpreter */ + +struct backtrack_point { + char * txt; + value * pc; + int mask; }; -typedef struct regexp_struct * regexp; +#define BACKTRACK_STACK_BLOCK_SIZE 500 + +struct backtrack_stack { + struct backtrack_stack * previous; + struct backtrack_point point[BACKTRACK_STACK_BLOCK_SIZE]; +}; + +#define Opcode(x) ((x) & 0xFF) +#define Arg(x) ((unsigned long)(x) >> 8) +#define SignedArg(x) ((long)(x) >> 8) + +enum { + CHAR, /* match a single character */ + CHARNORM, /* match a single character, after normalization */ + STRING, /* match a character string */ + STRINGNORM, /* match a character string, after normalization */ + CHARCLASS, /* match a character class */ + BOL, /* match at beginning of line */ + EOL, /* match at end of line */ + WORDBOUNDARY, /* match on a word boundary */ + BEGGROUP, /* record the beginning of a group */ + ENDGROUP, /* record the end of a group */ + REFGROUP, /* match a previously matched group */ + ACCEPT, /* report success */ + SIMPLEOPT, /* match a character class 0 or 1 times */ + SIMPLESTAR, /* match a character class 0, 1 or several times */ + SIMPLEPLUS, /* match a character class 1 or several times */ + GOTO, /* unconditional branch */ + PUSHBACK /* record a backtrack point -- + where to jump in case of failure */ +}; + +/* Accessors in a compiled regexp */ +#define Prog(re) Field(re, 0) +#define Cpool(re) Field(re, 1) +#define Normtable(re) Field(re, 2) +#define Numgroups(re) Int_val(Field(re, 3)) +#define Startchars(re) Int_val(Field(re, 4)) + +/* Record positions of matched groups */ +struct re_group { + unsigned char * tentative_start; + unsigned char * start; + unsigned char * end; +}; +static struct re_group re_group[32]; -static void free_regexp(value vexpr) +/* Bitvector recording which groups were fully matched */ +static int re_mask; + +/* The initial backtracking stack */ +static struct backtrack_stack initial_stack = { NULL, }; + +/* Free a chained list of backtracking stacks */ +static void free_backtrack_stack(struct backtrack_stack * stack) { - regexp expr = (regexp) Bp_val(vexpr); - expr->re.translate = NULL; - re_free(&(expr->re)); + struct backtrack_stack * prevstack; + while ((prevstack = stack->previous) != NULL) { + stat_free(stack); + stack = prevstack; + } } -static struct custom_operations regexp_ops = { - "_regexp", - free_regexp, - custom_compare_default, - custom_hash_default, - custom_serialize_default, - custom_deserialize_default +/* Membership in a bit vector representing a set of booleans */ +#define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1) + +/* Determine if a character is a word constituent */ +static unsigned char re_word_letters[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, 254, 255, 255, 7, 254, 255, 255, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 127, 255, 255, 255, 127, 255 }; +#define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1) + +/* The bytecode interpreter for the NFA */ -static regexp alloc_regexp(void) +static int re_match(value re, + unsigned char * starttxt, + register unsigned char * txt, + register unsigned char * endtxt, + int accept_partial_match) { - value res = - alloc_custom(®exp_ops, sizeof(struct regexp_struct), 1, 10000); - return (regexp) res; -} + register value * pc; + struct backtrack_stack * stack; + struct backtrack_point * sp; + value cpool; + value normtable; + unsigned char c; -#define RE_SYNTAX RE_SYNTAX_EMACS + pc = &Field(Prog(re), 0); + stack = &initial_stack; + sp = stack->point; + cpool = Cpool(re); + normtable = Normtable(re); + re_mask = 0; + re_group[0].start = txt; -static char * case_fold_table = NULL; + while (1) { + long instr = Long_val(*pc++); + switch (Opcode(instr)) { + case CHAR: + if (txt == endtxt) goto prefix_match; + if (*txt != Arg(instr)) goto backtrack; + txt++; + break; + case CHARNORM: + if (txt == endtxt) goto prefix_match; + if (Byte_u(normtable, *txt) != Arg(instr)) goto backtrack; + txt++; + break; + case STRING: { + unsigned char * s = + (unsigned char *) String_val(Field(cpool, Arg(instr))); + while ((c = *s++) != 0) { + if (txt == endtxt) goto prefix_match; + if (c != *txt) goto backtrack; + txt++; + } + break; + } + case STRINGNORM: { + unsigned char * s = + (unsigned char *) String_val(Field(cpool, Arg(instr))); + while ((c = *s++) != 0) { + if (txt == endtxt) goto prefix_match; + if (c != Byte_u(normtable, *txt)) goto backtrack; + txt++; + } + break; + } + case CHARCLASS: + if (txt == endtxt) goto prefix_match; + if (! In_bitset(String_val(Field(cpool, Arg(instr))), *txt, c)) + goto backtrack; + txt++; + break; + case BOL: + if (txt > starttxt && txt[-1] != '\n') goto backtrack; + break; + case EOL: + if (txt < endtxt && *txt != '\n') goto backtrack; + break; + case WORDBOUNDARY: + /* At beginning and end of text: no + At beginning of text: OK if current char is a letter + At end of text: OK if previous char is a letter + Otherwise: + OK if previous char is a letter and current char not a letter + or previous char is not a letter and current char is a letter */ + if (txt == starttxt) { + if (txt == endtxt) goto prefix_match; + if (Is_word_letter(txt[0])) break; + goto backtrack; + } else if (txt == endtxt) { + if (Is_word_letter(txt[-1])) break; + goto backtrack; + } else { + if (Is_word_letter(txt[-1]) != Is_word_letter(txt[0])) break; + goto backtrack; + } + case BEGGROUP: { + int group_no = Arg(instr); + re_group[group_no].tentative_start = txt; + break; + } + case ENDGROUP: { + int group_no = Arg(instr); + struct re_group * group = &(re_group[group_no]); + group->start = group->tentative_start; + group->end = txt; + re_mask |= (1 << group_no); + break; + } + case REFGROUP: { + int group_no = Arg(instr); + struct re_group * group = &(re_group[group_no]); + unsigned char * s; + if ((re_mask & (1 << group_no)) == 0) goto backtrack; + for (s = group->start; s < group->end; s++) { + if (txt == endtxt) goto prefix_match; + if (*s != *txt) goto backtrack; + txt++; + } + break; + } + case ACCEPT: + goto accept; + case SIMPLEOPT: { + char * set = String_val(Field(cpool, Arg(instr))); + if (txt < endtxt && In_bitset(set, *txt, c)) txt++; + break; + } + case SIMPLESTAR: { + char * set = String_val(Field(cpool, Arg(instr))); + while (txt < endtxt && In_bitset(set, *txt, c)) + txt++; + break; + } + case SIMPLEPLUS: { + char * set = String_val(Field(cpool, Arg(instr))); + if (txt == endtxt) goto prefix_match; + if (! In_bitset(set, *txt, c)) goto backtrack; + txt++; + while (txt < endtxt && In_bitset(set, *txt, c)) + txt++; + break; + } + case GOTO: + pc = pc + SignedArg(instr); + break; + case PUSHBACK: + if (sp == stack->point + BACKTRACK_STACK_BLOCK_SIZE) { + struct backtrack_stack * newstack = + stat_alloc(sizeof(struct backtrack_stack)); + newstack->previous = stack; + stack = newstack; + sp = stack->point; + } + sp->txt = txt; + sp->pc = pc + SignedArg(instr); + sp->mask = re_mask; + sp++; + break; + default: + assert(0); + } + /* Continue with next instruction */ + continue; + prefix_match: + /* We get here when matching failed because the end of text + was encountered. */ + if (accept_partial_match) goto accept; + backtrack: + /* We get here when matching fails. Backtrack to most recent saved + point. */ + if (sp == stack->point) { + struct backtrack_stack * prevstack = stack->previous; + if (prevstack == NULL) return 0; + stat_free(stack); + stack = prevstack; + sp = stack->point + BACKTRACK_STACK_BLOCK_SIZE; + } + sp--; + txt = sp->txt; + pc = sp->pc; + re_mask = sp->mask; + } + accept: + /* We get here when the regexp was successfully matched */ + free_backtrack_stack(stack); + re_group[0].end = txt; + re_mask |= 1; + return 1; +} + +/* Allocate an integer array containing the positions of the matched groups. + Beginning of group #N is at 2N, end is at 2N+1. + Take position = -1 when group wasn't matched. */ -CAMLprim value str_compile_regexp(value src, value fold) +static value re_alloc_groups(value re, value str) { - regexp expr; - char * msg; - - Begin_root(src); - expr = alloc_regexp(); - End_roots(); - re_syntax_options = RE_SYNTAX; - if (Bool_val(fold) && case_fold_table == NULL) { - int i; - case_fold_table = stat_alloc(256); - for (i = 0; i <= 255; i++) case_fold_table[i] = i; - for (i = 'A'; i <= 'Z'; i++) case_fold_table[i] = i + 32; - for (i = 192; i <= 214; i++) case_fold_table[i] = i + 32; - for (i = 216; i <= 222; i++) case_fold_table[i] = i + 32; + CAMLparam1(str); + CAMLlocal1(res); + unsigned char * starttxt = (unsigned char *) String_val(str); + int n = Numgroups(re); + int i; + + res = alloc(n * 2, 0); + for (i = 0; i < n; i++) { + if ((re_mask & (1 << i)) == 0) { + Field(res, i * 2) = Val_int(-1); + Field(res, i * 2 + 1) = Val_int(-1); + } else { + Field(res, i * 2) = Val_long(re_group[i].start - starttxt); + Field(res, i * 2 + 1) = Val_long(re_group[i].end - starttxt); + } } - expr->re.translate = Bool_val(fold) ? case_fold_table : NULL; - expr->re.fastmap = stat_alloc(256); - expr->re.buffer = NULL; - expr->re.allocated = 0; - msg = (char *) re_compile_pattern(String_val(src), string_length(src), - &(expr->re)); - if (msg != NULL) failwith(msg); - re_compile_fastmap(&(expr->re)); - expr->re.regs_allocated = REGS_FIXED; - return (value) expr; + CAMLreturn(res); } -static regoff_t start_regs[10], end_regs[10]; - -static struct re_registers match_regs = { 10, start_regs, end_regs }; +/* String matching and searching. All functions return the empty array + on failure, and an array of positions on success. */ -CAMLprim value str_string_match(regexp expr, value text, value pos) +CAMLprim value re_string_match(value re, value str, value pos) { - int len = string_length(text); - int start = Int_val(pos); - if (start < 0 || start > len) + unsigned char * starttxt = &Byte_u(str, 0); + unsigned char * txt = &Byte_u(str, Long_val(pos)); + unsigned char * endtxt = &Byte_u(str, string_length(str)); + + if (txt < starttxt || txt > endtxt) invalid_argument("Str.string_match"); - switch (re_match(&(expr->re), String_val(text), len, - start, &match_regs)) { - case -2: - failwith("Str.string_match"); - case -1: - case -3: - return Val_false; - default: - return Val_true; + if (re_match(re, starttxt, txt, endtxt, 0)) { + return re_alloc_groups(re, str); + } else { + return Atom(0); } } -CAMLprim value str_string_partial_match(regexp expr, value text, value pos) +CAMLprim value re_partial_match(value re, value str, value pos) { - int len = string_length(text); - int start = Int_val(pos); - if (start < 0 || start > len) + unsigned char * starttxt = &Byte_u(str, 0); + unsigned char * txt = &Byte_u(str, Long_val(pos)); + unsigned char * endtxt = &Byte_u(str, string_length(str)); + + if (txt < starttxt || txt > endtxt) invalid_argument("Str.string_partial_match"); - switch (re_match(&(expr->re), String_val(text), len, - start, &match_regs)) { - case -2: - failwith("Str.string_partial_match"); - case -1: - return Val_false; - default: - return Val_true; + if (re_match(re, starttxt, txt, endtxt, 1)) { + return re_alloc_groups(re, str); + } else { + return Atom(0); } } -CAMLprim value str_search_forward(regexp expr, value text, value pos) +CAMLprim value re_search_forward(value re, value str, value startpos) { - int res; - int len = string_length(text); - int start = Int_val(pos); - if (start < 0 || start > len) + unsigned char * starttxt = &Byte_u(str, 0); + unsigned char * txt = &Byte_u(str, Long_val(startpos)); + unsigned char * endtxt = &Byte_u(str, string_length(str)); + unsigned char * startchars; + unsigned char c; + + if (txt < starttxt || txt > endtxt) invalid_argument("Str.search_forward"); - res = re_search(&(expr->re), String_val(text), len, start, len-start, - &match_regs); - switch(res) { - case -2: - failwith("Str.search_forward"); - case -1: - raise_not_found(); - default: - return Val_int(res); + if (Startchars(re) == -1) { + do { + if (re_match(re, starttxt, txt, endtxt, 0)) + return re_alloc_groups(re, str); + txt++; + } while (txt <= endtxt); + return Atom(0); + } else { + startchars = + (unsigned char *) String_val(Field(Cpool(re), Startchars(re))); + do { + while (txt < endtxt && startchars[*txt] == 0) txt++; + if (re_match(re, starttxt, txt, endtxt, 0)) + return re_alloc_groups(re, str); + txt++; + } while (txt <= endtxt); + return Atom(0); } } -CAMLprim value str_search_backward(regexp expr, value text, value pos) +CAMLprim value re_search_backward(value re, value str, value startpos) { - int res; - int len = string_length(text); - int start = Int_val(pos); - if (start < 0 || start > len) + unsigned char * starttxt = &Byte_u(str, 0); + unsigned char * txt = &Byte_u(str, Long_val(startpos)); + unsigned char * endtxt = &Byte_u(str, string_length(str)); + unsigned char * startchars; + unsigned char c; + + if (txt < starttxt || txt > endtxt) invalid_argument("Str.search_backward"); - res = re_search(&(expr->re), String_val(text), len, start, -start-1, - &match_regs); - switch(res) { - case -2: - failwith("Str.search_backward"); - case -1: - raise_not_found(); - default: - return Val_int(res); + if (Startchars(re) == -1) { + do { + if (re_match(re, starttxt, txt, endtxt, 0)) + return re_alloc_groups(re, str); + txt--; + } while (txt >= starttxt); + return Atom(0); + } else { + startchars = + (unsigned char *) String_val(Field(Cpool(re), Startchars(re))); + do { + while (txt > starttxt && startchars[*txt] == 0) txt--; + if (re_match(re, starttxt, txt, endtxt, 0)) + return re_alloc_groups(re, str); + txt--; + } while (txt >= starttxt); + return Atom(0); } } -CAMLprim value str_beginning_group(value ngroup) -{ - return Val_int(start_regs[Int_val(ngroup)]); -} - -CAMLprim value str_end_group(value ngroup) -{ - return Val_int(end_regs[Int_val(ngroup)]); -} +/* Replacement */ -CAMLprim value str_replacement_text(value repl, value orig) +CAMLprim value re_replacement_text(value repl, value groups, value orig) { - value res; - mlsize_t len, n; + CAMLparam3(repl, groups, orig); + CAMLlocal1(res); + mlsize_t start, end, len, n; char * p, * q; int c; @@ -184,15 +427,20 @@ CAMLprim value str_replacement_text(value repl, value orig) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c -= '0'; - len += end_regs[c] - start_regs[c]; break; + if (c*2 >= Wosize_val(groups)) + failwith("Str.replace: reference to unmatched group"); + start = Long_val(Field(groups, c*2)); + end = Long_val(Field(groups, c*2 + 1)); + if (start == (mlsize_t) -1) + failwith("Str.replace: reference to unmatched group"); + len += end - start; + break; default: len += 2; break; } } } - Begin_roots2(orig,repl); - res = alloc_string(len); - End_roots(); + res = alloc_string(len); p = String_val(repl); q = String_val(res); n = string_length(repl); @@ -208,8 +456,10 @@ CAMLprim value str_replacement_text(value repl, value orig) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c -= '0'; - len = end_regs[c] - start_regs[c]; - memmove (q, &Byte(orig, start_regs[c]), len); + start = Long_val(Field(groups, c*2)); + end = Long_val(Field(groups, c*2 + 1)); + len = end - start; + memmove (q, &Byte(orig, start), len); q += len; break; default: @@ -217,6 +467,6 @@ CAMLprim value str_replacement_text(value repl, value orig) } } } - return res; + CAMLreturn(res); } |