Reimplementation complete de la bibliotheque Str en n'utilisant pas GNU regex

git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@5324 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
author: Xavier Leroy <xavier.leroy@inria.fr> 2002-12-09 14:05:18 +0000
committer: Xavier Leroy <xavier.leroy@inria.fr> 2002-12-09 14:05:18 +0000
commit: 211fc51518273dc50deeb873dee7c353b203dc3e (patch)
tree: 40d5921351c6b6f7b83f2dac46ea0bd2d123c0c2 /otherlibs
parent: 11392c1894b4be951325ffb50cd308c1271c6cad (diff)
7 files changed, 941 insertions, 215 deletions
diff --git a/otherlibs/str/.depend b/otherlibs/str/.depend
index 4d370653d..70671c2f8 100644
--- a/otherlibs/str/.depend
+++ b/otherlibs/str/.depend
@@ -1,8 +1,5 @@
-strstubs.o: strstubs.c regex-0.12/regex.h ../../byterun/mlvalues.h \
-  ../../byterun/config.h ../../config/m.h ../../config/s.h \
-  ../../byterun/misc.h ../../byterun/alloc.h ../../byterun/custom.h \
-  ../../byterun/fail.h ../../byterun/memory.h ../../byterun/gc.h \
-  ../../byterun/major_gc.h ../../byterun/freelist.h \
+strstubs.o: strstubs.c ../../config/s.h ../../byterun/misc.h \
+  ../../byterun/gc.h ../../byterun/major_gc.h ../../byterun/freelist.h \
   ../../byterun/minor_gc.h
 str.cmo: str.cmi 
 str.cmx: str.cmi 
diff --git a/otherlibs/str/Makefile b/otherlibs/str/Makefile
index 22b440c19..97123ddaa 100644
--- a/otherlibs/str/Makefile
+++ b/otherlibs/str/Makefile
@@ -19,12 +19,11 @@ include ../../config/Makefile
 
 # Compilation options
 CC=$(BYTECC)
-CFLAGS=-O -I$(REGEXLIB) -I../../byterun $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS)
+CFLAGS=-O -I../../byterun $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS)
 CAMLC=../../ocamlcomp.sh
 CAMLOPT=../../ocamlcompopt.sh
 COMPFLAGS=-warn-error A
-REGEXLIB=regex-0.12
-COBJS=strstubs.o $(REGEXLIB)/regex.o
+COBJS=strstubs.o
 MKLIB=../../boot/ocamlrun ../../tools/ocamlmklib
 
 all: libstr.a str.cmi str.cma
@@ -40,9 +39,6 @@ str.cma: str.cmo
 str.cmxa: str.cmx
 	$(MKLIB) -ocamlopt '$(CAMLOPT)' -o str str.cmx
 
-$(REGEXLIB)/regex.o: $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h
-	cd $(REGEXLIB); CC="$(CC) $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS)" sh configure; $(MAKE)
-
 str.cmx: ../../ocamlopt
 
 partialclean:
@@ -50,7 +46,6 @@ partialclean:
 
 clean: partialclean
 	rm -f *.a *.so *.o
-	cd $(REGEXLIB); if test -f Makefile; then $(MAKE) distclean; else exit 0; fi
 
 install:
 	if test -f dllstr.so; then cp dllstr.so $(STUBLIBDIR)/dllstr.so; fi
diff --git a/otherlibs/str/Makefile.Mac b/otherlibs/str/Makefile.Mac
index a4b65258b..c5345acd0 100644
--- a/otherlibs/str/Makefile.Mac
+++ b/otherlibs/str/Makefile.Mac
@@ -17,12 +17,11 @@
 
 # Compilation options
 PPCC = mrc
-PPCCOptions = -i {REGEXLIB},:::byterun:,:::config: -w 7 {cdbgflag}
+PPCCOptions = -i :::byterun:,:::config: -w 7 {cdbgflag}
 
 CAMLC = :::boot:ocamlrun :::ocamlc -I :::stdlib:
-REGEXLIB = :regex-0.12:
 
-PPCCOBJS = strstubs.c.x {REGEXLIB}regex.c.x
+PPCCOBJS = strstubs.c.x
 
 all � libstr.x str.cmi str.cma
 
@@ -32,15 +31,11 @@ libstr.x � {PPCCOBJS}
 str.cma � str.cmo
     {CAMLC} -a -o str.cma str.cmo
 
-{REGEXLIB}regex.c.x � {REGEXLIB}regex.c {REGEXLIB}regex.h
-    directory {REGEXLIB}; domake; directory ::
-
 partialclean �
     delete -i �.cm[aio] || set status 0
 
 clean � partialclean
     delete -i �.x || set status 0
-    directory {REGEXLIB}; domake distclean; directory ::
 
 install �
     duplicate -y libstr.x str.cma str.cmi "{LIBDIR}"
diff --git a/otherlibs/str/Makefile.nt b/otherlibs/str/Makefile.nt
index ec7ac0559..3d65d19f0 100644
--- a/otherlibs/str/Makefile.nt
+++ b/otherlibs/str/Makefile.nt
@@ -19,13 +19,11 @@ include ../../config/Makefile
 
 # Compilation options
 CC=$(BYTECC)
-CFLAGS=-I$(REGEXLIB) -I../../byterun
+CFLAGS=-I../../byterun
 CAMLC=../../boot/ocamlrun ../../ocamlc -I ../../boot
 CAMLOPT=../../boot/ocamlrun ../../ocamlopt -I ../../stdlib
-REGEXLIB=regex-0.12
-REGEXFLAGS=-DREGEX_MALLOC -DSTDC_HEADERS
-DCOBJS=strstubs.$(DO) $(REGEXLIB)/regex.$(DO)
-SCOBJS=strstubs.$(SO) $(REGEXLIB)/regex.$(SO)
+DCOBJS=strstubs.$(DO)
+SCOBJS=strstubs.$(SO)
 
 all: dllstr.dll libstr.$(A) str.cmi str.cma
 
@@ -44,14 +42,6 @@ str.cma: str.cmo
 str.cmxa: str.cmx
 	$(CAMLOPT) -a -o str.cmxa str.cmx -cclib -lstr
 
-$(REGEXLIB)/regex.$(DO): $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h
-	cd $(REGEXLIB); $(CC) $(REGEXFLAGS) $(DLLCCCOMPOPTS) -c regex.c
-	mv $(REGEXLIB)/regex.$(O) $(REGEXLIB)/regex.$(DO)
-
-$(REGEXLIB)/regex.$(SO): $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h
-	cd $(REGEXLIB); $(CC) $(REGEXFLAGS) $(BYTECCCOMPOPTS) -c regex.c
-	mv $(REGEXLIB)/regex.$(O) $(REGEXLIB)/regex.$(SO)
-
 str.cmx: ../../ocamlopt
 
 partialclean:
@@ -59,7 +49,6 @@ partialclean:
 
 clean: partialclean
 	rm -f *.$(A) *.dll *.$(O) *.$(SO)
-	rm -f $(REGEXLIB)/*.$(O)
 
 install:
 	cp dllstr.dll $(STUBLIBDIR)/dllstr.dll
diff --git a/otherlibs/str/str.ml b/otherlibs/str/str.ml
index d20de2647..178d3ff2c 100644
--- a/otherlibs/str/str.ml
+++ b/otherlibs/str/str.ml
@@ -13,17 +13,485 @@
 
 (* $Id$ *)
 
-type regexp
-
-external compile_regexp: string -> bool -> regexp = "str_compile_regexp"
-external string_match: regexp -> string -> int -> bool = "str_string_match"
-external string_partial_match: regexp -> string -> int -> bool =
-  "str_string_partial_match"
-external search_forward: regexp -> string -> int -> int = "str_search_forward"
-external search_backward: regexp -> string -> int -> int = "str_search_backward"
-external beginning_group: int -> int = "str_beginning_group"
-external end_group: int -> int = "str_end_group"
-external replacement_text: string -> string -> string = "str_replacement_text"
+(** String utilities *)
+
+let string_before s n = String.sub s 0 n
+
+let string_after s n = String.sub s n (String.length s - n)
+
+let first_chars s n = String.sub s 0 n
+
+let last_chars s n = String.sub s (String.length s - n) n
+
+(** Representation of character sets **)
+
+module Charset =
+  struct
+    type t = string (* of length 32 *)
+
+    let empty = String.make 32 '\000'
+    let full = String.make 32 '\255'
+
+    let make_empty () = String.make 32 '\000'
+
+    let add s c =
+      let i = Char.code c in
+      s.[i lsr 3] <- Char.chr(Char.code s.[i lsr 3] lor (1 lsl (i land 7)))
+
+    let add_range s c1 c2 =
+      for i = Char.code c1 to Char.code c2 do add s (Char.chr i) done
+
+    let singleton c =
+      let s = make_empty () in add s c; s
+
+    let range c1 c2 =
+      let s = make_empty () in add_range s c1 c2; s
+
+    let complement s =
+      let r = String.create 32 in
+      for i = 0 to 31 do
+        r.[i] <- Char.chr(Char.code s.[i] lxor 0xFF)
+      done;
+      r
+
+    let union s1 s2 =
+      let r = String.create 32 in
+      for i = 0 to 31 do
+        r.[i] <- Char.chr(Char.code s1.[i] lor Char.code s2.[i])
+      done;
+      r
+
+    let disjoint s1 s2 =
+      try
+        for i = 0 to 31 do
+          if Char.code s1.[i] land Char.code s2.[i] <> 0 then raise Exit
+        done;
+        true
+      with Exit ->
+        false
+
+    let iter fn s =
+      for i = 0 to 31 do
+        let c = Char.code s.[i] in
+        if c <> 0 then
+          for j = 0 to 7 do
+            if c land (1 lsl j) <> 0 then fn (Char.chr ((i lsl 3) + j))
+          done
+      done
+
+    let expand s =
+      let r = String.make 256 '\000' in
+      iter (fun c -> r.[Char.code c] <- '\001') s;
+      r
+
+    let fold_case s =
+      let r = make_empty() in
+      iter (fun c -> add r (Char.lowercase c); add r (Char.uppercase c)) s;
+      r
+
+  end
+
+(** Abstract syntax tree for regular expressions *)
+
+type re_syntax =
+    Char of char
+  | String of string
+  | CharClass of Charset.t
+  | Seq of re_syntax list
+  | Alt of re_syntax * re_syntax
+  | Star of re_syntax
+  | Plus of re_syntax
+  | Option of re_syntax
+  | Group of int * re_syntax
+  | Refgroup of int
+  | Bol
+  | Eol
+  | Wordboundary
+
+(** Representation of compiled regular expressions *)
+
+type regexp = {
+  prog: int array;         (* bytecode instructions *)
+  cpool: string array;     (* constant pool (string literals) *)
+  normtable: string;       (* case folding table (if any) *)
+  numgroups: int;          (* number of \(...\) groups *)
+  startchars: int          (* index of set of starting chars, or -1 if none *)
+}
+
+(** Opcodes for bytecode instructions; see strstubs.c for description *)
+
+let op_CHAR = 0
+let op_CHARNORM = 1
+let op_STRING = 2
+let op_STRINGNORM = 3
+let op_CHARCLASS = 4
+let op_BOL = 5
+let op_EOL = 6
+let op_WORDBOUNDARY = 7
+let op_BEGGROUP = 8
+let op_ENDGROUP = 9
+let op_REFGROUP = 10
+let op_ACCEPT = 11
+let op_SIMPLEOPT = 12
+let op_SIMPLESTAR = 13
+let op_SIMPLEPLUS = 14
+let op_GOTO = 15
+let op_PUSHBACK = 16
+
+(* Encoding of bytecode instructions *)
+
+let instr opc arg = opc lor (arg lsl 8)
+
+(* Computing relative displacements for GOTO and PUSHBACK instructions *)
+
+let displ dest from = dest - from - 1
+
+(** Compilation of a regular expression *)
+
+(* first r returns a set of characters C such that:
+     for all string s, s matches r => the first character of s is in C *)
+
+let rec first = function
+    Char c -> Charset.singleton c
+  | String s -> if s = "" then Charset.full else Charset.singleton s.[0]
+  | CharClass cl -> cl
+  | Seq rl -> first_seq rl
+  | Alt (r1, r2) -> Charset.union (first r1) (first r2)
+  | Star r -> Charset.full
+  | Plus r -> first r
+  | Option r -> Charset.full
+  | Group(n, r) -> first r
+  | Refgroup n -> Charset.full
+  | Bol -> Charset.full
+  | Eol -> Charset.full
+  | Wordboundary -> Charset.full
+
+and first_seq = function
+    [] -> Charset.full
+  | (Bol | Eol | Wordboundary) :: rl -> first_seq rl
+  | Star r :: rl -> Charset.union (first r) (first_seq rl)
+  | Option r :: rl -> Charset.union (first r) (first_seq rl)
+  | r :: rl -> first r
+
+(* Transform a Char or CharClass regexp into a character class *)
+
+let charclass_of_regexp fold_case re =
+  let cl =
+    match re with
+      Char c -> Charset.singleton c
+    | CharClass cl -> cl
+    | _ -> assert false in
+  if fold_case then Charset.fold_case cl else cl
+
+(* The case fold table: maps characters to their lowercase equivalent *)
+
+let fold_case_table =
+  let t = String.create 256 in
+  for i = 0 to 255 do t.[i] <- Char.lowercase(Char.chr i) done;
+  t
+
+module StringMap = Map.Make(struct type t = string let compare = compare end)
+
+(* Compilation of a regular expression *)
+
+let compile fold_case re =
+
+  (* Instruction buffering *)
+  let prog = ref (Array.make 32 0)
+  and progpos = ref 0
+  and cpool = ref StringMap.empty
+  and cpoolpos = ref 0
+  and numgroups = ref 1 in
+  (* Add a new instruction *)
+  let emit_instr opc arg =
+    if !progpos >= Array.length !prog then begin
+      let nprog = Array.make (2 * Array.length !prog) 0 in
+      Array.blit !prog 0 nprog 0 (Array.length !prog);
+      prog := nprog
+    end;
+    (!prog).(!progpos) <- (instr opc arg);
+    incr progpos in
+  (* Reserve an instruction slot and return its position *)
+  let emit_hole () =
+    let p = !progpos in incr progpos; p in
+  (* Fill a reserved instruction slot with a GOTO or PUSHBACK instruction *)
+  let patch_instr pos opc dest =
+    (!prog).(pos) <- (instr opc (displ dest pos)) in
+  (* Return the cpool index for the given string, adding it if not
+     already there *)
+  let cpool_index s =
+    try
+      StringMap.find s !cpool
+    with Not_found ->
+      let p = !cpoolpos in
+      cpool := StringMap.add s p !cpool;
+      incr cpoolpos;
+      p in
+  (* Main recursive compilation function *)
+  let rec emit_code = function
+    Char c ->
+      if fold_case then
+        emit_instr op_CHARNORM (Char.code (Char.lowercase c))
+      else
+        emit_instr op_CHAR (Char.code c)
+  | String s ->
+      begin match String.length s with
+        0 -> ()
+      | 1 ->
+        if fold_case then
+          emit_instr op_CHARNORM (Char.code (Char.lowercase s.[0]))
+        else
+          emit_instr op_CHAR (Char.code s.[0])
+      | _ ->
+        try
+          (* null characters are not accepted by the STRING* instructions;
+             if one is found, split string at null character *)
+          let i = String.index s '\000' in
+          emit_code (String (string_before s i));
+          emit_instr op_CHAR 0;
+          emit_code (String (string_after s (i+1)))
+        with Not_found ->
+          if fold_case then
+            emit_instr op_STRINGNORM (cpool_index (String.lowercase s))
+          else
+            emit_instr op_STRING (cpool_index s)
+      end
+  | CharClass cl ->
+      let cl' = if fold_case then Charset.fold_case cl else cl in
+      emit_instr op_CHARCLASS (cpool_index cl')
+  | Seq rl ->
+      emit_seq_code rl
+  | Alt(r1, r2) ->
+      (*      PUSHBACK lbl1
+              <match r1>
+              GOTO lbl2
+        lbl1: <match r2>
+        lbl2: ... *)
+      let pos_pushback = emit_hole() in
+      emit_code r1;
+      let pos_goto_end = emit_hole() in
+      let lbl1 = !progpos in
+      emit_code r2;
+      let lbl2 = !progpos in
+      patch_instr pos_pushback op_PUSHBACK lbl1;
+      patch_instr pos_goto_end op_GOTO lbl2
+  | Star r ->
+      (* Implement longest match semantics for compatibility with old Str *)
+      (* lbl1: PUSHBACK lbl2
+               <match r>
+               GOTO lbl1
+         lbl2:
+      *)
+      let lbl1 = emit_hole() in
+      emit_code r;
+      emit_instr op_GOTO (displ lbl1 !progpos);
+      let lbl2 = !progpos in
+      patch_instr lbl1 op_PUSHBACK lbl2
+  | Plus r ->
+      (* Implement longest match semantics for compatibility with old Str *)
+      (* lbl1: <match r>
+               PUSHBACK lbl2
+               GOTO lbl1
+         lbl2:
+      *)
+      let lbl1 = !progpos in
+      emit_code r;
+      let pos_pushback = emit_hole() in
+      emit_instr op_GOTO (displ lbl1 !progpos);
+      let lbl2 = !progpos in
+      patch_instr pos_pushback op_PUSHBACK lbl2
+  | Option r ->
+      (* Implement longest match semantics for compatibility with old Str *)
+      (*      PUSHBACK lbl
+              <match r>
+         lbl:
+      *)
+      let pos_pushback = emit_hole() in
+      emit_code r;
+      let lbl = !progpos in
+      patch_instr pos_pushback op_PUSHBACK lbl
+  | Group(n, r) -> 
+      if n >= 32 then failwith "too many \\(...\\) groups";
+      emit_instr op_BEGGROUP n;
+      emit_code r;
+      emit_instr op_ENDGROUP n;
+      numgroups := max !numgroups (n+1)
+  | Refgroup n ->
+      emit_instr op_REFGROUP n      
+  | Bol ->
+      emit_instr op_BOL 0
+  | Eol ->
+      emit_instr op_EOL 0
+  | Wordboundary ->
+      emit_instr op_WORDBOUNDARY 0
+
+  and emit_seq_code = function
+    [] -> ()
+  | Star(Char _ | CharClass _ as r) :: rl
+    when disjoint_modulo_case (first r) (first_seq rl) ->
+      emit_instr op_SIMPLESTAR (cpool_index (charclass_of_regexp fold_case r));
+      emit_seq_code rl
+  | Plus(Char _ | CharClass _ as r) :: rl
+    when disjoint_modulo_case (first r) (first_seq rl) ->
+      emit_instr op_SIMPLEPLUS (cpool_index (charclass_of_regexp fold_case r));
+      emit_seq_code rl
+  | Option(Char _ | CharClass _ as r) :: rl
+    when disjoint_modulo_case (first r) (first_seq rl) ->
+      emit_instr op_SIMPLEOPT (cpool_index (charclass_of_regexp fold_case r));
+      emit_seq_code rl
+  | r :: rl ->
+      emit_code r;
+      emit_seq_code rl
+
+  and disjoint_modulo_case c1 c2 =
+    if fold_case
+    then Charset.disjoint (Charset.fold_case c1) (Charset.fold_case c2)
+    else Charset.disjoint c1 c2
+  in
+
+  emit_code re;
+  emit_instr op_ACCEPT 0;
+  let start = first re in
+  let start' = if fold_case then Charset.fold_case start else start in
+  let start_pos =
+    if start = Charset.full
+    then -1
+    else cpool_index (Charset.expand start') in
+  let constantpool = Array.make !cpoolpos "" in
+  StringMap.iter (fun str idx -> constantpool.(idx) <- str) !cpool;
+  { prog = Array.sub !prog 0 !progpos;
+    cpool = constantpool;
+    normtable = if fold_case then fold_case_table else "";
+    numgroups = !numgroups;
+    startchars = start_pos }
+
+(** Parsing of a regular expression *)
+
+(* Efficient buffering of sequences *)
+
+module SeqBuffer = struct
+
+  type t = { sb_chars: Buffer.t; mutable sb_next: re_syntax list }
+
+  let create() = { sb_chars = Buffer.create 16; sb_next = [] }
+
+  let flush buf =
+    let s = Buffer.contents buf.sb_chars in
+    Buffer.clear buf.sb_chars;
+    match String.length s with
+      0 -> ()
+    | 1 -> buf.sb_next <- Char s.[0] :: buf.sb_next
+    | _ -> buf.sb_next <- String s :: buf.sb_next
+
+  let add buf re =
+    match re with
+      Char c -> Buffer.add_char buf.sb_chars c
+    | _ -> flush buf; buf.sb_next <- re :: buf.sb_next
+
+  let extract buf =
+    flush buf; Seq(List.rev buf.sb_next)
+
+end
+
+(* The character class corresponding to `.' *)
+
+let dotclass = Charset.complement (Charset.singleton '\n')
+
+(* Parse a regular expression *)
+
+let parse s =
+  let len = String.length s in
+  let group_counter = ref 1 in
+
+  let rec regexp0 i =
+    let (r, j) = regexp1 i in
+    regexp0cont r j
+  and regexp0cont r1 i =
+    if i + 2 <= len && s.[i] = '\\' && s.[i+1] = '|' then
+      let (r2, j) = regexp1 (i+2) in
+      regexp0cont (Alt(r1, r2)) j
+    else
+      (r1, i)
+  and regexp1 i =
+    regexp1cont (SeqBuffer.create()) i
+  and regexp1cont sb i =
+    if i >= len
+    || i + 2 <= len && s.[i] = '\\' && (let c = s.[i+1] in c = '|' || c = ')')
+    then
+      (SeqBuffer.extract sb, i)
+    else
+      let (r, j) = regexp2 i in
+      SeqBuffer.add sb r;
+      regexp1cont sb j
+  and regexp2 i =
+    let (r, j) = regexp3 i in
+    regexp2cont r j
+  and regexp2cont r i =
+    if i >= len then (r, i) else
+      match s.[i] with
+        '?' -> regexp2cont (Option r) (i+1)
+      | '*' -> regexp2cont (Star r) (i+1)
+      | '+' -> regexp2cont (Plus r) (i+1)
+      |  _  -> (r, i)
+  and regexp3 i =
+    match s.[i] with
+      '\\' -> regexpbackslash (i+1)
+    | '['  -> let (c, j) = regexpclass0 (i+1) in (CharClass c, j)
+    | '^'  -> (Bol, i+1)
+    | '$'  -> (Eol, i+1)
+    | '.'  -> (CharClass dotclass, i+1)
+    | c    -> (Char c, i+1)
+  and regexpbackslash i =
+    if i >= len then (Char '\\', i) else
+      match s.[i] with
+        '|' | ')' ->
+          assert false
+      | '(' ->
+          let group_no = !group_counter in
+          if group_no < 32 then incr group_counter;
+          let (r, j) = regexp0 (i+1) in
+          if j + 1 < len && s.[j] = '\\' && s.[j+1] = ')' then
+            if group_no < 32
+            then (Group(group_no, r), j + 2)
+            else (r, j + 2)
+          else
+            failwith "\\( group not closed by \\)"
+      | '1' .. '9' as c ->
+          (Refgroup(Char.code c - 48), i + 1)
+      | 'b' ->
+          (Wordboundary, i + 1)
+      | c ->
+          (Char c, i + 1)
+  and regexpclass0 i =
+    if i < len && s.[i] = '^'
+    then let (c, j) = regexpclass1 (i+1) in (Charset.complement c, j)
+    else regexpclass1 i
+  and regexpclass1 i =
+    let c = Charset.make_empty() in
+    let j = regexpclass2 c i i in
+    (c, j)
+  and regexpclass2 c start i =
+    if i >= len then failwith "[ class not closed by ]";
+    if s.[i] = ']' && i > start then i+1 else begin
+      let c1 = s.[i] in
+      if i+2 < len && s.[i+1] = '-' then begin
+        let c2 = s.[i+2] in
+        Charset.add_range c c1 c2;
+        regexpclass2 c start (i+3)
+      end else begin
+        Charset.add c c1;
+        regexpclass2 c start (i+1)
+      end
+    end in
+
+  let (r, j) = regexp0 0 in
+  if j = len then r else failwith "spurious \\) in regular expression"
+
+(** Parsing and compilation *)
+
+let regexp e = compile false (parse e)
+
+let regexp_case_fold e = compile true (parse e)
 
 let quote s =
   let len = String.length s in
@@ -38,42 +506,80 @@ let quote s =
   done;
   String.sub buf 0 !pos
 
-let string_before s n = String.sub s 0 n
+let regexp_string s = compile false (String s) 
 
-let string_after s n = String.sub s n (String.length s - n)
+let regexp_string_case_fold s = compile true (String s) 
 
-let first_chars s n = String.sub s 0 n
+(** Matching functions **)
 
-let last_chars s n = String.sub s (String.length s - n) n
+external re_string_match: regexp -> string -> int -> int array
+     = "re_string_match"
+external re_partial_match: regexp -> string -> int -> int array
+     = "re_partial_match"
+external re_search_forward: regexp -> string -> int -> int array
+     = "re_search_forward"
+external re_search_backward: regexp -> string -> int -> int array
+     = "re_search_backward"
+
+let last_search_result = ref [||]
 
-let regexp e = compile_regexp e false
+let string_match re s pos =
+  let res = re_string_match re s pos in
+  last_search_result := res;
+  Array.length res > 0
 
-let regexp_case_fold e = compile_regexp e true
+let string_partial_match re s pos =
+  let res = re_partial_match re s pos in
+  last_search_result := res;
+  Array.length res > 0
 
-let regexp_string s = compile_regexp (quote s) false
+let search_forward re s pos =
+  let res = re_search_forward re s pos in
+  last_search_result := res;
+  if Array.length res = 0 then raise Not_found else res.(0)
 
-let regexp_string_case_fold s = compile_regexp (quote s) true
+let search_backward re s pos =
+  let res = re_search_backward re s pos in
+  last_search_result := res;
+  if Array.length res = 0 then raise Not_found else res.(0)
 
 let group_beginning n =
-  if n < 0 || n >= 10 then invalid_arg "Str.group_beginning" else
-  let pos = beginning_group n in
-  if pos = -1 then raise Not_found else pos
+  let n2 = n + n in
+  if n < 0 || n2 >= Array.length !last_search_result then
+    invalid_arg "Str.group_beginning"
+  else
+    let pos = !last_search_result.(n2) in
+    if pos = -1 then raise Not_found else pos
 
 let group_end n =
-  if n < 0 || n >= 10 then invalid_arg "Str.group_end" else
-  let pos = end_group n in
-  if pos = -1 then raise Not_found else pos
+  let n2 = n + n in
+  if n < 0 || n2 >= Array.length !last_search_result then
+    invalid_arg "Str.group_end"
+  else
+    let pos = !last_search_result.(n2 + 1) in
+    if pos = -1 then raise Not_found else pos
 
 let matched_group n txt =
-  let b = group_beginning n and e = group_end n in String.sub txt b (e-b)
+  let n2 = n + n in
+  if n < 0 || n2 >= Array.length !last_search_result then
+    invalid_arg "Str.matched_group"
+  else
+    let b = !last_search_result.(n2)
+    and e = !last_search_result.(n2 + 1) in
+    if b = -1 then raise Not_found else String.sub txt b (e - b)
 
-let replace_matched repl matched =
-  replacement_text repl matched
-  
 let match_beginning () = group_beginning 0
 and match_end () = group_end 0
 and matched_string txt = matched_group 0 txt
 
+(** Replacement **)
+
+external re_replacement_text: string -> int array -> string -> string
+    = "re_replacement_text"
+
+let replace_matched repl matched =
+  re_replacement_text repl !last_search_result matched
+
 let substitute_first expr repl_fun text =
   try
     let pos = search_forward expr text 0 in
@@ -99,10 +605,12 @@ let global_substitute expr repl_fun text =
   String.concat "" (replace 0 false)
 
 let global_replace expr repl text =
-  global_substitute expr (replacement_text repl) text
+  global_substitute expr (replace_matched repl) text
 and replace_first expr repl text =
-  substitute_first expr (replacement_text repl) text  
-  
+  substitute_first expr (replace_matched repl) text  
+
+(** Splitting *)
+
 let bounded_split expr text num =
   let start =
     if string_match expr text 0 then match_end() else 0 in
diff --git a/otherlibs/str/str.mli b/otherlibs/str/str.mli
index 4ef6a39dc..c7d9d1366 100644
--- a/otherlibs/str/str.mli
+++ b/otherlibs/str/str.mli
@@ -29,19 +29,19 @@ val regexp : string -> regexp
    [$^.*+?[]]. The following constructs are recognized:
    - [.     ] matches any character except newline
    - [*     ] (postfix) matches the previous expression zero, one or
-     several times
+              several times
    - [+     ] (postfix) matches the previous expression one or
-     several times
+              several times
    - [?     ] (postfix) matches the previous expression once or
-     not at all
+              not at all
    - [[..]  ] character set; ranges are denoted with [-], as in [[a-z]];
-     an initial [^], as in [[^0-9]], complements the set
+              an initial [^], as in [[^0-9]], complements the set
    - [^     ] matches at beginning of line
    - [$     ] matches at end of line
    - [\|    ] (infix) alternative between two expressions
    - [\(..\)] grouping and naming of the enclosed expression
    - [\1    ] the text matched by the first [\(...\)] expression
-     ([\2] for the second expression, etc)
+     ([\2] for the second expression, and so on up to [\9])
    - [\b    ] matches word boundaries
    - [\     ] quotes special characters. *)
 
@@ -66,26 +66,23 @@ val regexp_string_case_fold : string -> regexp
 (** {6 String matching and searching} *)
 
 
-external string_match : regexp -> string -> int -> bool = "str_string_match"
+val string_match : regexp -> string -> int -> bool
 (** [string_match r s start] tests whether the characters in [s]
    starting at position [start] match the regular expression [r].
    The first character of a string has position [0], as usual. *)
 
-external search_forward :
-  regexp -> string -> int -> int = "str_search_forward"
+val search_forward : regexp -> string -> int -> int
 (** [search_forward r s start] searchs the string [s] for a substring
    matching the regular expression [r]. The search starts at position
    [start] and proceeds towards the end of the string.
    Return the position of the first character of the matched
    substring, or raise [Not_found] if no substring matches. *)
 
-external search_backward :
-  regexp -> string -> int -> int = "str_search_backward"
+val search_backward : regexp -> string -> int -> int
 (** Same as {!Str.search_forward}, but the search proceeds towards the
    beginning of the string. *)
 
-external string_partial_match :
-  regexp -> string -> int -> bool = "str_string_partial_match"
+val string_partial_match : regexp -> string -> int -> bool
 (** Similar to {!Str.string_match}, but succeeds whenever the argument
    string is a prefix of a string that matches.  This includes
    the case of a true complete match. *)
@@ -162,12 +159,12 @@ val substitute_first : regexp -> (string -> string) -> string -> string
 (** Same as {!Str.global_substitute}, except that only the first substring
    matching the regular expression is replaced. *)
 
+val replace_matched : string -> string -> string
 (** [replace_matched repl s] returns the replacement text [repl]
    in which [\1], [\2], etc. have been replaced by the text
    matched by the corresponding groups in the most recent matching
    operation.  [s] must be the same string that was matched during
    this matching operation. *)     
-val replace_matched : string -> string -> string
         
 
 (** {6 Splitting} *)
@@ -195,10 +192,7 @@ val split_delim : regexp -> string -> string list
 val bounded_split_delim : regexp -> string -> int -> string list
 (** Same as {!Str.bounded_split}, but occurrences of the
    delimiter at the beginning and at the end of the string are
-   recognized and returned as empty strings in the result.
-   For instance, [split_delim (regexp " ") " abc "]
-   returns [[""; "abc"; ""]], while [split] with the same
-   arguments returns [["abc"]]. *)
+   recognized and returned as empty strings in the result. *)
 
 type split_result = 
     Text of string
@@ -216,9 +210,7 @@ val bounded_full_split : regexp -> string -> int -> split_result list
 (** Same as {!Str.bounded_split_delim}, but returns
    the delimiters as well as the substrings contained between
    delimiters.  The former are tagged [Delim] in the result list;
-   the latter are tagged [Text].  For instance,
-   [full_split (regexp "[{}]") "{ab}"] returns
-   [[Delim "{"; Text "ab"; Delim "}"]]. *)
+   the latter are tagged [Text]. *)
 
 
 (** {6 Extracting substrings} *)
diff --git a/otherlibs/str/strstubs.c b/otherlibs/str/strstubs.c
index 1e70a05f2..f64080f9b 100644
--- a/otherlibs/str/strstubs.c
+++ b/otherlibs/str/strstubs.c
@@ -1,170 +1,413 @@
-#if !macintosh
-#include <sys/types.h>
-#else
-#include <SizeTDef.h>
-#endif
+/***********************************************************************/
+/*                                                                     */
+/*                           Objective Caml                            */
+/*                                                                     */
+/*           Xavier Leroy, projet Cristal, INRIA Rocquencourt          */
+/*                                                                     */
+/*  Copyright 1996 Institut National de Recherche en Informatique et   */
+/*  en Automatique.  All rights reserved.  This file is distributed    */
+/*  under the terms of the GNU Library General Public License, with    */
+/*  the special exception on linking described in file ../../LICENSE.  */
+/*                                                                     */
+/***********************************************************************/
+
+/* $Id$ */
+
+#include <assert.h>
 #include <string.h>
-#include <regex.h>
+#include <ctype.h>
 #include <mlvalues.h>
 #include <alloc.h>
-#include <custom.h>
-#include <fail.h>
 #include <memory.h>
+#include <fail.h>
 
-struct regexp_struct {
-  struct custom_operations * ops;
-  struct re_pattern_buffer re;
+/* The backtracking NFA interpreter */
+
+struct backtrack_point {
+  char * txt;
+  value * pc;
+  int mask;
 };
 
-typedef struct regexp_struct * regexp;
+#define BACKTRACK_STACK_BLOCK_SIZE 500
+
+struct backtrack_stack {
+  struct backtrack_stack * previous;
+  struct backtrack_point point[BACKTRACK_STACK_BLOCK_SIZE];
+};
+
+#define Opcode(x) ((x) & 0xFF)
+#define Arg(x) ((unsigned long)(x) >> 8)
+#define SignedArg(x) ((long)(x) >> 8)
+
+enum {
+  CHAR,       /* match a single character */
+  CHARNORM,   /* match a single character, after normalization */
+  STRING,     /* match a character string */
+  STRINGNORM, /* match a character string, after normalization */
+  CHARCLASS,  /* match a character class */
+  BOL,        /* match at beginning of line */
+  EOL,        /* match at end of line */
+  WORDBOUNDARY, /* match on a word boundary */
+  BEGGROUP,   /* record the beginning of a group */
+  ENDGROUP,   /* record the end of a group */
+  REFGROUP,   /* match a previously matched group */
+  ACCEPT,     /* report success */
+  SIMPLEOPT,  /* match a character class 0 or 1 times */
+  SIMPLESTAR, /* match a character class 0, 1 or several times */
+  SIMPLEPLUS, /* match a character class 1 or several times */
+  GOTO,       /* unconditional branch */
+  PUSHBACK    /* record a backtrack point -- 
+                 where to jump in case of failure */
+};
+
+/* Accessors in a compiled regexp */
+#define Prog(re) Field(re, 0)
+#define Cpool(re) Field(re, 1)
+#define Normtable(re) Field(re, 2)
+#define Numgroups(re) Int_val(Field(re, 3))
+#define Startchars(re) Int_val(Field(re, 4))
+
+/* Record positions of matched groups */
+struct re_group {
+  unsigned char * tentative_start;
+  unsigned char * start;
+  unsigned char * end;
+};
+static struct re_group re_group[32];
 
-static void free_regexp(value vexpr)
+/* Bitvector recording which groups were fully matched */
+static int re_mask;
+
+/* The initial backtracking stack */
+static struct backtrack_stack initial_stack = { NULL, };
+
+/* Free a chained list of backtracking stacks */
+static void free_backtrack_stack(struct backtrack_stack * stack)
 {
-  regexp expr = (regexp) Bp_val(vexpr);
-  expr->re.translate = NULL;
-  re_free(&(expr->re));
+  struct backtrack_stack * prevstack;
+  while ((prevstack = stack->previous) != NULL) {
+    stat_free(stack);
+    stack = prevstack;
+  }
 }
 
-static struct custom_operations regexp_ops = {
-  "_regexp",
-  free_regexp,
-  custom_compare_default,
-  custom_hash_default,
-  custom_serialize_default,
-  custom_deserialize_default
+/* Membership in a bit vector representing a set of booleans */
+#define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1)
+
+/* Determine if a character is a word constituent */
+static unsigned char re_word_letters[32] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 254, 255, 255, 7, 254, 255, 255, 7,
+  0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 127, 255, 255, 255, 127, 255
 };
+#define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1)
+
+/* The bytecode interpreter for the NFA */
 
-static regexp alloc_regexp(void)
+static int re_match(value re, 
+                    unsigned char * starttxt,
+                    register unsigned char * txt,
+                    register unsigned char * endtxt,
+                    int accept_partial_match)
 {
-  value res =
-    alloc_custom(&regexp_ops, sizeof(struct regexp_struct), 1, 10000);
-  return (regexp) res;
-}
+  register value * pc;
+  struct backtrack_stack * stack;
+  struct backtrack_point * sp;
+  value cpool;
+  value normtable;
+  unsigned char c;
 
-#define RE_SYNTAX RE_SYNTAX_EMACS
+  pc = &Field(Prog(re), 0);
+  stack = &initial_stack;
+  sp = stack->point;
+  cpool = Cpool(re);
+  normtable = Normtable(re);
+  re_mask = 0;
+  re_group[0].start = txt;
 
-static char * case_fold_table = NULL;
+  while (1) {
+    long instr = Long_val(*pc++);
+    switch (Opcode(instr)) {
+    case CHAR:
+      if (txt == endtxt) goto prefix_match;
+      if (*txt != Arg(instr)) goto backtrack;
+      txt++;
+      break;
+    case CHARNORM:
+      if (txt == endtxt) goto prefix_match;
+      if (Byte_u(normtable, *txt) != Arg(instr)) goto backtrack;
+      txt++;
+      break;
+    case STRING: {
+      unsigned char * s =
+        (unsigned char *) String_val(Field(cpool, Arg(instr)));
+      while ((c = *s++) != 0) {
+        if (txt == endtxt) goto prefix_match;
+        if (c != *txt) goto backtrack;
+        txt++;
+      }
+      break;
+    }
+    case STRINGNORM: {
+      unsigned char * s =
+        (unsigned char *) String_val(Field(cpool, Arg(instr)));
+      while ((c = *s++) != 0) {
+        if (txt == endtxt) goto prefix_match;
+        if (c != Byte_u(normtable, *txt)) goto backtrack;
+        txt++;
+      }
+      break;
+    }
+    case CHARCLASS:
+      if (txt == endtxt) goto prefix_match;
+      if (! In_bitset(String_val(Field(cpool, Arg(instr))), *txt, c))
+        goto backtrack;
+      txt++;
+      break;
+    case BOL:
+      if (txt > starttxt && txt[-1] != '\n') goto backtrack;
+      break;
+    case EOL:
+      if (txt < endtxt && *txt != '\n') goto backtrack;
+      break;
+    case WORDBOUNDARY:
+      /* At beginning and end of text: no
+         At beginning of text: OK if current char is a letter
+         At end of text: OK if previous char is a letter
+         Otherwise: 
+           OK if previous char is a letter and current char not a letter
+           or previous char is not a letter and current char is a letter */
+      if (txt == starttxt) {
+        if (txt == endtxt) goto prefix_match;
+        if (Is_word_letter(txt[0])) break;
+        goto backtrack;
+      } else if (txt == endtxt) {
+        if (Is_word_letter(txt[-1])) break;
+        goto backtrack;
+      } else {
+        if (Is_word_letter(txt[-1]) != Is_word_letter(txt[0])) break;
+        goto backtrack;
+      }
+    case BEGGROUP: {
+      int group_no = Arg(instr);
+      re_group[group_no].tentative_start = txt;
+      break;
+    }
+    case ENDGROUP: {
+      int group_no = Arg(instr);
+      struct re_group * group = &(re_group[group_no]);
+      group->start = group->tentative_start;
+      group->end = txt;
+      re_mask |= (1 << group_no);
+      break;
+    }
+    case REFGROUP: {
+      int group_no = Arg(instr);
+      struct re_group * group = &(re_group[group_no]);
+      unsigned char * s;
+      if ((re_mask & (1 << group_no)) == 0) goto backtrack;
+      for (s = group->start; s < group->end; s++) {
+        if (txt == endtxt) goto prefix_match;
+        if (*s != *txt) goto backtrack;
+        txt++;
+      }
+      break;
+    }
+    case ACCEPT:
+      goto accept;
+    case SIMPLEOPT: {
+      char * set = String_val(Field(cpool, Arg(instr)));
+      if (txt < endtxt && In_bitset(set, *txt, c)) txt++;
+      break;
+    }
+    case SIMPLESTAR: {
+      char * set = String_val(Field(cpool, Arg(instr)));
+      while (txt < endtxt && In_bitset(set, *txt, c))
+        txt++;
+      break;
+    }
+    case SIMPLEPLUS: {
+      char * set = String_val(Field(cpool, Arg(instr)));
+      if (txt == endtxt) goto prefix_match;
+      if (! In_bitset(set, *txt, c)) goto backtrack;
+      txt++;
+      while (txt < endtxt && In_bitset(set, *txt, c))
+        txt++;
+      break;
+    }
+    case GOTO:
+      pc = pc + SignedArg(instr);
+      break;
+    case PUSHBACK:
+      if (sp == stack->point + BACKTRACK_STACK_BLOCK_SIZE) {
+        struct backtrack_stack * newstack = 
+          stat_alloc(sizeof(struct backtrack_stack));
+        newstack->previous = stack;
+        stack = newstack;
+        sp = stack->point;
+      }
+      sp->txt = txt;
+      sp->pc = pc + SignedArg(instr);
+      sp->mask = re_mask;
+      sp++;
+      break;
+    default:
+      assert(0);
+    }
+    /* Continue with next instruction */
+    continue;
+  prefix_match:
+    /* We get here when matching failed because the end of text
+       was encountered. */
+    if (accept_partial_match) goto accept;
+  backtrack:
+    /* We get here when matching fails.  Backtrack to most recent saved
+       point. */
+    if (sp == stack->point) {
+      struct backtrack_stack * prevstack = stack->previous;
+      if (prevstack == NULL) return 0;
+      stat_free(stack);
+      stack = prevstack;
+      sp = stack->point + BACKTRACK_STACK_BLOCK_SIZE;
+    }
+    sp--;
+    txt = sp->txt;
+    pc = sp->pc;
+    re_mask = sp->mask;
+  }  
+ accept:
+  /* We get here when the regexp was successfully matched */
+  free_backtrack_stack(stack);
+  re_group[0].end = txt;
+  re_mask |= 1;
+  return 1;
+}
+
+/* Allocate an integer array containing the positions of the matched groups.
+   Beginning of group #N is at 2N, end is at 2N+1.
+   Take position = -1 when group wasn't matched. */
 
-CAMLprim value str_compile_regexp(value src, value fold)
+static value re_alloc_groups(value re, value str)
 {
-  regexp expr;
-  char * msg;
-
-  Begin_root(src);
-  expr = alloc_regexp();
-  End_roots();
-  re_syntax_options = RE_SYNTAX;
-  if (Bool_val(fold) && case_fold_table == NULL) {
-    int i;
-    case_fold_table = stat_alloc(256);
-    for (i = 0; i <= 255; i++) case_fold_table[i] = i;
-    for (i = 'A'; i <= 'Z'; i++) case_fold_table[i] = i + 32;
-    for (i = 192; i <= 214; i++) case_fold_table[i] = i + 32;
-    for (i = 216; i <= 222; i++) case_fold_table[i] = i + 32;
+  CAMLparam1(str);
+  CAMLlocal1(res);
+  unsigned char * starttxt = (unsigned char *) String_val(str);
+  int n = Numgroups(re);
+  int i;
+
+  res = alloc(n * 2, 0);
+  for (i = 0; i < n; i++) {
+    if ((re_mask & (1 << i)) == 0) {
+      Field(res, i * 2) = Val_int(-1);
+      Field(res, i * 2 + 1) = Val_int(-1);
+    } else {
+      Field(res, i * 2) = Val_long(re_group[i].start - starttxt);
+      Field(res, i * 2 + 1) = Val_long(re_group[i].end - starttxt);
+    }
   }
-  expr->re.translate = Bool_val(fold) ? case_fold_table : NULL;
-  expr->re.fastmap = stat_alloc(256);
-  expr->re.buffer = NULL;
-  expr->re.allocated = 0;
-  msg = (char *) re_compile_pattern(String_val(src), string_length(src),
-                                    &(expr->re));
-  if (msg != NULL) failwith(msg);
-  re_compile_fastmap(&(expr->re));
-  expr->re.regs_allocated = REGS_FIXED;
-  return (value) expr;
+  CAMLreturn(res);
 }
 
-static regoff_t start_regs[10], end_regs[10];
-
-static struct re_registers match_regs = { 10, start_regs, end_regs };
+/* String matching and searching.  All functions return the empty array
+   on failure, and an array of positions on success. */
 
-CAMLprim value str_string_match(regexp expr, value text, value pos)
+CAMLprim value re_string_match(value re, value str, value pos)
 {
-  int len = string_length(text);
-  int start = Int_val(pos);
-  if (start < 0 || start > len)
+  unsigned char * starttxt = &Byte_u(str, 0);
+  unsigned char * txt = &Byte_u(str, Long_val(pos));
+  unsigned char * endtxt = &Byte_u(str, string_length(str));
+
+  if (txt < starttxt || txt > endtxt)
     invalid_argument("Str.string_match");
-  switch (re_match(&(expr->re), String_val(text), len,
-                   start, &match_regs)) {
-  case -2:
-    failwith("Str.string_match");
-  case -1:
-  case -3:
-    return Val_false;
-  default:
-    return Val_true;
+  if (re_match(re, starttxt, txt, endtxt, 0)) {
+    return re_alloc_groups(re, str);
+  } else {
+    return Atom(0);
   }
 }
 
-CAMLprim value str_string_partial_match(regexp expr, value text, value pos)
+CAMLprim value re_partial_match(value re, value str, value pos)
 {
-  int len = string_length(text);
-  int start = Int_val(pos);
-  if (start < 0 || start > len)
+  unsigned char * starttxt = &Byte_u(str, 0);
+  unsigned char * txt = &Byte_u(str, Long_val(pos));
+  unsigned char * endtxt = &Byte_u(str, string_length(str));
+
+  if (txt < starttxt || txt > endtxt)
     invalid_argument("Str.string_partial_match");
-  switch (re_match(&(expr->re), String_val(text), len,
-                   start, &match_regs)) {
-  case -2:
-    failwith("Str.string_partial_match");
-  case -1:
-    return Val_false;
-  default:
-    return Val_true;
+  if (re_match(re, starttxt, txt, endtxt, 1)) {
+    return re_alloc_groups(re, str);
+  } else {
+    return Atom(0);
   }
 }
 
-CAMLprim value str_search_forward(regexp expr, value text, value pos)
+CAMLprim value re_search_forward(value re, value str, value startpos)
 {
-  int res;
-  int len = string_length(text);
-  int start = Int_val(pos);
-  if (start < 0 || start > len)
+  unsigned char * starttxt = &Byte_u(str, 0);
+  unsigned char * txt = &Byte_u(str, Long_val(startpos));
+  unsigned char * endtxt = &Byte_u(str, string_length(str));
+  unsigned char * startchars;
+  unsigned char c;
+
+  if (txt < starttxt || txt > endtxt)
     invalid_argument("Str.search_forward");
-  res = re_search(&(expr->re), String_val(text), len, start, len-start,
-                  &match_regs);
-  switch(res) {
-  case -2:
-    failwith("Str.search_forward");
-  case -1:
-    raise_not_found();
-  default:
-    return Val_int(res);
+  if (Startchars(re) == -1) {
+    do {
+      if (re_match(re, starttxt, txt, endtxt, 0))
+        return re_alloc_groups(re, str);
+      txt++;
+    } while (txt <= endtxt);
+    return Atom(0);
+  } else {
+    startchars =
+      (unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
+    do {
+      while (txt < endtxt && startchars[*txt] == 0) txt++;
+      if (re_match(re, starttxt, txt, endtxt, 0))
+        return re_alloc_groups(re, str);
+      txt++;
+    } while (txt <= endtxt);
+    return Atom(0);
   }
 }
 
-CAMLprim value str_search_backward(regexp expr, value text, value pos)
+CAMLprim value re_search_backward(value re, value str, value startpos)
 {
-  int res;
-  int len = string_length(text);
-  int start = Int_val(pos);
-  if (start < 0 || start > len)
+  unsigned char * starttxt = &Byte_u(str, 0);
+  unsigned char * txt = &Byte_u(str, Long_val(startpos));
+  unsigned char * endtxt = &Byte_u(str, string_length(str));
+  unsigned char * startchars;
+  unsigned char c;
+
+  if (txt < starttxt || txt > endtxt)
     invalid_argument("Str.search_backward");
-  res = re_search(&(expr->re), String_val(text), len, start, -start-1,
-                  &match_regs);
-  switch(res) {
-  case -2:
-    failwith("Str.search_backward");
-  case -1:
-    raise_not_found();
-  default:
-    return Val_int(res);
+  if (Startchars(re) == -1) {
+    do {
+      if (re_match(re, starttxt, txt, endtxt, 0))
+        return re_alloc_groups(re, str);
+      txt--;
+    } while (txt >= starttxt);
+    return Atom(0);
+  } else {
+    startchars =
+      (unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
+    do {
+      while (txt > starttxt && startchars[*txt] == 0) txt--;
+      if (re_match(re, starttxt, txt, endtxt, 0))
+        return re_alloc_groups(re, str);
+      txt--;
+    } while (txt >= starttxt);
+    return Atom(0);
   }
 }
 
-CAMLprim value str_beginning_group(value ngroup)
-{
-  return Val_int(start_regs[Int_val(ngroup)]);
-}
-
-CAMLprim value str_end_group(value ngroup)
-{
-  return Val_int(end_regs[Int_val(ngroup)]);
-}
+/* Replacement */
 
-CAMLprim value str_replacement_text(value repl, value orig)
+CAMLprim value re_replacement_text(value repl, value groups, value orig)
 {
-  value res;
-  mlsize_t len, n;
+  CAMLparam3(repl, groups, orig);
+  CAMLlocal1(res);
+  mlsize_t start, end, len, n;
   char * p, * q;
   int c;
 
@@ -184,15 +427,20 @@ CAMLprim value str_replacement_text(value repl, value orig)
       case '0': case '1': case '2': case '3': case '4':
       case '5': case '6': case '7': case '8': case '9':
         c -= '0';
-        len += end_regs[c] - start_regs[c]; break;
+        if (c*2 >= Wosize_val(groups))
+          failwith("Str.replace: reference to unmatched group");
+        start = Long_val(Field(groups, c*2));
+        end = Long_val(Field(groups, c*2 + 1));
+        if (start == (mlsize_t) -1)
+          failwith("Str.replace: reference to unmatched group");
+        len += end - start;
+        break;
       default:
         len += 2; break;
       }
     }
   }
-  Begin_roots2(orig,repl);
-    res = alloc_string(len);
-  End_roots();
+  res = alloc_string(len);
   p = String_val(repl);
   q = String_val(res);
   n = string_length(repl);
@@ -208,8 +456,10 @@ CAMLprim value str_replacement_text(value repl, value orig)
       case '0': case '1': case '2': case '3': case '4':
       case '5': case '6': case '7': case '8': case '9':
         c -= '0';
-        len = end_regs[c] - start_regs[c];
-        memmove (q, &Byte(orig, start_regs[c]), len);
+        start = Long_val(Field(groups, c*2));
+        end = Long_val(Field(groups, c*2 + 1));
+        len = end - start;
+        memmove (q, &Byte(orig, start), len);
         q += len;
         break;
       default:
@@ -217,6 +467,6 @@ CAMLprim value str_replacement_text(value repl, value orig)
       }
     }
   }
-  return res;
+  CAMLreturn(res);
 }
author	Xavier Leroy <xavier.leroy@inria.fr>	2002-12-09 14:05:18 +0000
committer	Xavier Leroy <xavier.leroy@inria.fr>	2002-12-09 14:05:18 +0000
commit	211fc51518273dc50deeb873dee7c353b203dc3e (patch)
tree	40d5921351c6b6f7b83f2dac46ea0bd2d123c0c2 /otherlibs
parent	11392c1894b4be951325ffb50cd308c1271c6cad (diff)