summaryrefslogtreecommitdiffstats
path: root/otherlibs
diff options
context:
space:
mode:
authorXavier Leroy <xavier.leroy@inria.fr>2002-12-09 14:05:18 +0000
committerXavier Leroy <xavier.leroy@inria.fr>2002-12-09 14:05:18 +0000
commit211fc51518273dc50deeb873dee7c353b203dc3e (patch)
tree40d5921351c6b6f7b83f2dac46ea0bd2d123c0c2 /otherlibs
parent11392c1894b4be951325ffb50cd308c1271c6cad (diff)
Reimplementation complete de la bibliotheque Str en n'utilisant pas GNU regex
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@5324 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
Diffstat (limited to 'otherlibs')
-rw-r--r--otherlibs/str/.depend7
-rw-r--r--otherlibs/str/Makefile9
-rw-r--r--otherlibs/str/Makefile.Mac9
-rw-r--r--otherlibs/str/Makefile.nt17
-rw-r--r--otherlibs/str/str.ml572
-rw-r--r--otherlibs/str/str.mli32
-rw-r--r--otherlibs/str/strstubs.c510
7 files changed, 941 insertions, 215 deletions
diff --git a/otherlibs/str/.depend b/otherlibs/str/.depend
index 4d370653d..70671c2f8 100644
--- a/otherlibs/str/.depend
+++ b/otherlibs/str/.depend
@@ -1,8 +1,5 @@
-strstubs.o: strstubs.c regex-0.12/regex.h ../../byterun/mlvalues.h \
- ../../byterun/config.h ../../config/m.h ../../config/s.h \
- ../../byterun/misc.h ../../byterun/alloc.h ../../byterun/custom.h \
- ../../byterun/fail.h ../../byterun/memory.h ../../byterun/gc.h \
- ../../byterun/major_gc.h ../../byterun/freelist.h \
+strstubs.o: strstubs.c ../../config/s.h ../../byterun/misc.h \
+ ../../byterun/gc.h ../../byterun/major_gc.h ../../byterun/freelist.h \
../../byterun/minor_gc.h
str.cmo: str.cmi
str.cmx: str.cmi
diff --git a/otherlibs/str/Makefile b/otherlibs/str/Makefile
index 22b440c19..97123ddaa 100644
--- a/otherlibs/str/Makefile
+++ b/otherlibs/str/Makefile
@@ -19,12 +19,11 @@ include ../../config/Makefile
# Compilation options
CC=$(BYTECC)
-CFLAGS=-O -I$(REGEXLIB) -I../../byterun $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS)
+CFLAGS=-O -I../../byterun $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS)
CAMLC=../../ocamlcomp.sh
CAMLOPT=../../ocamlcompopt.sh
COMPFLAGS=-warn-error A
-REGEXLIB=regex-0.12
-COBJS=strstubs.o $(REGEXLIB)/regex.o
+COBJS=strstubs.o
MKLIB=../../boot/ocamlrun ../../tools/ocamlmklib
all: libstr.a str.cmi str.cma
@@ -40,9 +39,6 @@ str.cma: str.cmo
str.cmxa: str.cmx
$(MKLIB) -ocamlopt '$(CAMLOPT)' -o str str.cmx
-$(REGEXLIB)/regex.o: $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h
- cd $(REGEXLIB); CC="$(CC) $(BYTECCCOMPOPTS) $(SHAREDCCCOMPOPTS)" sh configure; $(MAKE)
-
str.cmx: ../../ocamlopt
partialclean:
@@ -50,7 +46,6 @@ partialclean:
clean: partialclean
rm -f *.a *.so *.o
- cd $(REGEXLIB); if test -f Makefile; then $(MAKE) distclean; else exit 0; fi
install:
if test -f dllstr.so; then cp dllstr.so $(STUBLIBDIR)/dllstr.so; fi
diff --git a/otherlibs/str/Makefile.Mac b/otherlibs/str/Makefile.Mac
index a4b65258b..c5345acd0 100644
--- a/otherlibs/str/Makefile.Mac
+++ b/otherlibs/str/Makefile.Mac
@@ -17,12 +17,11 @@
# Compilation options
PPCC = mrc
-PPCCOptions = -i {REGEXLIB},:::byterun:,:::config: -w 7 {cdbgflag}
+PPCCOptions = -i :::byterun:,:::config: -w 7 {cdbgflag}
CAMLC = :::boot:ocamlrun :::ocamlc -I :::stdlib:
-REGEXLIB = :regex-0.12:
-PPCCOBJS = strstubs.c.x {REGEXLIB}regex.c.x
+PPCCOBJS = strstubs.c.x
all � libstr.x str.cmi str.cma
@@ -32,15 +31,11 @@ libstr.x � {PPCCOBJS}
str.cma � str.cmo
{CAMLC} -a -o str.cma str.cmo
-{REGEXLIB}regex.c.x � {REGEXLIB}regex.c {REGEXLIB}regex.h
- directory {REGEXLIB}; domake; directory ::
-
partialclean �
delete -i �.cm[aio] || set status 0
clean � partialclean
delete -i �.x || set status 0
- directory {REGEXLIB}; domake distclean; directory ::
install �
duplicate -y libstr.x str.cma str.cmi "{LIBDIR}"
diff --git a/otherlibs/str/Makefile.nt b/otherlibs/str/Makefile.nt
index ec7ac0559..3d65d19f0 100644
--- a/otherlibs/str/Makefile.nt
+++ b/otherlibs/str/Makefile.nt
@@ -19,13 +19,11 @@ include ../../config/Makefile
# Compilation options
CC=$(BYTECC)
-CFLAGS=-I$(REGEXLIB) -I../../byterun
+CFLAGS=-I../../byterun
CAMLC=../../boot/ocamlrun ../../ocamlc -I ../../boot
CAMLOPT=../../boot/ocamlrun ../../ocamlopt -I ../../stdlib
-REGEXLIB=regex-0.12
-REGEXFLAGS=-DREGEX_MALLOC -DSTDC_HEADERS
-DCOBJS=strstubs.$(DO) $(REGEXLIB)/regex.$(DO)
-SCOBJS=strstubs.$(SO) $(REGEXLIB)/regex.$(SO)
+DCOBJS=strstubs.$(DO)
+SCOBJS=strstubs.$(SO)
all: dllstr.dll libstr.$(A) str.cmi str.cma
@@ -44,14 +42,6 @@ str.cma: str.cmo
str.cmxa: str.cmx
$(CAMLOPT) -a -o str.cmxa str.cmx -cclib -lstr
-$(REGEXLIB)/regex.$(DO): $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h
- cd $(REGEXLIB); $(CC) $(REGEXFLAGS) $(DLLCCCOMPOPTS) -c regex.c
- mv $(REGEXLIB)/regex.$(O) $(REGEXLIB)/regex.$(DO)
-
-$(REGEXLIB)/regex.$(SO): $(REGEXLIB)/regex.c $(REGEXLIB)/regex.h
- cd $(REGEXLIB); $(CC) $(REGEXFLAGS) $(BYTECCCOMPOPTS) -c regex.c
- mv $(REGEXLIB)/regex.$(O) $(REGEXLIB)/regex.$(SO)
-
str.cmx: ../../ocamlopt
partialclean:
@@ -59,7 +49,6 @@ partialclean:
clean: partialclean
rm -f *.$(A) *.dll *.$(O) *.$(SO)
- rm -f $(REGEXLIB)/*.$(O)
install:
cp dllstr.dll $(STUBLIBDIR)/dllstr.dll
diff --git a/otherlibs/str/str.ml b/otherlibs/str/str.ml
index d20de2647..178d3ff2c 100644
--- a/otherlibs/str/str.ml
+++ b/otherlibs/str/str.ml
@@ -13,17 +13,485 @@
(* $Id$ *)
-type regexp
-
-external compile_regexp: string -> bool -> regexp = "str_compile_regexp"
-external string_match: regexp -> string -> int -> bool = "str_string_match"
-external string_partial_match: regexp -> string -> int -> bool =
- "str_string_partial_match"
-external search_forward: regexp -> string -> int -> int = "str_search_forward"
-external search_backward: regexp -> string -> int -> int = "str_search_backward"
-external beginning_group: int -> int = "str_beginning_group"
-external end_group: int -> int = "str_end_group"
-external replacement_text: string -> string -> string = "str_replacement_text"
+(** String utilities *)
+
+let string_before s n = String.sub s 0 n
+
+let string_after s n = String.sub s n (String.length s - n)
+
+let first_chars s n = String.sub s 0 n
+
+let last_chars s n = String.sub s (String.length s - n) n
+
+(** Representation of character sets **)
+
+module Charset =
+ struct
+ type t = string (* of length 32 *)
+
+ let empty = String.make 32 '\000'
+ let full = String.make 32 '\255'
+
+ let make_empty () = String.make 32 '\000'
+
+ let add s c =
+ let i = Char.code c in
+ s.[i lsr 3] <- Char.chr(Char.code s.[i lsr 3] lor (1 lsl (i land 7)))
+
+ let add_range s c1 c2 =
+ for i = Char.code c1 to Char.code c2 do add s (Char.chr i) done
+
+ let singleton c =
+ let s = make_empty () in add s c; s
+
+ let range c1 c2 =
+ let s = make_empty () in add_range s c1 c2; s
+
+ let complement s =
+ let r = String.create 32 in
+ for i = 0 to 31 do
+ r.[i] <- Char.chr(Char.code s.[i] lxor 0xFF)
+ done;
+ r
+
+ let union s1 s2 =
+ let r = String.create 32 in
+ for i = 0 to 31 do
+ r.[i] <- Char.chr(Char.code s1.[i] lor Char.code s2.[i])
+ done;
+ r
+
+ let disjoint s1 s2 =
+ try
+ for i = 0 to 31 do
+ if Char.code s1.[i] land Char.code s2.[i] <> 0 then raise Exit
+ done;
+ true
+ with Exit ->
+ false
+
+ let iter fn s =
+ for i = 0 to 31 do
+ let c = Char.code s.[i] in
+ if c <> 0 then
+ for j = 0 to 7 do
+ if c land (1 lsl j) <> 0 then fn (Char.chr ((i lsl 3) + j))
+ done
+ done
+
+ let expand s =
+ let r = String.make 256 '\000' in
+ iter (fun c -> r.[Char.code c] <- '\001') s;
+ r
+
+ let fold_case s =
+ let r = make_empty() in
+ iter (fun c -> add r (Char.lowercase c); add r (Char.uppercase c)) s;
+ r
+
+ end
+
+(** Abstract syntax tree for regular expressions *)
+
+type re_syntax =
+ Char of char
+ | String of string
+ | CharClass of Charset.t
+ | Seq of re_syntax list
+ | Alt of re_syntax * re_syntax
+ | Star of re_syntax
+ | Plus of re_syntax
+ | Option of re_syntax
+ | Group of int * re_syntax
+ | Refgroup of int
+ | Bol
+ | Eol
+ | Wordboundary
+
+(** Representation of compiled regular expressions *)
+
+type regexp = {
+ prog: int array; (* bytecode instructions *)
+ cpool: string array; (* constant pool (string literals) *)
+ normtable: string; (* case folding table (if any) *)
+ numgroups: int; (* number of \(...\) groups *)
+ startchars: int (* index of set of starting chars, or -1 if none *)
+}
+
+(** Opcodes for bytecode instructions; see strstubs.c for description *)
+
+let op_CHAR = 0
+let op_CHARNORM = 1
+let op_STRING = 2
+let op_STRINGNORM = 3
+let op_CHARCLASS = 4
+let op_BOL = 5
+let op_EOL = 6
+let op_WORDBOUNDARY = 7
+let op_BEGGROUP = 8
+let op_ENDGROUP = 9
+let op_REFGROUP = 10
+let op_ACCEPT = 11
+let op_SIMPLEOPT = 12
+let op_SIMPLESTAR = 13
+let op_SIMPLEPLUS = 14
+let op_GOTO = 15
+let op_PUSHBACK = 16
+
+(* Encoding of bytecode instructions *)
+
+let instr opc arg = opc lor (arg lsl 8)
+
+(* Computing relative displacements for GOTO and PUSHBACK instructions *)
+
+let displ dest from = dest - from - 1
+
+(** Compilation of a regular expression *)
+
+(* first r returns a set of characters C such that:
+ for all string s, s matches r => the first character of s is in C *)
+
+let rec first = function
+ Char c -> Charset.singleton c
+ | String s -> if s = "" then Charset.full else Charset.singleton s.[0]
+ | CharClass cl -> cl
+ | Seq rl -> first_seq rl
+ | Alt (r1, r2) -> Charset.union (first r1) (first r2)
+ | Star r -> Charset.full
+ | Plus r -> first r
+ | Option r -> Charset.full
+ | Group(n, r) -> first r
+ | Refgroup n -> Charset.full
+ | Bol -> Charset.full
+ | Eol -> Charset.full
+ | Wordboundary -> Charset.full
+
+and first_seq = function
+ [] -> Charset.full
+ | (Bol | Eol | Wordboundary) :: rl -> first_seq rl
+ | Star r :: rl -> Charset.union (first r) (first_seq rl)
+ | Option r :: rl -> Charset.union (first r) (first_seq rl)
+ | r :: rl -> first r
+
+(* Transform a Char or CharClass regexp into a character class *)
+
+let charclass_of_regexp fold_case re =
+ let cl =
+ match re with
+ Char c -> Charset.singleton c
+ | CharClass cl -> cl
+ | _ -> assert false in
+ if fold_case then Charset.fold_case cl else cl
+
+(* The case fold table: maps characters to their lowercase equivalent *)
+
+let fold_case_table =
+ let t = String.create 256 in
+ for i = 0 to 255 do t.[i] <- Char.lowercase(Char.chr i) done;
+ t
+
+module StringMap = Map.Make(struct type t = string let compare = compare end)
+
+(* Compilation of a regular expression *)
+
+let compile fold_case re =
+
+ (* Instruction buffering *)
+ let prog = ref (Array.make 32 0)
+ and progpos = ref 0
+ and cpool = ref StringMap.empty
+ and cpoolpos = ref 0
+ and numgroups = ref 1 in
+ (* Add a new instruction *)
+ let emit_instr opc arg =
+ if !progpos >= Array.length !prog then begin
+ let nprog = Array.make (2 * Array.length !prog) 0 in
+ Array.blit !prog 0 nprog 0 (Array.length !prog);
+ prog := nprog
+ end;
+ (!prog).(!progpos) <- (instr opc arg);
+ incr progpos in
+ (* Reserve an instruction slot and return its position *)
+ let emit_hole () =
+ let p = !progpos in incr progpos; p in
+ (* Fill a reserved instruction slot with a GOTO or PUSHBACK instruction *)
+ let patch_instr pos opc dest =
+ (!prog).(pos) <- (instr opc (displ dest pos)) in
+ (* Return the cpool index for the given string, adding it if not
+ already there *)
+ let cpool_index s =
+ try
+ StringMap.find s !cpool
+ with Not_found ->
+ let p = !cpoolpos in
+ cpool := StringMap.add s p !cpool;
+ incr cpoolpos;
+ p in
+ (* Main recursive compilation function *)
+ let rec emit_code = function
+ Char c ->
+ if fold_case then
+ emit_instr op_CHARNORM (Char.code (Char.lowercase c))
+ else
+ emit_instr op_CHAR (Char.code c)
+ | String s ->
+ begin match String.length s with
+ 0 -> ()
+ | 1 ->
+ if fold_case then
+ emit_instr op_CHARNORM (Char.code (Char.lowercase s.[0]))
+ else
+ emit_instr op_CHAR (Char.code s.[0])
+ | _ ->
+ try
+ (* null characters are not accepted by the STRING* instructions;
+ if one is found, split string at null character *)
+ let i = String.index s '\000' in
+ emit_code (String (string_before s i));
+ emit_instr op_CHAR 0;
+ emit_code (String (string_after s (i+1)))
+ with Not_found ->
+ if fold_case then
+ emit_instr op_STRINGNORM (cpool_index (String.lowercase s))
+ else
+ emit_instr op_STRING (cpool_index s)
+ end
+ | CharClass cl ->
+ let cl' = if fold_case then Charset.fold_case cl else cl in
+ emit_instr op_CHARCLASS (cpool_index cl')
+ | Seq rl ->
+ emit_seq_code rl
+ | Alt(r1, r2) ->
+ (* PUSHBACK lbl1
+ <match r1>
+ GOTO lbl2
+ lbl1: <match r2>
+ lbl2: ... *)
+ let pos_pushback = emit_hole() in
+ emit_code r1;
+ let pos_goto_end = emit_hole() in
+ let lbl1 = !progpos in
+ emit_code r2;
+ let lbl2 = !progpos in
+ patch_instr pos_pushback op_PUSHBACK lbl1;
+ patch_instr pos_goto_end op_GOTO lbl2
+ | Star r ->
+ (* Implement longest match semantics for compatibility with old Str *)
+ (* lbl1: PUSHBACK lbl2
+ <match r>
+ GOTO lbl1
+ lbl2:
+ *)
+ let lbl1 = emit_hole() in
+ emit_code r;
+ emit_instr op_GOTO (displ lbl1 !progpos);
+ let lbl2 = !progpos in
+ patch_instr lbl1 op_PUSHBACK lbl2
+ | Plus r ->
+ (* Implement longest match semantics for compatibility with old Str *)
+ (* lbl1: <match r>
+ PUSHBACK lbl2
+ GOTO lbl1
+ lbl2:
+ *)
+ let lbl1 = !progpos in
+ emit_code r;
+ let pos_pushback = emit_hole() in
+ emit_instr op_GOTO (displ lbl1 !progpos);
+ let lbl2 = !progpos in
+ patch_instr pos_pushback op_PUSHBACK lbl2
+ | Option r ->
+ (* Implement longest match semantics for compatibility with old Str *)
+ (* PUSHBACK lbl
+ <match r>
+ lbl:
+ *)
+ let pos_pushback = emit_hole() in
+ emit_code r;
+ let lbl = !progpos in
+ patch_instr pos_pushback op_PUSHBACK lbl
+ | Group(n, r) ->
+ if n >= 32 then failwith "too many \\(...\\) groups";
+ emit_instr op_BEGGROUP n;
+ emit_code r;
+ emit_instr op_ENDGROUP n;
+ numgroups := max !numgroups (n+1)
+ | Refgroup n ->
+ emit_instr op_REFGROUP n
+ | Bol ->
+ emit_instr op_BOL 0
+ | Eol ->
+ emit_instr op_EOL 0
+ | Wordboundary ->
+ emit_instr op_WORDBOUNDARY 0
+
+ and emit_seq_code = function
+ [] -> ()
+ | Star(Char _ | CharClass _ as r) :: rl
+ when disjoint_modulo_case (first r) (first_seq rl) ->
+ emit_instr op_SIMPLESTAR (cpool_index (charclass_of_regexp fold_case r));
+ emit_seq_code rl
+ | Plus(Char _ | CharClass _ as r) :: rl
+ when disjoint_modulo_case (first r) (first_seq rl) ->
+ emit_instr op_SIMPLEPLUS (cpool_index (charclass_of_regexp fold_case r));
+ emit_seq_code rl
+ | Option(Char _ | CharClass _ as r) :: rl
+ when disjoint_modulo_case (first r) (first_seq rl) ->
+ emit_instr op_SIMPLEOPT (cpool_index (charclass_of_regexp fold_case r));
+ emit_seq_code rl
+ | r :: rl ->
+ emit_code r;
+ emit_seq_code rl
+
+ and disjoint_modulo_case c1 c2 =
+ if fold_case
+ then Charset.disjoint (Charset.fold_case c1) (Charset.fold_case c2)
+ else Charset.disjoint c1 c2
+ in
+
+ emit_code re;
+ emit_instr op_ACCEPT 0;
+ let start = first re in
+ let start' = if fold_case then Charset.fold_case start else start in
+ let start_pos =
+ if start = Charset.full
+ then -1
+ else cpool_index (Charset.expand start') in
+ let constantpool = Array.make !cpoolpos "" in
+ StringMap.iter (fun str idx -> constantpool.(idx) <- str) !cpool;
+ { prog = Array.sub !prog 0 !progpos;
+ cpool = constantpool;
+ normtable = if fold_case then fold_case_table else "";
+ numgroups = !numgroups;
+ startchars = start_pos }
+
+(** Parsing of a regular expression *)
+
+(* Efficient buffering of sequences *)
+
+module SeqBuffer = struct
+
+ type t = { sb_chars: Buffer.t; mutable sb_next: re_syntax list }
+
+ let create() = { sb_chars = Buffer.create 16; sb_next = [] }
+
+ let flush buf =
+ let s = Buffer.contents buf.sb_chars in
+ Buffer.clear buf.sb_chars;
+ match String.length s with
+ 0 -> ()
+ | 1 -> buf.sb_next <- Char s.[0] :: buf.sb_next
+ | _ -> buf.sb_next <- String s :: buf.sb_next
+
+ let add buf re =
+ match re with
+ Char c -> Buffer.add_char buf.sb_chars c
+ | _ -> flush buf; buf.sb_next <- re :: buf.sb_next
+
+ let extract buf =
+ flush buf; Seq(List.rev buf.sb_next)
+
+end
+
+(* The character class corresponding to `.' *)
+
+let dotclass = Charset.complement (Charset.singleton '\n')
+
+(* Parse a regular expression *)
+
+let parse s =
+ let len = String.length s in
+ let group_counter = ref 1 in
+
+ let rec regexp0 i =
+ let (r, j) = regexp1 i in
+ regexp0cont r j
+ and regexp0cont r1 i =
+ if i + 2 <= len && s.[i] = '\\' && s.[i+1] = '|' then
+ let (r2, j) = regexp1 (i+2) in
+ regexp0cont (Alt(r1, r2)) j
+ else
+ (r1, i)
+ and regexp1 i =
+ regexp1cont (SeqBuffer.create()) i
+ and regexp1cont sb i =
+ if i >= len
+ || i + 2 <= len && s.[i] = '\\' && (let c = s.[i+1] in c = '|' || c = ')')
+ then
+ (SeqBuffer.extract sb, i)
+ else
+ let (r, j) = regexp2 i in
+ SeqBuffer.add sb r;
+ regexp1cont sb j
+ and regexp2 i =
+ let (r, j) = regexp3 i in
+ regexp2cont r j
+ and regexp2cont r i =
+ if i >= len then (r, i) else
+ match s.[i] with
+ '?' -> regexp2cont (Option r) (i+1)
+ | '*' -> regexp2cont (Star r) (i+1)
+ | '+' -> regexp2cont (Plus r) (i+1)
+ | _ -> (r, i)
+ and regexp3 i =
+ match s.[i] with
+ '\\' -> regexpbackslash (i+1)
+ | '[' -> let (c, j) = regexpclass0 (i+1) in (CharClass c, j)
+ | '^' -> (Bol, i+1)
+ | '$' -> (Eol, i+1)
+ | '.' -> (CharClass dotclass, i+1)
+ | c -> (Char c, i+1)
+ and regexpbackslash i =
+ if i >= len then (Char '\\', i) else
+ match s.[i] with
+ '|' | ')' ->
+ assert false
+ | '(' ->
+ let group_no = !group_counter in
+ if group_no < 32 then incr group_counter;
+ let (r, j) = regexp0 (i+1) in
+ if j + 1 < len && s.[j] = '\\' && s.[j+1] = ')' then
+ if group_no < 32
+ then (Group(group_no, r), j + 2)
+ else (r, j + 2)
+ else
+ failwith "\\( group not closed by \\)"
+ | '1' .. '9' as c ->
+ (Refgroup(Char.code c - 48), i + 1)
+ | 'b' ->
+ (Wordboundary, i + 1)
+ | c ->
+ (Char c, i + 1)
+ and regexpclass0 i =
+ if i < len && s.[i] = '^'
+ then let (c, j) = regexpclass1 (i+1) in (Charset.complement c, j)
+ else regexpclass1 i
+ and regexpclass1 i =
+ let c = Charset.make_empty() in
+ let j = regexpclass2 c i i in
+ (c, j)
+ and regexpclass2 c start i =
+ if i >= len then failwith "[ class not closed by ]";
+ if s.[i] = ']' && i > start then i+1 else begin
+ let c1 = s.[i] in
+ if i+2 < len && s.[i+1] = '-' then begin
+ let c2 = s.[i+2] in
+ Charset.add_range c c1 c2;
+ regexpclass2 c start (i+3)
+ end else begin
+ Charset.add c c1;
+ regexpclass2 c start (i+1)
+ end
+ end in
+
+ let (r, j) = regexp0 0 in
+ if j = len then r else failwith "spurious \\) in regular expression"
+
+(** Parsing and compilation *)
+
+let regexp e = compile false (parse e)
+
+let regexp_case_fold e = compile true (parse e)
let quote s =
let len = String.length s in
@@ -38,42 +506,80 @@ let quote s =
done;
String.sub buf 0 !pos
-let string_before s n = String.sub s 0 n
+let regexp_string s = compile false (String s)
-let string_after s n = String.sub s n (String.length s - n)
+let regexp_string_case_fold s = compile true (String s)
-let first_chars s n = String.sub s 0 n
+(** Matching functions **)
-let last_chars s n = String.sub s (String.length s - n) n
+external re_string_match: regexp -> string -> int -> int array
+ = "re_string_match"
+external re_partial_match: regexp -> string -> int -> int array
+ = "re_partial_match"
+external re_search_forward: regexp -> string -> int -> int array
+ = "re_search_forward"
+external re_search_backward: regexp -> string -> int -> int array
+ = "re_search_backward"
+
+let last_search_result = ref [||]
-let regexp e = compile_regexp e false
+let string_match re s pos =
+ let res = re_string_match re s pos in
+ last_search_result := res;
+ Array.length res > 0
-let regexp_case_fold e = compile_regexp e true
+let string_partial_match re s pos =
+ let res = re_partial_match re s pos in
+ last_search_result := res;
+ Array.length res > 0
-let regexp_string s = compile_regexp (quote s) false
+let search_forward re s pos =
+ let res = re_search_forward re s pos in
+ last_search_result := res;
+ if Array.length res = 0 then raise Not_found else res.(0)
-let regexp_string_case_fold s = compile_regexp (quote s) true
+let search_backward re s pos =
+ let res = re_search_backward re s pos in
+ last_search_result := res;
+ if Array.length res = 0 then raise Not_found else res.(0)
let group_beginning n =
- if n < 0 || n >= 10 then invalid_arg "Str.group_beginning" else
- let pos = beginning_group n in
- if pos = -1 then raise Not_found else pos
+ let n2 = n + n in
+ if n < 0 || n2 >= Array.length !last_search_result then
+ invalid_arg "Str.group_beginning"
+ else
+ let pos = !last_search_result.(n2) in
+ if pos = -1 then raise Not_found else pos
let group_end n =
- if n < 0 || n >= 10 then invalid_arg "Str.group_end" else
- let pos = end_group n in
- if pos = -1 then raise Not_found else pos
+ let n2 = n + n in
+ if n < 0 || n2 >= Array.length !last_search_result then
+ invalid_arg "Str.group_end"
+ else
+ let pos = !last_search_result.(n2 + 1) in
+ if pos = -1 then raise Not_found else pos
let matched_group n txt =
- let b = group_beginning n and e = group_end n in String.sub txt b (e-b)
+ let n2 = n + n in
+ if n < 0 || n2 >= Array.length !last_search_result then
+ invalid_arg "Str.matched_group"
+ else
+ let b = !last_search_result.(n2)
+ and e = !last_search_result.(n2 + 1) in
+ if b = -1 then raise Not_found else String.sub txt b (e - b)
-let replace_matched repl matched =
- replacement_text repl matched
-
let match_beginning () = group_beginning 0
and match_end () = group_end 0
and matched_string txt = matched_group 0 txt
+(** Replacement **)
+
+external re_replacement_text: string -> int array -> string -> string
+ = "re_replacement_text"
+
+let replace_matched repl matched =
+ re_replacement_text repl !last_search_result matched
+
let substitute_first expr repl_fun text =
try
let pos = search_forward expr text 0 in
@@ -99,10 +605,12 @@ let global_substitute expr repl_fun text =
String.concat "" (replace 0 false)
let global_replace expr repl text =
- global_substitute expr (replacement_text repl) text
+ global_substitute expr (replace_matched repl) text
and replace_first expr repl text =
- substitute_first expr (replacement_text repl) text
-
+ substitute_first expr (replace_matched repl) text
+
+(** Splitting *)
+
let bounded_split expr text num =
let start =
if string_match expr text 0 then match_end() else 0 in
diff --git a/otherlibs/str/str.mli b/otherlibs/str/str.mli
index 4ef6a39dc..c7d9d1366 100644
--- a/otherlibs/str/str.mli
+++ b/otherlibs/str/str.mli
@@ -29,19 +29,19 @@ val regexp : string -> regexp
[$^.*+?[]]. The following constructs are recognized:
- [. ] matches any character except newline
- [* ] (postfix) matches the previous expression zero, one or
- several times
+ several times
- [+ ] (postfix) matches the previous expression one or
- several times
+ several times
- [? ] (postfix) matches the previous expression once or
- not at all
+ not at all
- [[..] ] character set; ranges are denoted with [-], as in [[a-z]];
- an initial [^], as in [[^0-9]], complements the set
+ an initial [^], as in [[^0-9]], complements the set
- [^ ] matches at beginning of line
- [$ ] matches at end of line
- [\| ] (infix) alternative between two expressions
- [\(..\)] grouping and naming of the enclosed expression
- [\1 ] the text matched by the first [\(...\)] expression
- ([\2] for the second expression, etc)
+ ([\2] for the second expression, and so on up to [\9])
- [\b ] matches word boundaries
- [\ ] quotes special characters. *)
@@ -66,26 +66,23 @@ val regexp_string_case_fold : string -> regexp
(** {6 String matching and searching} *)
-external string_match : regexp -> string -> int -> bool = "str_string_match"
+val string_match : regexp -> string -> int -> bool
(** [string_match r s start] tests whether the characters in [s]
starting at position [start] match the regular expression [r].
The first character of a string has position [0], as usual. *)
-external search_forward :
- regexp -> string -> int -> int = "str_search_forward"
+val search_forward : regexp -> string -> int -> int
(** [search_forward r s start] searchs the string [s] for a substring
matching the regular expression [r]. The search starts at position
[start] and proceeds towards the end of the string.
Return the position of the first character of the matched
substring, or raise [Not_found] if no substring matches. *)
-external search_backward :
- regexp -> string -> int -> int = "str_search_backward"
+val search_backward : regexp -> string -> int -> int
(** Same as {!Str.search_forward}, but the search proceeds towards the
beginning of the string. *)
-external string_partial_match :
- regexp -> string -> int -> bool = "str_string_partial_match"
+val string_partial_match : regexp -> string -> int -> bool
(** Similar to {!Str.string_match}, but succeeds whenever the argument
string is a prefix of a string that matches. This includes
the case of a true complete match. *)
@@ -162,12 +159,12 @@ val substitute_first : regexp -> (string -> string) -> string -> string
(** Same as {!Str.global_substitute}, except that only the first substring
matching the regular expression is replaced. *)
+val replace_matched : string -> string -> string
(** [replace_matched repl s] returns the replacement text [repl]
in which [\1], [\2], etc. have been replaced by the text
matched by the corresponding groups in the most recent matching
operation. [s] must be the same string that was matched during
this matching operation. *)
-val replace_matched : string -> string -> string
(** {6 Splitting} *)
@@ -195,10 +192,7 @@ val split_delim : regexp -> string -> string list
val bounded_split_delim : regexp -> string -> int -> string list
(** Same as {!Str.bounded_split}, but occurrences of the
delimiter at the beginning and at the end of the string are
- recognized and returned as empty strings in the result.
- For instance, [split_delim (regexp " ") " abc "]
- returns [[""; "abc"; ""]], while [split] with the same
- arguments returns [["abc"]]. *)
+ recognized and returned as empty strings in the result. *)
type split_result =
Text of string
@@ -216,9 +210,7 @@ val bounded_full_split : regexp -> string -> int -> split_result list
(** Same as {!Str.bounded_split_delim}, but returns
the delimiters as well as the substrings contained between
delimiters. The former are tagged [Delim] in the result list;
- the latter are tagged [Text]. For instance,
- [full_split (regexp "[{}]") "{ab}"] returns
- [[Delim "{"; Text "ab"; Delim "}"]]. *)
+ the latter are tagged [Text]. *)
(** {6 Extracting substrings} *)
diff --git a/otherlibs/str/strstubs.c b/otherlibs/str/strstubs.c
index 1e70a05f2..f64080f9b 100644
--- a/otherlibs/str/strstubs.c
+++ b/otherlibs/str/strstubs.c
@@ -1,170 +1,413 @@
-#if !macintosh
-#include <sys/types.h>
-#else
-#include <SizeTDef.h>
-#endif
+/***********************************************************************/
+/* */
+/* Objective Caml */
+/* */
+/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */
+/* */
+/* Copyright 1996 Institut National de Recherche en Informatique et */
+/* en Automatique. All rights reserved. This file is distributed */
+/* under the terms of the GNU Library General Public License, with */
+/* the special exception on linking described in file ../../LICENSE. */
+/* */
+/***********************************************************************/
+
+/* $Id$ */
+
+#include <assert.h>
#include <string.h>
-#include <regex.h>
+#include <ctype.h>
#include <mlvalues.h>
#include <alloc.h>
-#include <custom.h>
-#include <fail.h>
#include <memory.h>
+#include <fail.h>
-struct regexp_struct {
- struct custom_operations * ops;
- struct re_pattern_buffer re;
+/* The backtracking NFA interpreter */
+
+struct backtrack_point {
+ char * txt;
+ value * pc;
+ int mask;
};
-typedef struct regexp_struct * regexp;
+#define BACKTRACK_STACK_BLOCK_SIZE 500
+
+struct backtrack_stack {
+ struct backtrack_stack * previous;
+ struct backtrack_point point[BACKTRACK_STACK_BLOCK_SIZE];
+};
+
+#define Opcode(x) ((x) & 0xFF)
+#define Arg(x) ((unsigned long)(x) >> 8)
+#define SignedArg(x) ((long)(x) >> 8)
+
+enum {
+ CHAR, /* match a single character */
+ CHARNORM, /* match a single character, after normalization */
+ STRING, /* match a character string */
+ STRINGNORM, /* match a character string, after normalization */
+ CHARCLASS, /* match a character class */
+ BOL, /* match at beginning of line */
+ EOL, /* match at end of line */
+ WORDBOUNDARY, /* match on a word boundary */
+ BEGGROUP, /* record the beginning of a group */
+ ENDGROUP, /* record the end of a group */
+ REFGROUP, /* match a previously matched group */
+ ACCEPT, /* report success */
+ SIMPLEOPT, /* match a character class 0 or 1 times */
+ SIMPLESTAR, /* match a character class 0, 1 or several times */
+ SIMPLEPLUS, /* match a character class 1 or several times */
+ GOTO, /* unconditional branch */
+ PUSHBACK /* record a backtrack point --
+ where to jump in case of failure */
+};
+
+/* Accessors in a compiled regexp */
+#define Prog(re) Field(re, 0)
+#define Cpool(re) Field(re, 1)
+#define Normtable(re) Field(re, 2)
+#define Numgroups(re) Int_val(Field(re, 3))
+#define Startchars(re) Int_val(Field(re, 4))
+
+/* Record positions of matched groups */
+struct re_group {
+ unsigned char * tentative_start;
+ unsigned char * start;
+ unsigned char * end;
+};
+static struct re_group re_group[32];
-static void free_regexp(value vexpr)
+/* Bitvector recording which groups were fully matched */
+static int re_mask;
+
+/* The initial backtracking stack */
+static struct backtrack_stack initial_stack = { NULL, };
+
+/* Free a chained list of backtracking stacks */
+static void free_backtrack_stack(struct backtrack_stack * stack)
{
- regexp expr = (regexp) Bp_val(vexpr);
- expr->re.translate = NULL;
- re_free(&(expr->re));
+ struct backtrack_stack * prevstack;
+ while ((prevstack = stack->previous) != NULL) {
+ stat_free(stack);
+ stack = prevstack;
+ }
}
-static struct custom_operations regexp_ops = {
- "_regexp",
- free_regexp,
- custom_compare_default,
- custom_hash_default,
- custom_serialize_default,
- custom_deserialize_default
+/* Membership in a bit vector representing a set of booleans */
+#define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1)
+
+/* Determine if a character is a word constituent */
+static unsigned char re_word_letters[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 254, 255, 255, 7, 254, 255, 255, 7,
+ 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 127, 255, 255, 255, 127, 255
};
+#define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1)
+
+/* The bytecode interpreter for the NFA */
-static regexp alloc_regexp(void)
+static int re_match(value re,
+ unsigned char * starttxt,
+ register unsigned char * txt,
+ register unsigned char * endtxt,
+ int accept_partial_match)
{
- value res =
- alloc_custom(&regexp_ops, sizeof(struct regexp_struct), 1, 10000);
- return (regexp) res;
-}
+ register value * pc;
+ struct backtrack_stack * stack;
+ struct backtrack_point * sp;
+ value cpool;
+ value normtable;
+ unsigned char c;
-#define RE_SYNTAX RE_SYNTAX_EMACS
+ pc = &Field(Prog(re), 0);
+ stack = &initial_stack;
+ sp = stack->point;
+ cpool = Cpool(re);
+ normtable = Normtable(re);
+ re_mask = 0;
+ re_group[0].start = txt;
-static char * case_fold_table = NULL;
+ while (1) {
+ long instr = Long_val(*pc++);
+ switch (Opcode(instr)) {
+ case CHAR:
+ if (txt == endtxt) goto prefix_match;
+ if (*txt != Arg(instr)) goto backtrack;
+ txt++;
+ break;
+ case CHARNORM:
+ if (txt == endtxt) goto prefix_match;
+ if (Byte_u(normtable, *txt) != Arg(instr)) goto backtrack;
+ txt++;
+ break;
+ case STRING: {
+ unsigned char * s =
+ (unsigned char *) String_val(Field(cpool, Arg(instr)));
+ while ((c = *s++) != 0) {
+ if (txt == endtxt) goto prefix_match;
+ if (c != *txt) goto backtrack;
+ txt++;
+ }
+ break;
+ }
+ case STRINGNORM: {
+ unsigned char * s =
+ (unsigned char *) String_val(Field(cpool, Arg(instr)));
+ while ((c = *s++) != 0) {
+ if (txt == endtxt) goto prefix_match;
+ if (c != Byte_u(normtable, *txt)) goto backtrack;
+ txt++;
+ }
+ break;
+ }
+ case CHARCLASS:
+ if (txt == endtxt) goto prefix_match;
+ if (! In_bitset(String_val(Field(cpool, Arg(instr))), *txt, c))
+ goto backtrack;
+ txt++;
+ break;
+ case BOL:
+ if (txt > starttxt && txt[-1] != '\n') goto backtrack;
+ break;
+ case EOL:
+ if (txt < endtxt && *txt != '\n') goto backtrack;
+ break;
+ case WORDBOUNDARY:
+ /* At beginning and end of text: no
+ At beginning of text: OK if current char is a letter
+ At end of text: OK if previous char is a letter
+ Otherwise:
+ OK if previous char is a letter and current char not a letter
+ or previous char is not a letter and current char is a letter */
+ if (txt == starttxt) {
+ if (txt == endtxt) goto prefix_match;
+ if (Is_word_letter(txt[0])) break;
+ goto backtrack;
+ } else if (txt == endtxt) {
+ if (Is_word_letter(txt[-1])) break;
+ goto backtrack;
+ } else {
+ if (Is_word_letter(txt[-1]) != Is_word_letter(txt[0])) break;
+ goto backtrack;
+ }
+ case BEGGROUP: {
+ int group_no = Arg(instr);
+ re_group[group_no].tentative_start = txt;
+ break;
+ }
+ case ENDGROUP: {
+ int group_no = Arg(instr);
+ struct re_group * group = &(re_group[group_no]);
+ group->start = group->tentative_start;
+ group->end = txt;
+ re_mask |= (1 << group_no);
+ break;
+ }
+ case REFGROUP: {
+ int group_no = Arg(instr);
+ struct re_group * group = &(re_group[group_no]);
+ unsigned char * s;
+ if ((re_mask & (1 << group_no)) == 0) goto backtrack;
+ for (s = group->start; s < group->end; s++) {
+ if (txt == endtxt) goto prefix_match;
+ if (*s != *txt) goto backtrack;
+ txt++;
+ }
+ break;
+ }
+ case ACCEPT:
+ goto accept;
+ case SIMPLEOPT: {
+ char * set = String_val(Field(cpool, Arg(instr)));
+ if (txt < endtxt && In_bitset(set, *txt, c)) txt++;
+ break;
+ }
+ case SIMPLESTAR: {
+ char * set = String_val(Field(cpool, Arg(instr)));
+ while (txt < endtxt && In_bitset(set, *txt, c))
+ txt++;
+ break;
+ }
+ case SIMPLEPLUS: {
+ char * set = String_val(Field(cpool, Arg(instr)));
+ if (txt == endtxt) goto prefix_match;
+ if (! In_bitset(set, *txt, c)) goto backtrack;
+ txt++;
+ while (txt < endtxt && In_bitset(set, *txt, c))
+ txt++;
+ break;
+ }
+ case GOTO:
+ pc = pc + SignedArg(instr);
+ break;
+ case PUSHBACK:
+ if (sp == stack->point + BACKTRACK_STACK_BLOCK_SIZE) {
+ struct backtrack_stack * newstack =
+ stat_alloc(sizeof(struct backtrack_stack));
+ newstack->previous = stack;
+ stack = newstack;
+ sp = stack->point;
+ }
+ sp->txt = txt;
+ sp->pc = pc + SignedArg(instr);
+ sp->mask = re_mask;
+ sp++;
+ break;
+ default:
+ assert(0);
+ }
+ /* Continue with next instruction */
+ continue;
+ prefix_match:
+ /* We get here when matching failed because the end of text
+ was encountered. */
+ if (accept_partial_match) goto accept;
+ backtrack:
+ /* We get here when matching fails. Backtrack to most recent saved
+ point. */
+ if (sp == stack->point) {
+ struct backtrack_stack * prevstack = stack->previous;
+ if (prevstack == NULL) return 0;
+ stat_free(stack);
+ stack = prevstack;
+ sp = stack->point + BACKTRACK_STACK_BLOCK_SIZE;
+ }
+ sp--;
+ txt = sp->txt;
+ pc = sp->pc;
+ re_mask = sp->mask;
+ }
+ accept:
+ /* We get here when the regexp was successfully matched */
+ free_backtrack_stack(stack);
+ re_group[0].end = txt;
+ re_mask |= 1;
+ return 1;
+}
+
+/* Allocate an integer array containing the positions of the matched groups.
+ Beginning of group #N is at 2N, end is at 2N+1.
+ Take position = -1 when group wasn't matched. */
-CAMLprim value str_compile_regexp(value src, value fold)
+static value re_alloc_groups(value re, value str)
{
- regexp expr;
- char * msg;
-
- Begin_root(src);
- expr = alloc_regexp();
- End_roots();
- re_syntax_options = RE_SYNTAX;
- if (Bool_val(fold) && case_fold_table == NULL) {
- int i;
- case_fold_table = stat_alloc(256);
- for (i = 0; i <= 255; i++) case_fold_table[i] = i;
- for (i = 'A'; i <= 'Z'; i++) case_fold_table[i] = i + 32;
- for (i = 192; i <= 214; i++) case_fold_table[i] = i + 32;
- for (i = 216; i <= 222; i++) case_fold_table[i] = i + 32;
+ CAMLparam1(str);
+ CAMLlocal1(res);
+ unsigned char * starttxt = (unsigned char *) String_val(str);
+ int n = Numgroups(re);
+ int i;
+
+ res = alloc(n * 2, 0);
+ for (i = 0; i < n; i++) {
+ if ((re_mask & (1 << i)) == 0) {
+ Field(res, i * 2) = Val_int(-1);
+ Field(res, i * 2 + 1) = Val_int(-1);
+ } else {
+ Field(res, i * 2) = Val_long(re_group[i].start - starttxt);
+ Field(res, i * 2 + 1) = Val_long(re_group[i].end - starttxt);
+ }
}
- expr->re.translate = Bool_val(fold) ? case_fold_table : NULL;
- expr->re.fastmap = stat_alloc(256);
- expr->re.buffer = NULL;
- expr->re.allocated = 0;
- msg = (char *) re_compile_pattern(String_val(src), string_length(src),
- &(expr->re));
- if (msg != NULL) failwith(msg);
- re_compile_fastmap(&(expr->re));
- expr->re.regs_allocated = REGS_FIXED;
- return (value) expr;
+ CAMLreturn(res);
}
-static regoff_t start_regs[10], end_regs[10];
-
-static struct re_registers match_regs = { 10, start_regs, end_regs };
+/* String matching and searching. All functions return the empty array
+ on failure, and an array of positions on success. */
-CAMLprim value str_string_match(regexp expr, value text, value pos)
+CAMLprim value re_string_match(value re, value str, value pos)
{
- int len = string_length(text);
- int start = Int_val(pos);
- if (start < 0 || start > len)
+ unsigned char * starttxt = &Byte_u(str, 0);
+ unsigned char * txt = &Byte_u(str, Long_val(pos));
+ unsigned char * endtxt = &Byte_u(str, string_length(str));
+
+ if (txt < starttxt || txt > endtxt)
invalid_argument("Str.string_match");
- switch (re_match(&(expr->re), String_val(text), len,
- start, &match_regs)) {
- case -2:
- failwith("Str.string_match");
- case -1:
- case -3:
- return Val_false;
- default:
- return Val_true;
+ if (re_match(re, starttxt, txt, endtxt, 0)) {
+ return re_alloc_groups(re, str);
+ } else {
+ return Atom(0);
}
}
-CAMLprim value str_string_partial_match(regexp expr, value text, value pos)
+CAMLprim value re_partial_match(value re, value str, value pos)
{
- int len = string_length(text);
- int start = Int_val(pos);
- if (start < 0 || start > len)
+ unsigned char * starttxt = &Byte_u(str, 0);
+ unsigned char * txt = &Byte_u(str, Long_val(pos));
+ unsigned char * endtxt = &Byte_u(str, string_length(str));
+
+ if (txt < starttxt || txt > endtxt)
invalid_argument("Str.string_partial_match");
- switch (re_match(&(expr->re), String_val(text), len,
- start, &match_regs)) {
- case -2:
- failwith("Str.string_partial_match");
- case -1:
- return Val_false;
- default:
- return Val_true;
+ if (re_match(re, starttxt, txt, endtxt, 1)) {
+ return re_alloc_groups(re, str);
+ } else {
+ return Atom(0);
}
}
-CAMLprim value str_search_forward(regexp expr, value text, value pos)
+CAMLprim value re_search_forward(value re, value str, value startpos)
{
- int res;
- int len = string_length(text);
- int start = Int_val(pos);
- if (start < 0 || start > len)
+ unsigned char * starttxt = &Byte_u(str, 0);
+ unsigned char * txt = &Byte_u(str, Long_val(startpos));
+ unsigned char * endtxt = &Byte_u(str, string_length(str));
+ unsigned char * startchars;
+ unsigned char c;
+
+ if (txt < starttxt || txt > endtxt)
invalid_argument("Str.search_forward");
- res = re_search(&(expr->re), String_val(text), len, start, len-start,
- &match_regs);
- switch(res) {
- case -2:
- failwith("Str.search_forward");
- case -1:
- raise_not_found();
- default:
- return Val_int(res);
+ if (Startchars(re) == -1) {
+ do {
+ if (re_match(re, starttxt, txt, endtxt, 0))
+ return re_alloc_groups(re, str);
+ txt++;
+ } while (txt <= endtxt);
+ return Atom(0);
+ } else {
+ startchars =
+ (unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
+ do {
+ while (txt < endtxt && startchars[*txt] == 0) txt++;
+ if (re_match(re, starttxt, txt, endtxt, 0))
+ return re_alloc_groups(re, str);
+ txt++;
+ } while (txt <= endtxt);
+ return Atom(0);
}
}
-CAMLprim value str_search_backward(regexp expr, value text, value pos)
+CAMLprim value re_search_backward(value re, value str, value startpos)
{
- int res;
- int len = string_length(text);
- int start = Int_val(pos);
- if (start < 0 || start > len)
+ unsigned char * starttxt = &Byte_u(str, 0);
+ unsigned char * txt = &Byte_u(str, Long_val(startpos));
+ unsigned char * endtxt = &Byte_u(str, string_length(str));
+ unsigned char * startchars;
+ unsigned char c;
+
+ if (txt < starttxt || txt > endtxt)
invalid_argument("Str.search_backward");
- res = re_search(&(expr->re), String_val(text), len, start, -start-1,
- &match_regs);
- switch(res) {
- case -2:
- failwith("Str.search_backward");
- case -1:
- raise_not_found();
- default:
- return Val_int(res);
+ if (Startchars(re) == -1) {
+ do {
+ if (re_match(re, starttxt, txt, endtxt, 0))
+ return re_alloc_groups(re, str);
+ txt--;
+ } while (txt >= starttxt);
+ return Atom(0);
+ } else {
+ startchars =
+ (unsigned char *) String_val(Field(Cpool(re), Startchars(re)));
+ do {
+ while (txt > starttxt && startchars[*txt] == 0) txt--;
+ if (re_match(re, starttxt, txt, endtxt, 0))
+ return re_alloc_groups(re, str);
+ txt--;
+ } while (txt >= starttxt);
+ return Atom(0);
}
}
-CAMLprim value str_beginning_group(value ngroup)
-{
- return Val_int(start_regs[Int_val(ngroup)]);
-}
-
-CAMLprim value str_end_group(value ngroup)
-{
- return Val_int(end_regs[Int_val(ngroup)]);
-}
+/* Replacement */
-CAMLprim value str_replacement_text(value repl, value orig)
+CAMLprim value re_replacement_text(value repl, value groups, value orig)
{
- value res;
- mlsize_t len, n;
+ CAMLparam3(repl, groups, orig);
+ CAMLlocal1(res);
+ mlsize_t start, end, len, n;
char * p, * q;
int c;
@@ -184,15 +427,20 @@ CAMLprim value str_replacement_text(value repl, value orig)
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
c -= '0';
- len += end_regs[c] - start_regs[c]; break;
+ if (c*2 >= Wosize_val(groups))
+ failwith("Str.replace: reference to unmatched group");
+ start = Long_val(Field(groups, c*2));
+ end = Long_val(Field(groups, c*2 + 1));
+ if (start == (mlsize_t) -1)
+ failwith("Str.replace: reference to unmatched group");
+ len += end - start;
+ break;
default:
len += 2; break;
}
}
}
- Begin_roots2(orig,repl);
- res = alloc_string(len);
- End_roots();
+ res = alloc_string(len);
p = String_val(repl);
q = String_val(res);
n = string_length(repl);
@@ -208,8 +456,10 @@ CAMLprim value str_replacement_text(value repl, value orig)
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
c -= '0';
- len = end_regs[c] - start_regs[c];
- memmove (q, &Byte(orig, start_regs[c]), len);
+ start = Long_val(Field(groups, c*2));
+ end = Long_val(Field(groups, c*2 + 1));
+ len = end - start;
+ memmove (q, &Byte(orig, start), len);
q += len;
break;
default:
@@ -217,6 +467,6 @@ CAMLprim value str_replacement_text(value repl, value orig)
}
}
}
- return res;
+ CAMLreturn(res);
}