diff options
author | Xavier Leroy <xavier.leroy@inria.fr> | 2010-05-19 12:22:24 +0000 |
---|---|---|
committer | Xavier Leroy <xavier.leroy@inria.fr> | 2010-05-19 12:22:24 +0000 |
commit | d4d232101d144de0b2af78c002e0a13ecd98152e (patch) | |
tree | 22eb4f0c7e1aa8896c2138c699e73add1b0f5af7 | |
parent | e9f1c5e6dc9bf5a6f78e57ae6f4a94e7e182b648 (diff) |
PR#4874: interpretation of \b in regexps: characters that constitute a word now include 0-9 and _
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@10429 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
-rw-r--r-- | Changes | 5 | ||||
-rw-r--r-- | otherlibs/str/strstubs.c | 13 | ||||
-rw-r--r-- | testsuite/tests/lib-str/t01.ml | 4 |
3 files changed, 18 insertions, 4 deletions
```diff
@@ -81,6 +81,11 @@ Standard library:
 * Random: changed the algorithm to produce better randomness. Now passes the
   DieHard tests.
 
+Other libraries:
+* Str: letters that constitute a word now include digits 0-9 and
+  underscore _. This changes the interpretation of '\b' (word boundary)
+  in regexps, but is more consistent with other regexp libraries. (PR#4874).
+
 Ocamlbuild:
 - Add support for native dynlink.
 
diff --git a/otherlibs/str/strstubs.c b/otherlibs/str/strstubs.c
index ac530ef09..b8d53ff8f 100644
--- a/otherlibs/str/strstubs.c
+++ b/otherlibs/str/strstubs.c
@@ -109,10 +109,19 @@ static void free_backtrack_stack(struct backtrack_stack * stack)
 #define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1)
 
 /* Determine if a character is a word constituent */
+/* PR#4874: word constituent = letter, digit, underscore. */
+
 static unsigned char re_word_letters[32] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 254, 255, 255, 7, 254, 255, 255, 7,
-  0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 127, 255, 255, 255, 127, 255
+  0x00, 0x00, 0x00, 0x00,       /* 0x00-0x1F: none */
+  0x00, 0x00, 0xFF, 0x03,       /* 0x20-0x3F: digits 0-9 */
+  0xFE, 0xFF, 0xFF, 0x87,       /* 0x40-0x5F: A to Z, _ */
+  0xFE, 0xFF, 0xFF, 0x07,       /* 0x60-0x7F: a to z */
+  0x00, 0x00, 0x00, 0x00,       /* 0x80-0x9F: none */
+  0x00, 0x00, 0x00, 0x00,       /* 0xA0-0xBF: none */
+  0xFF, 0xFF, 0x7F, 0xFF,       /* 0xC0-0xDF: Latin-1 accented uppercase */
+  0xFF, 0xFF, 0x7F, 0xFF        /* 0xE0-0xFF: Latin-1 accented lowercase */
 };
+
 #define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1)
 
 /* The bytecode interpreter for the NFA */
diff --git a/testsuite/tests/lib-str/t01.ml b/testsuite/tests/lib-str/t01.ml
index bb266a017..03c85ea40 100644
--- a/testsuite/tests/lib-str/t01.ml
+++ b/testsuite/tests/lib-str/t01.ml
@@ -588,7 +588,7 @@ let automated_test() =
   start_test "Search for /\\ba/";
   let r = Str.regexp "\\ba" in
   let n = 0 in
-  test_search_forward r n "abcd"
+  test_search_forward r n "a2cd"
     [|"a"|];
   test_search_forward r n "the a"
     [|"a"|];
@@ -606,7 +606,7 @@ let automated_test() =
   let n = 0 in
   test_search_forward r n "a"
     [|"a"|];
-  test_search_forward r n "bcda"
+  test_search_forward r n "bc_a"
     [|"a"|];
   test_search_forward r n "a foo"
     [|"a"|];
```