summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Xavier Leroy <xavier.leroy@inria.fr> 2010-05-19 12:22:24 +0000
committer Xavier Leroy <xavier.leroy@inria.fr> 2010-05-19 12:22:24 +0000
commit d4d232101d144de0b2af78c002e0a13ecd98152e (patch)
tree 22eb4f0c7e1aa8896c2138c699e73add1b0f5af7
parent e9f1c5e6dc9bf5a6f78e57ae6f4a94e7e182b648 (diff)
PR#4874: interpretation of \b in regexps: characters that constitute a word now include 0-9 and _
git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@10429 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
-rw-r--r-- Changes 5
-rw-r--r-- otherlibs/str/strstubs.c 13
-rw-r--r-- testsuite/tests/lib-str/t01.ml 4
3 files changed, 18 insertions, 4 deletions
diff --git a/Changes b/Changes
index 48edc3aab..ac39b6faa 100644
--- a/Changes
+++ b/Changes
@@ -81,6 +81,11 @@ Standard library:
* Random: changed the algorithm to produce better randomness. Now passes the
DieHard tests.
+Other libraries:
+* Str: letters that constitute a word now include digits 0-9 and
+ underscore _. This changes the interpretation of '\b' (word boundary)
+ in regexps, but is more consistent with other regexp libraries. (PR#4874).
+
Ocamlbuild:
- Add support for native dynlink.
diff --git a/otherlibs/str/strstubs.c b/otherlibs/str/strstubs.c
index ac530ef09..b8d53ff8f 100644
--- a/otherlibs/str/strstubs.c
+++ b/otherlibs/str/strstubs.c
@@ -109,10 +109,19 @@ static void free_backtrack_stack(struct backtrack_stack * stack)
#define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1)
/* Determine if a character is a word constituent */
+/* PR#4874: word constituent = letter, digit, underscore. */
+
static unsigned char re_word_letters[32] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 254, 255, 255, 7, 254, 255, 255, 7,
- 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 127, 255, 255, 255, 127, 255
+ 0x00, 0x00, 0x00, 0x00, /* 0x00-0x1F: none */
+ 0x00, 0x00, 0xFF, 0x03, /* 0x20-0x3F: digits 0-9 */
+ 0xFE, 0xFF, 0xFF, 0x87, /* 0x40-0x5F: A to Z, _ */
+ 0xFE, 0xFF, 0xFF, 0x07, /* 0x60-0x7F: a to z */
+ 0x00, 0x00, 0x00, 0x00, /* 0x80-0x9F: none */
+ 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xBF: none */
+ 0xFF, 0xFF, 0x7F, 0xFF, /* 0xC0-0xDF: Latin-1 accented uppercase */
+ 0xFF, 0xFF, 0x7F, 0xFF /* 0xE0-0xFF: Latin-1 accented lowercase */
};
+
#define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1)
/* The bytecode interpreter for the NFA */
diff --git a/testsuite/tests/lib-str/t01.ml b/testsuite/tests/lib-str/t01.ml
index bb266a017..03c85ea40 100644
--- a/testsuite/tests/lib-str/t01.ml
+++ b/testsuite/tests/lib-str/t01.ml
@@ -588,7 +588,7 @@ let automated_test() =
start_test "Search for /\\ba/";
let r = Str.regexp "\\ba" in
let n = 0 in
- test_search_forward r n "abcd"
+ test_search_forward r n "a2cd"
[|"a"|];
test_search_forward r n "the a"
[|"a"|];
@@ -606,7 +606,7 @@ let automated_test() =
let n = 0 in
test_search_forward r n "a"
[|"a"|];
- test_search_forward r n "bcda"
+ test_search_forward r n "bc_a"
[|"a"|];
test_search_forward r n "a foo"
[|"a"|];