PR#4874: interpretation of \b in regexps: characters that constitute a word now include 0-9 and _

git-svn-id: http://caml.inria.fr/svn/ocaml/trunk@10429 f963ae5c-01c2-4b8c-9fe0-0dff7051ff02
author: Xavier Leroy <xavier.leroy@inria.fr> 2010-05-19 12:22:24 +0000
committer: Xavier Leroy <xavier.leroy@inria.fr> 2010-05-19 12:22:24 +0000
commit: d4d232101d144de0b2af78c002e0a13ecd98152e (patch)
tree: 22eb4f0c7e1aa8896c2138c699e73add1b0f5af7
parent: e9f1c5e6dc9bf5a6f78e57ae6f4a94e7e182b648 (diff)
3 files changed, 18 insertions, 4 deletions
diff --git a/Changes b/Changes
index 48edc3aab..ac39b6faa 100644
--- a/Changes
+++ b/Changes
@@ -81,6 +81,11 @@ Standard library:
 * Random: changed the algorithm to produce better randomness.  Now passes the
   DieHard tests.
 
+Other libraries:
+* Str: characters that constitute a word now include digits 0-9 and
+  underscore _.  This changes the interpretation of '\b' (word boundary)
+  in regexps, but is more consistent with other regexp libraries (PR#4874).
+
 Ocamlbuild:
 - Add support for native dynlink.
 
diff --git a/otherlibs/str/strstubs.c b/otherlibs/str/strstubs.c
index ac530ef09..b8d53ff8f 100644
--- a/otherlibs/str/strstubs.c
+++ b/otherlibs/str/strstubs.c
@@ -109,10 +109,19 @@ static void free_backtrack_stack(struct backtrack_stack * stack)
 #define In_bitset(s,i,tmp) (tmp = (i), ((s)[tmp >> 3] >> (tmp & 7)) & 1)
 
 /* Determine if a character is a word constituent */
+/* PR#4874: word constituent = letter, digit, underscore.  Bit (c & 7) of
+   byte (c >> 3) is set iff character code c is a word constituent. */
 static unsigned char re_word_letters[32] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 254, 255, 255, 7, 254, 255, 255, 7,
-  0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 127, 255, 255, 255, 127, 255
+  0x00, 0x00, 0x00, 0x00,       /* 0x00-0x1F: none */
+  0x00, 0x00, 0xFF, 0x03,       /* 0x20-0x3F: digits 0-9 (0x30-0x39) */
+  0xFE, 0xFF, 0xFF, 0x87,       /* 0x40-0x5F: A-Z (0x41-0x5A) and _ (0x5F) */
+  0xFE, 0xFF, 0xFF, 0x07,       /* 0x60-0x7F: a-z (0x61-0x7A) */
+  0x00, 0x00, 0x00, 0x00,       /* 0x80-0x9F: none */
+  0x00, 0x00, 0x00, 0x00,       /* 0xA0-0xBF: none */
+  0xFF, 0xFF, 0x7F, 0xFF,       /* 0xC0-0xDF: Latin-1 letters, all but 0xD7 (multiplication sign) */
+  0xFF, 0xFF, 0x7F, 0xFF        /* 0xE0-0xFF: Latin-1 letters, all but 0xF7 (division sign) */
 };
+
 #define Is_word_letter(c) ((re_word_letters[(c) >> 3] >> ((c) & 7)) & 1)
 
 /* The bytecode interpreter for the NFA */
diff --git a/testsuite/tests/lib-str/t01.ml b/testsuite/tests/lib-str/t01.ml
index bb266a017..03c85ea40 100644
--- a/testsuite/tests/lib-str/t01.ml
+++ b/testsuite/tests/lib-str/t01.ml
@@ -588,7 +588,7 @@ let automated_test() =
   start_test "Search for /\\ba/";
   let r = Str.regexp "\\ba" in
   let n = 0 in
-  test_search_forward r n "abcd"
+  test_search_forward r n "a2cd"
     [|"a"|];
   test_search_forward r n "the a"
     [|"a"|];
@@ -606,7 +606,7 @@ let automated_test() =
   let n = 0 in
   test_search_forward r n "a"
     [|"a"|];
-  test_search_forward r n "bcda"
+  test_search_forward r n "bc_a"
     [|"a"|];
   test_search_forward r n "a foo"
     [|"a"|];
author	Xavier Leroy <xavier.leroy@inria.fr>	2010-05-19 12:22:24 +0000
committer	Xavier Leroy <xavier.leroy@inria.fr>	2010-05-19 12:22:24 +0000
commit	d4d232101d144de0b2af78c002e0a13ecd98152e (patch)
tree	22eb4f0c7e1aa8896c2138c699e73add1b0f5af7
parent	e9f1c5e6dc9bf5a6f78e57ae6f4a94e7e182b648 (diff)