7 files changed, 297 insertions, 21 deletions
diff --git a/Changes b/Changes
index 005126cfc..5f8320e30 100644
--- a/Changes
+++ b/Changes
@@ -11,12 +11,15 @@ Language features:
 
 Compilers:
 - Experimental native code generator for AArch64 (ARM 64 bits)
+- Optimization of integer division and modulus by constant divisors
+  (feature wish PR#6042)
 - PR#6182: better message for virtual objects and class types
   (Leo P. White, Stephen Dolan)
 - PR#5817: new flag to keep locations in cmi files
 - PR#5854: issue warning 3 when referring to a value marked with
   the [@@deprecated] attribute
 
+
 Bug fixes:
 - PR#4719: Sys.executable_name wrong if executable name contains dots (Windows)
 - PR#4855: 'camlp4 -I +dir' accepted, dir is relative to 'camlp4 -where'
diff --git a/asmcomp/amd64/emit.mlp b/asmcomp/amd64/emit.mlp
index 8dad2206a..9f5836dd6 100644
--- a/asmcomp/amd64/emit.mlp
+++ b/asmcomp/amd64/emit.mlp
@@ -524,21 +524,60 @@ let emit_instr fallthrough i =
     | Lop(Iintop_imm(Iadd, -1) | Iintop_imm(Isub, 1)) ->
         `	decq	{emit_reg i.res.(0)}\n`
     | Lop(Iintop_imm(Idiv, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = rdx  (cf. selection.ml) *)
+        (* Note: i.arg.(0) = i.res.(0) = rcx  (cf. selection.ml) *)
         let l = Misc.log2 n in
-        `	movq	{emit_reg i.arg.(0)}, %rax\n`;
-        `	addq	${emit_int(n-1)}, {emit_reg i.arg.(0)}\n`;
-        `	testq	%rax, %rax\n`;
-        `	cmovns	%rax, {emit_reg i.arg.(0)}\n`;
-        `	sarq	${emit_int l}, {emit_reg i.res.(0)}\n`
+        if n = 1 lsl l then begin
+          `	movq	{emit_reg i.arg.(0)}, %rax\n`;
+          `	addq	${emit_int(n-1)}, {emit_reg i.arg.(0)}\n`;
+          `	testq	%rax, %rax\n`;
+          `	cmovns	%rax, {emit_reg i.arg.(0)}\n`;
+          `	sarq	${emit_int l}, {emit_reg i.res.(0)}\n`
+        end else begin
+          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
+          (* Algorithm:
+                t = multiply-high-signed(arg, m)
+                if m < 0, t = t + m
+                t = shift-right-signed(t, p)
+                res = t + sign-bit(arg) 
+          *)
+          `	movabsq	${emit_nativeint m}, %rdx\n`;
+          `	movq	%rcx, %rax\n`;
+          `	imulq	%rdx\n`; (* rdx = high 64 bits of arg * m, signed *)
+          if m < 0n then
+            `	addq	%rcx, %rdx\n`;
+          if p > 0 then
+            `	sarq	${emit_int p}, %rdx\n`;
+          `	shrq	$63, %rcx\n`;
+          `	addq	%rdx, %rcx\n`
+        end
     | Lop(Iintop_imm(Imod, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = rdx  (cf. selection.ml) *)
-        `	movq	{emit_reg i.arg.(0)}, %rax\n`;
-        `	testq	%rax, %rax\n`;
-        `	leaq	{emit_int(n-1)}(%rax), %rax\n`;
-        `	cmovns	{emit_reg i.arg.(0)}, %rax\n`;
-        `	andq	${emit_int (-n)}, %rax\n`;
-        `	subq	%rax, {emit_reg i.res.(0)}\n`
+        (* Note: i.arg.(0) = i.res.(0) = rcx  (cf. selection.ml) *)
+        let l = Misc.log2 n in
+        if n = 1 lsl l then begin
+          `	movq	{emit_reg i.arg.(0)}, %rax\n`;
+          `	testq	%rax, %rax\n`;
+          `	leaq	{emit_int(n-1)}(%rax), %rax\n`; (* flags preserved *)
+          `	cmovns	{emit_reg i.arg.(0)}, %rax\n`;
+          `	andq	${emit_int (-n)}, %rax\n`;
+          `	subq	%rax, {emit_reg i.res.(0)}\n`
+        end else begin
+          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
+          (* Compute quotient as in the Idiv immediate case, but
+             compute it in rdx, preserving rcx *)
+          `	movabsq	${emit_nativeint m}, %rdx\n`;
+          `	movq	%rcx, %rax\n`;
+          `	imulq	%rdx\n`;
+          if m < 0n then
+            `	addq	%rcx, %rdx\n`;
+          if p > 0 then
+            `	sarq	${emit_int p}, %rdx\n`;
+          `	movq	%rcx, %rax\n`;
+          `	shrq	$63, %rax\n`;
+          `	addq	%rax, %rdx\n`;
+          (* Compute remainder via Euclid's equality *)
+          `	imulq	${emit_int n}, %rdx\n`;
+          `	subq	%rdx, %rcx\n`
+        end
     | Lop(Iintop_imm(op, n)) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	${emit_int n}, {emit_reg i.res.(0)}\n`
diff --git a/asmcomp/amd64/selection.ml b/asmcomp/amd64/selection.ml
index 4de841287..97e0e84de 100644
--- a/asmcomp/amd64/selection.ml
+++ b/asmcomp/amd64/selection.ml
@@ -105,10 +105,10 @@ let pseudoregs_for_operation op arg res =
       ([| rax; rcx |], [| rax |])
   | Iintop(Imod) ->
       ([| rax; rcx |], [| rdx |])
-  (* For div and mod with immediate operand, arg must not be in rax.
-     Keep it simple, force it in rdx. *)
+  (* For div and mod with immediate operand, arg must not be in rax nor rdx.
+     Keep it simple, force it in rcx. *)
   | Iintop_imm((Idiv|Imod), _) ->
-      ([| rdx |], [| rdx |])
+      ([| rcx |], [| rcx |])
   (* Other instructions are regular *)
   | _ -> raise Use_default
 
@@ -176,18 +176,16 @@ method! select_operation op args =
       | (Iindexed2 0, _) -> super#select_operation op args
       | (addr, arg) -> (Ispecific(Ilea addr), [arg])
       end
-  (* Recognize (x / cst) and (x % cst) only if cst is a power of 2. *)
+  (* Recognize (x / cst) and (x % cst) only if cst is > 0. *)
   | Cdivi ->
       begin match args with
-        [arg1; Cconst_int n] when self#is_immediate n
-                               && n = 1 lsl (Misc.log2 n) ->
+        [arg1; Cconst_int n] when self#is_immediate n && n > 0 ->
           (Iintop_imm(Idiv, n), [arg1])
       | _ -> (Iintop Idiv, args)
       end
   | Cmodi ->
       begin match args with
-        [arg1; Cconst_int n] when self#is_immediate n
-                               && n = 1 lsl (Misc.log2 n) ->
+        [arg1; Cconst_int n] when self#is_immediate n && n > 0 ->
           (Iintop_imm(Imod, n), [arg1])
       | _ -> (Iintop Imod, args)
       end
diff --git a/asmcomp/selectgen.ml b/asmcomp/selectgen.ml
index 1d2bf96d2..fc2e7821d 100644
--- a/asmcomp/selectgen.ml
+++ b/asmcomp/selectgen.ml
@@ -835,3 +835,92 @@ let is_tail_call nargs =
 
 let _ =
   Simplif.is_tail_native_heuristic := is_tail_call
+
+(* Turning integer divisions into multiply-high then shift.
+   The [division_parameters] function is used in module Emit for
+   those target platforms that support this optimization. *)
+
+(* Unsigned comparison between native integers. *)
+
+let ucompare x y = Nativeint.(compare (add x min_int) (add y min_int))
+
+(* Unsigned division and modulus at type nativeint.
+   Algorithm: Hacker's Delight section 9.3 *)
+
+let udivmod n d = Nativeint.(
+  if d < 0n then
+    if ucompare n d < 0 then (0n, n) else (1n, sub n d)
+  else begin
+    let q = shift_left (div (shift_right_logical n 1) d) 1 in
+    let r = sub n (mul q d) in
+    if ucompare r d >= 0 then (succ q, sub r d) else (q, r)
+  end)
+
+(* Compute division parameters.  
+   Algorithm: Hacker's Delight chapter 10, fig 10-1. *)
+
+let divimm_parameters d = Nativeint.(
+  assert (d > 0n);
+  let twopsm1 = min_int in (* 2^31 for 32-bit archs, 2^63 for 64-bit archs *)
+  let nc = sub (pred twopsm1) (snd (udivmod twopsm1 d)) in
+  let rec loop p (q1, r1) (q2, r2) =
+    let p = p + 1 in
+    let q1 = shift_left q1 1 and r1 = shift_left r1 1 in
+    let (q1, r1) =
+      if ucompare r1 nc >= 0 then (succ q1, sub r1 nc) else (q1, r1) in
+    let q2 = shift_left q2 1 and r2 = shift_left r2 1 in
+    let (q2, r2) =
+      if ucompare r2 d >= 0 then (succ q2, sub r2 d) else (q2, r2) in
+    let delta = sub d r2 in
+    if ucompare q1 delta < 0 || (q1 = delta && r1 = 0n)
+    then loop p (q1, r1) (q2, r2)
+    else (succ q2, p - size)
+  in loop (size - 1) (udivmod twopsm1 nc) (udivmod twopsm1 d))
+
+(* The result [(m, p)] of [divimm_parameters d] satisfies the following
+   inequality:
+
+      2^(wordsize + p) < m * d <= 2^(wordsize + p) + 2^(p + 1)    (i)
+
+   from which it follows that
+
+      floor(n / d) = floor(n * m / 2^(wordsize+p))
+                              if 0 <= n < 2^(wordsize-1)
+      ceil(n / d) = floor(n * m / 2^(wordsize+p)) + 1
+                              if -2^(wordsize-1) <= n < 0
+
+   The correctness condition (i) above can be checked by the code below.
+   It was exhaustively tested for values of d from 2 to 10^9 in the
+   wordsize = 64 case.
+
+let add2 (xh, xl) (yh, yl) =
+  let zl = add xl yl and zh = add xh yh in
+  ((if ucompare zl xl < 0 then succ zh else zh), zl)
+
+let shl2 (xh, xl) n =
+  assert (0 < n && n < size + size);
+  if n < size
+  then (logor (shift_left xh n) (shift_right_logical xl (size - n)),
+        shift_left xl n)
+  else (shift_left xl (n - size), 0n)
+
+let mul2 x y =
+  let halfsize = size / 2 in
+  let halfmask = pred (shift_left 1n halfsize) in
+  let xl = logand x halfmask and xh = shift_right_logical x halfsize in
+  let yl = logand y halfmask and yh = shift_right_logical y halfsize in
+  add2 (mul xh yh, 0n)
+    (add2 (shl2 (0n, mul xl yh) halfsize)
+       (add2 (shl2 (0n, mul xh yl) halfsize)
+          (0n, mul xl yl)))
+
+let ucompare2 (xh, xl) (yh, yl) =
+  let c = ucompare xh yh in if c = 0 then ucompare xl yl else c
+
+let validate d m p =
+  let md = mul2 m d in
+  let one2 = (0n, 1n) in
+  let twoszp = shl2 one2 (size + p) in
+  let twop1 = shl2 one2 (p + 1) in
+  ucompare2 twoszp md < 0 && ucompare2 md (add2 twoszp twop1) <= 0
+*)
diff --git a/asmcomp/selectgen.mli b/asmcomp/selectgen.mli
index 11af7c1ff..6fa6b3e1f 100644
--- a/asmcomp/selectgen.mli
+++ b/asmcomp/selectgen.mli
@@ -76,3 +76,5 @@ class virtual selector_generic : object
     (Ident.t, Reg.t array) Tbl.t -> Cmm.expression -> Reg.t array option
   method emit_tail : (Ident.t, Reg.t array) Tbl.t -> Cmm.expression -> unit
 end
+
+val divimm_parameters: nativeint -> nativeint * int
diff --git a/testsuite/tests/basic/divint.ml b/testsuite/tests/basic/divint.ml
new file mode 100644
index 000000000..6dd4be3fc
--- /dev/null
+++ b/testsuite/tests/basic/divint.ml
@@ -0,0 +1,112 @@
+(***********************************************************************)
+(*                                                                     *)
+(*                                OCaml                                *)
+(*                                                                     *)
+(*            Xavier Leroy, projet Gallium, INRIA Rocquencourt         *)
+(*                                                                     *)
+(*  Copyright 2013 Institut National de Recherche en Informatique et   *)
+(*  en Automatique.  All rights reserved.  This file is distributed    *)
+(*  under the terms of the Q Public License version 1.0.               *)
+(*                                                                     *)
+(***********************************************************************)
+
+open Printf
+
+(* Test integer division and modulus, esp. ocamlopt's optimization
+   when the divisor is a constant. *)
+
+let error = ref false
+
+module WithInt = struct
+
+let d = ref 0
+let divref n = n / !d
+let modref n = n mod !d
+
+let test_one (df: int -> int) (mf: int -> int) x =
+  if not (df x = divref x && mf x = modref x) then begin
+    printf "mismatch for %d\n" x;
+    error := true
+  end
+
+let do_test divisor (df: int -> int) (mf: int -> int) =
+  d := divisor;
+  List.iter (test_one df mf)
+    [0; 1; 2; 3; 4; 5; 6; 7; 8; 9; 10; 
+     100; 1000; 10000; 100000; 1000000; max_int - 1; max_int;
+     -1; -2; -3; -4; -5; -6; -7; -8; -9; -10;
+     -100; -1000; -10000; -100000; -1000000; min_int + 1; min_int];
+  let seed = ref 0 in
+  for i = 1 to 1000 do
+    seed := !seed * 69069 + 25173;
+    test_one df mf !seed
+  done
+
+end
+
+module WithNat = struct
+
+let d = ref 0n
+let divref n = Nativeint.div n !d
+let modref n = Nativeint.rem n !d
+
+let test_one (df: nativeint -> nativeint) (mf: nativeint -> nativeint) x =
+  if not (df x = divref x && mf x = modref x) then begin
+    printf "mismatch for %nd\n" x;
+    error := true
+  end
+
+let do_test divisor (df: nativeint -> nativeint) (mf: nativeint -> nativeint) =
+  d := Nativeint.of_int divisor;
+  List.iter (test_one df mf)
+    [0n; 1n; 2n; 3n; 4n; 5n; 6n; 7n; 8n; 9n; 10n; 
+     100n; 1000n; 10000n; 100000n; 1000000n;
+     Nativeint.(pred max_int); Nativeint.max_int;
+     -1n; -2n; -3n; -4n; -5n; -6n; -7n; -8n; -9n; -10n;
+     -100n; -1000n; -10000n; -100000n; -1000000n;
+     Nativeint.(succ min_int); Nativeint.min_int];
+  let seed = ref 0n in
+  for i = 1 to 1000 do
+    seed := Nativeint.(add (mul !seed 69069n) 25173n);
+    test_one df mf !seed
+  done
+
+end
+
+let _ =
+  printf "2 int\n"; WithInt.do_test 2 (fun x -> x / 2)(fun x -> x mod 2);
+  printf "3 int\n"; WithInt.do_test 3 (fun x -> x / 3)(fun x -> x mod 3);
+  printf "4 int\n"; WithInt.do_test 4 (fun x -> x / 4)(fun x -> x mod 4);
+  printf "5 int\n"; WithInt.do_test 5 (fun x -> x / 5)(fun x -> x mod 5);
+  printf "6 int\n"; WithInt.do_test 6 (fun x -> x / 6)(fun x -> x mod 6);
+  printf "7 int\n"; WithInt.do_test 7 (fun x -> x / 7)(fun x -> x mod 7);
+  printf "9 int\n"; WithInt.do_test 9 (fun x -> x / 9)(fun x -> x mod 9);
+  printf "10 int\n"; WithInt.do_test 10 (fun x -> x / 10)(fun x -> x mod 10);
+  printf "11 int\n"; WithInt.do_test 11 (fun x -> x / 11)(fun x -> x mod 11);
+  printf "12 int\n"; WithInt.do_test 12 (fun x -> x / 12)(fun x -> x mod 12);
+  printf "25 int\n"; WithInt.do_test 25 (fun x -> x / 25)(fun x -> x mod 25);
+  printf "55 int\n"; WithInt.do_test 55 (fun x -> x / 55)(fun x -> x mod 55);
+  printf "125 int\n"; WithInt.do_test 125 (fun x -> x / 125)(fun x -> x mod 125);
+  printf "625 int\n"; WithInt.do_test 625 (fun x -> x / 625)(fun x -> x mod 625);
+  printf "-2 int\n"; WithInt.do_test (-2) (fun x -> x / (-2))(fun x -> x mod (-2));
+  printf "-3 int\n"; WithInt.do_test (-3) (fun x -> x / (-3))(fun x -> x mod (-3));
+
+  printf "2 nat\n"; WithNat.do_test 2 (fun x -> Nativeint.div x 2n)(fun x -> Nativeint.rem x 2n);
+  printf "3 nat\n"; WithNat.do_test 3 (fun x -> Nativeint.div x 3n)(fun x -> Nativeint.rem x 3n);
+  printf "4 nat\n"; WithNat.do_test 4 (fun x -> Nativeint.div x 4n)(fun x -> Nativeint.rem x 4n);
+  printf "5 nat\n"; WithNat.do_test 5 (fun x -> Nativeint.div x 5n)(fun x -> Nativeint.rem x 5n);
+  printf "6 nat\n"; WithNat.do_test 6 (fun x -> Nativeint.div x 6n)(fun x -> Nativeint.rem x 6n);
+  printf "7 nat\n"; WithNat.do_test 7 (fun x -> Nativeint.div x 7n)(fun x -> Nativeint.rem x 7n);
+  printf "9 nat\n"; WithNat.do_test 9 (fun x -> Nativeint.div x 9n)(fun x -> Nativeint.rem x 9n);
+  printf "10 nat\n"; WithNat.do_test 10 (fun x -> Nativeint.div x 10n)(fun x -> Nativeint.rem x 10n);
+  printf "11 nat\n"; WithNat.do_test 11 (fun x -> Nativeint.div x 11n)(fun x -> Nativeint.rem x 11n);
+  printf "12 nat\n"; WithNat.do_test 12 (fun x -> Nativeint.div x 12n)(fun x -> Nativeint.rem x 12n);
+  printf "25 nat\n"; WithNat.do_test 25 (fun x -> Nativeint.div x 25n)(fun x -> Nativeint.rem x 25n);
+  printf "55 nat\n"; WithNat.do_test 55 (fun x -> Nativeint.div x 55n)(fun x -> Nativeint.rem x 55n);
+  printf "125 nat\n"; WithNat.do_test 125 (fun x -> Nativeint.div x 125n)(fun x -> Nativeint.rem x 125n);
+  printf "625 nat\n"; WithNat.do_test 625 (fun x -> Nativeint.div x 625n)(fun x -> Nativeint.rem x 625n);
+  printf "-2 nat\n"; WithNat.do_test (-2) (fun x -> Nativeint.div x (-2n))(fun x -> Nativeint.rem x (-2n));
+  printf "-3 nat\n"; WithNat.do_test (-3) (fun x -> Nativeint.div x (-3n))(fun x -> Nativeint.rem x (-3n));
+
+  if !error then printf "TEST FAILED.\n" else printf "Test passed.\n"
+
diff --git a/testsuite/tests/basic/divint.reference b/testsuite/tests/basic/divint.reference
new file mode 100644
index 000000000..4aa1e2110
--- /dev/null
+++ b/testsuite/tests/basic/divint.reference
@@ -0,0 +1,33 @@
+2 int
+3 int
+4 int
+5 int
+6 int
+7 int
+9 int
+10 int
+11 int
+12 int
+25 int
+55 int
+125 int
+625 int
+-2 int
+-3 int
+2 nat
+3 nat
+4 nat
+5 nat
+6 nat
+7 nat
+9 nat
+10 nat
+11 nat
+12 nat
+25 nat
+55 nat
+125 nat
+625 nat
+-2 nat
+-3 nat
+Test passed.