33 files changed, 340 insertions, 680 deletions
diff --git a/asmcomp/amd64/emit.mlp b/asmcomp/amd64/emit.mlp
index b3775362d..674ed2adb 100644
--- a/asmcomp/amd64/emit.mlp
+++ b/asmcomp/amd64/emit.mlp
@@ -514,6 +514,8 @@ let emit_instr fallthrough i =
     | Lop(Iintop(Ilsl | Ilsr | Iasr as op)) ->
         (* We have i.arg.(0) = i.res.(0) and i.arg.(1) = %rcx *)
         `	{emit_string(instr_for_intop op)}	%cl, {emit_reg i.res.(0)}\n`
+    | Lop(Iintop Imulh) ->
+        `	imulq	{emit_reg i.arg.(1)}\n`
     | Lop(Iintop op) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.arg.(1)}, {emit_reg i.res.(0)}\n`
@@ -523,61 +525,6 @@ let emit_instr fallthrough i =
         `	incq	{emit_reg i.res.(0)}\n`
     | Lop(Iintop_imm(Iadd, -1) | Iintop_imm(Isub, 1)) ->
         `	decq	{emit_reg i.res.(0)}\n`
-    | Lop(Iintop_imm(Idiv, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = rcx  (cf. selection.ml) *)
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          `	movq	{emit_reg i.arg.(0)}, %rax\n`;
-          `	addq	${emit_int(n-1)}, {emit_reg i.arg.(0)}\n`;
-          `	testq	%rax, %rax\n`;
-          `	cmovns	%rax, {emit_reg i.arg.(0)}\n`;
-          `	sarq	${emit_int l}, {emit_reg i.res.(0)}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Algorithm:
-                t = multiply-high-signed(arg, m)
-                if m < 0, t = t + m
-                t = shift-right-signed(t, p)
-                res = t + sign-bit(arg) 
-          *)
-          `	movabsq	${emit_nativeint m}, %rdx\n`;
-          `	movq	%rcx, %rax\n`;
-          `	imulq	%rdx\n`; (* rdx = high 64 bits of arg * m, signed *)
-          if m < 0n then
-            `	addq	%rcx, %rdx\n`;
-          if p > 0 then
-            `	sarq	${emit_int p}, %rdx\n`;
-          `	shrq	$63, %rcx\n`;
-          `	addq	%rdx, %rcx\n`
-        end
-    | Lop(Iintop_imm(Imod, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = rcx  (cf. selection.ml) *)
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          `	movq	{emit_reg i.arg.(0)}, %rax\n`;
-          `	testq	%rax, %rax\n`;
-          `	leaq	{emit_int(n-1)}(%rax), %rax\n`; (* flags preserved *)
-          `	cmovns	{emit_reg i.arg.(0)}, %rax\n`;
-          `	andq	${emit_int (-n)}, %rax\n`;
-          `	subq	%rax, {emit_reg i.res.(0)}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Compute quotient as in the Idiv immediate case, but
-             compute it in rdx, preserving rcx *)
-          `	movabsq	${emit_nativeint m}, %rdx\n`;
-          `	movq	%rcx, %rax\n`;
-          `	imulq	%rdx\n`;
-          if m < 0n then
-            `	addq	%rcx, %rdx\n`;
-          if p > 0 then
-            `	sarq	${emit_int p}, %rdx\n`;
-          `	movq	%rcx, %rax\n`;
-          `	shrq	$63, %rax\n`;
-          `	addq	%rax, %rdx\n`;
-          (* Compute remainder via Euclid's equality *)
-          `	imulq	${emit_int n}, %rdx\n`;
-          `	subq	%rdx, %rcx\n`
-        end
     | Lop(Iintop_imm(op, n)) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	${emit_int n}, {emit_reg i.res.(0)}\n`
diff --git a/asmcomp/amd64/emit_nt.mlp b/asmcomp/amd64/emit_nt.mlp
index 468d8334a..cb023bb8c 100644
--- a/asmcomp/amd64/emit_nt.mlp
+++ b/asmcomp/amd64/emit_nt.mlp
@@ -522,6 +522,8 @@ let emit_instr fallthrough i =
     | Lop(Iintop(Ilsl | Ilsr | Iasr as op)) ->
         (* We have i.arg.(0) = i.res.(0) and i.arg.(1) = %rcx *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.res.(0)}, cl\n`
+    | Lop(Iintop Imulh) ->
+        `	imul	{emit_reg i.arg.(1)}\n`
     | Lop(Iintop op) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}\n`
@@ -531,61 +533,6 @@ let emit_instr fallthrough i =
         `	inc	{emit_reg i.res.(0)}\n`
     | Lop(Iintop_imm(Iadd, -1) | Iintop_imm(Isub, 1)) ->
         `	dec	{emit_reg i.res.(0)}\n`
-    | Lop(Iintop_imm(Idiv, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = rcx  (cf. selection.ml) *)
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-        `	mov	rax, {emit_reg i.arg.(0)}\n`;
-        `	add	{emit_reg i.arg.(0)}, {emit_int(n-1)}\n`;
-        `	test	rax, rax\n`;
-        `	cmovns	{emit_reg i.arg.(0)}, rax\n`;
-        `	sar	{emit_reg i.res.(0)}, {emit_int l}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Algorithm:
-                t = multiply-high-signed(arg, m)
-                if m < 0, t = t + m
-                t = shift-right-signed(t, p)
-                res = t + sign-bit(arg) 
-          *)
-          emit_movabs rdx m;
-          `	mov	rax, rcx\n`;
-          `	imul	rdx\n`; (* rdx = high 64 bits of arg * m, signed *)
-          if m < 0n then
-            `	add	rdx, rcx\n`;
-          if p > 0 then
-            `	sar	rdx, {emit_int p}\n`;
-          `	shr	rcx, 63\n`;
-          `	add	rcx, rdx\n`
-        end
-    | Lop(Iintop_imm(Imod, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = rcx  (cf. selection.ml) *)
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-        `	mov	rax, {emit_reg i.arg.(0)}\n`;
-        `	test	rax, rax\n`;
-        `	lea	rax, {emit_int(n-1)}[rax]\n`;
-        `	cmovns	rax, {emit_reg i.arg.(0)}\n`;
-        `	and	rax, {emit_int (-n)}\n`;
-        `	sub	{emit_reg i.res.(0)}, rax\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Compute quotient as in the Idiv immediate case, but
-             compute it in rdx, preserving rcx *)
-          emit_movabs rdx m;
-          `	mov	rax, rcx\n`;
-          `	imul	rdx\n`;
-          if m < 0n then
-            `	add	rdx, rcx\n`;
-          if p > 0 then
-            `	sar	rdx, {emit_int p}\n`;
-          `	mov	rax, rcx\n`;
-          `	shr	rax, 63\n`;
-          `	add	rdx, rax\n`;
-          (* Compute remainder via Euclid's equality *)
-          `	imul	rdx, {emit_int n}\n`;
-          `	sub	rcx, rdx\n`
-        end
     | Lop(Iintop_imm(op, n)) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.res.(0)}, {emit_int n}\n`
diff --git a/asmcomp/amd64/proc.ml b/asmcomp/amd64/proc.ml
index 838affba6..ddec43db7 100644
--- a/asmcomp/amd64/proc.ml
+++ b/asmcomp/amd64/proc.ml
@@ -260,7 +260,7 @@ let destroyed_at_oper = function
   | Iop(Iintop(Idiv | Imod)) | Iop(Iintop_imm((Idiv | Imod), _))
         -> [| rax; rdx |]
   | Iop(Istore(Single, _)) -> [| rxmm15 |]
-  | Iop(Ialloc _ | Iintop(Icomp _) | Iintop_imm((Icomp _), _))
+  | Iop(Ialloc _ | Iintop(Imulh | Icomp _) | Iintop_imm((Icomp _), _))
         -> [| rax |]
   | Iswitch(_, _) -> [| rax; rdx |]
   | _ ->
diff --git a/asmcomp/amd64/reload.ml b/asmcomp/amd64/reload.ml
index 510f201f1..a7cb86028 100644
--- a/asmcomp/amd64/reload.ml
+++ b/asmcomp/amd64/reload.ml
@@ -32,7 +32,8 @@ open Mach
      Istore                             R       R
      Iintop(Icomp)              R       R       S
                             or  S       S       R
-     Iintop(Imul|Idiv|mod)      R       R       S
+     Iintop(Imul|Idiv|Imod)     R       R       S
+     Iintop(Imulh)              R       R       S
      Iintop(shift)              S       S       R
      Iintop(others)             R       R       S
                             or  S       S       R
@@ -71,10 +72,10 @@ method! reload_operation op arg res =
       (* This add will be turned into a lea; args and results must be
          in registers *)
       super#reload_operation op arg res
-  | Iintop(Idiv | Imod | Ilsl | Ilsr | Iasr)
+  | Iintop(Imulh | Idiv | Imod | Ilsl | Ilsr | Iasr)
   | Iintop_imm(_, _) ->
       (* The argument(s) and results can be either in register or on stack *)
-      (* Note: Idiv, Imod: arg(0) and res(0) already forced in regs
+      (* Note: Imulh, Idiv, Imod: arg(0) and res(0) already forced in regs
                Ilsl, Ilsr, Iasr: arg(1) already forced in regs *)
       (arg, res)
   | Iintop(Imul) | Iaddf | Isubf | Imulf | Idivf ->
diff --git a/asmcomp/amd64/selection.ml b/asmcomp/amd64/selection.ml
index 97e0e84de..cce7e575d 100644
--- a/asmcomp/amd64/selection.ml
+++ b/asmcomp/amd64/selection.ml
@@ -91,6 +91,10 @@ let pseudoregs_for_operation op arg res =
      (rax, rbx, rcx or rdx). Keep it simple, just force the argument in rax. *)
   | Ispecific(Ibswap 16) ->
       ([| rax |], [| rax |])
+  (* For imulq, first arg must be in rax, rax is clobbered, and result is in
+     rdx. *)
+  | Iintop(Imulh) ->
+      ([| rax; arg.(1) |], [| rdx |])
   | Ispecific(Ifloatarithmem(_,_)) ->
       let arg' = Array.copy arg in
       arg'.(0) <- res.(0);
@@ -105,10 +109,6 @@ let pseudoregs_for_operation op arg res =
       ([| rax; rcx |], [| rax |])
   | Iintop(Imod) ->
       ([| rax; rcx |], [| rdx |])
-  (* For div and mod with immediate operand, arg must not be in rax nor rdx.
-     Keep it simple, force it in rcx. *)
-  | Iintop_imm((Idiv|Imod), _) ->
-      ([| rcx |], [| rcx |])
   (* Other instructions are regular *)
   | _ -> raise Use_default
 
@@ -176,19 +176,6 @@ method! select_operation op args =
       | (Iindexed2 0, _) -> super#select_operation op args
       | (addr, arg) -> (Ispecific(Ilea addr), [arg])
       end
-  (* Recognize (x / cst) and (x % cst) only if cst is > 0. *)
-  | Cdivi ->
-      begin match args with
-        [arg1; Cconst_int n] when self#is_immediate n && n > 0 ->
-          (Iintop_imm(Idiv, n), [arg1])
-      | _ -> (Iintop Idiv, args)
-      end
-  | Cmodi ->
-      begin match args with
-        [arg1; Cconst_int n] when self#is_immediate n && n > 0 ->
-          (Iintop_imm(Imod, n), [arg1])
-      | _ -> (Iintop Imod, args)
-      end
   (* Recognize float arithmetic with memory. *)
   | Caddf ->
       self#select_floatarith true Iaddf Ifloatadd args
@@ -225,6 +212,9 @@ method! select_operation op args =
   | Cextcall("caml_int64_direct_bswap", _, _, _)
   | Cextcall("caml_nativeint_direct_bswap", _, _, _) ->
       (Ispecific (Ibswap 64), args)
+  (* AMD64 does not support immediate operands for multiply high signed *)
+  | Cmulhi ->
+      (Iintop Imulh, args)
   | _ -> super#select_operation op args
 
 (* Recognize float arithmetic with mem *)
diff --git a/asmcomp/arm/arch.ml b/asmcomp/arm/arch.ml
index bdbac71bf..fbd9f6db0 100644
--- a/asmcomp/arm/arch.ml
+++ b/asmcomp/arm/arch.ml
@@ -110,6 +110,7 @@ type specific_operation =
     Ishiftarith of arith_operation * shift_operation * int
   | Ishiftcheckbound of shift_operation * int
   | Irevsubimm of int
+  | Imulhadd      (* multiply high and add *)
   | Imuladd       (* multiply and add *)
   | Imulsub       (* multiply and subtract *)
   | Inegmulf      (* floating-point negate and multiply *)
@@ -193,6 +194,11 @@ let print_specific_operation printreg op ppf arg =
         printreg arg.(1)
   | Irevsubimm n ->
       fprintf ppf "%i %s %a" n "-" printreg arg.(0)
+  | Imulhadd ->
+      fprintf ppf "%a *h %a) + %a"
+        printreg arg.(0)
+        printreg arg.(1)
+        printreg arg.(2)
   | Imuladd ->
       fprintf ppf "(%a * %a) + %a"
         printreg arg.(0)
diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp
index 4fd0ed2f7..55a8f96b0 100644
--- a/asmcomp/arm/emit.mlp
+++ b/asmcomp/arm/emit.mlp
@@ -174,15 +174,16 @@ let name_for_comparison = function
 
 let name_for_int_operation = function
   (* Use adds,subs,... to enable 16-bit T1 encoding *)
-    Iadd -> "adds"
-  | Isub -> "subs"
-  | Imul -> "mul"
-  | Iand -> "ands"
-  | Ior  -> "orrs"
-  | Ixor -> "eors"
-  | Ilsl -> "lsls"
-  | Ilsr -> "lsrs"
-  | Iasr -> "asrs"
+    Iadd  -> "adds"
+  | Isub  -> "subs"
+  | Imul  -> "mul"
+  | Imulh -> "smmul"
+  | Iand  -> "ands"
+  | Ior   -> "orrs"
+  | Ixor  -> "eors"
+  | Ilsl  -> "lsls"
+  | Ilsr  -> "lsrs"
+  | Iasr  -> "asrs"
   | _ -> assert false
 
 let name_for_shift_operation = function
@@ -593,91 +594,16 @@ let emit_instr i =
         let op = name_for_shift_operation shiftop in
         `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, {emit_string op} #{emit_int n}\n`;
         `	bcs	{emit_label lbl}\n`; 2
+    | Lop(Iintop Imulh) when !arch < ARMv6 ->
+        `	smull	r12, {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
+    | Lop(Ispecific Imulhadd) ->
+        `	smmla	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`; 1
     | Lop(Iintop op) ->
         let instr = name_for_int_operation op in
         `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
-    | Lop(Iintop_imm(Idiv, n)) ->
-        let l = Misc.log2 n in
-        let a = i.arg.(0) in
-        let r = i.res.(0) in
-        if n = 1 lsl l then begin
-          `	movs	{emit_reg r}, {emit_reg a}\n`;
-          if n <= 256 then begin
-            `	it	lt\n`;
-            `	addlt	{emit_reg r}, {emit_reg r}, #{emit_int (n-1)}\n`
-          end else begin
-            `	itt	lt\n`;
-            `	addlt	{emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
-            `	sublt	{emit_reg r}, {emit_reg r}, #1\n`
-          end;
-          (* Use movs to enable 16-bit T1 encoding *)
-          `	movs	{emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 5
-        end else begin
-          assert (!arch >= ARMv6);
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Algorithm:
-                t = multiply-high-signed(arg, m)
-                if m < 0, t = t + arg
-                t = shift-right-signed(t, p)
-                res = t + sign-bit(arg)
-          *)
-          let ninstr = emit_intconst r (Nativeint.to_int32 m) in
-          if m >= 0n then
-            `	smmul	{emit_reg r}, {emit_reg r}, {emit_reg a}\n`
-          else
-            `	smmla	{emit_reg r}, {emit_reg r}, {emit_reg a}, {emit_reg a}\n`;
-          if p > 0 then
-            `	movs	{emit_reg r}, {emit_reg r}, asr #{emit_int p}\n`;
-          `	add	{emit_reg r}, {emit_reg r}, {emit_reg a}, lsr #31\n`;
-          ninstr + 3
-        end
-    | Lop(Iintop_imm(Imod, n)) ->
-        let l = Misc.log2 n in
-        let a = i.arg.(0) in
-        let r = i.res.(0) in
-        if n = 1 lsl l then begin
-          let lbl = new_label() in
-          `	cmp	{emit_reg a}, #0\n`;
-          `	mov	{emit_reg r}, {emit_reg a}, lsl #{emit_int (32-l)}\n`;
-          `	mov	{emit_reg r}, {emit_reg r}, lsr #{emit_int (32-l)}\n`;
-          `	bpl	{emit_label lbl}\n`;
-          `	cmp	{emit_reg r}, #0\n`;
-          `	it	ne\n`;
-          `	subne	{emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
-          `{emit_label lbl}:\n`; 7
-        end else begin
-          assert (!arch >= ARMv6);
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Algorithm:
-                t = multiply-high-signed(arg, m)
-                if m < 0, t = t + arg
-                t = shift-right-signed(t, p)
-                t = t + sign-bit(arg)
-                res = arg - (t * n)
-          *)
-          let r12 = phys_reg 8 in
-          let ninstr = emit_intconst r (Nativeint.to_int32 m) in
-          if m >= 0n then
-            `	smmul	{emit_reg r}, {emit_reg r}, {emit_reg a}\n`
-          else
-            `	smmla	{emit_reg r}, {emit_reg r}, {emit_reg a}, {emit_reg a}\n`;
-          if p > 0 then
-            `	movs	{emit_reg r}, {emit_reg r}, asr #{emit_int p}\n`;
-          `	add	{emit_reg r}, {emit_reg r}, {emit_reg a}, lsr #31\n`;
-          (* Compute remainder via Euclid's equality *)
-          let ninstr = ninstr + emit_intconst r12 (Int32.of_int n) in
-          if !arch >= ARMv6T2 then begin
-            `	mls	{emit_reg r}, {emit_reg r}, r12, {emit_reg a}\n`;
-            ninstr + 4
-          end else begin
-            `	mul	{emit_reg r}, {emit_reg r}, r12\n`;
-            `	sub	{emit_reg r}, {emit_reg a}, {emit_reg r}\n`;
-            ninstr + 5
-          end
-        end
     | Lop(Iintop_imm(op, n)) ->
         let instr = name_for_int_operation op in
-        `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
     | Lop(Iabsf | Inegf as op) when !fpu = Soft ->
         let instr = (match op with
                        Iabsf -> "bic"
diff --git a/asmcomp/arm/proc.ml b/asmcomp/arm/proc.ml
index 48555a5c2..a16c35a22 100644
--- a/asmcomp/arm/proc.ml
+++ b/asmcomp/arm/proc.ml
@@ -201,7 +201,7 @@ let destroyed_at_oper = function
       destroyed_at_alloc
   | Iop(Iconst_symbol _) when !pic_code ->
       [| phys_reg 3; phys_reg 8 |]  (* r3 and r12 destroyed *)
-  | Iop(Iintop_imm(Imod, n)) when !arch >= ARMv6 && n <> 1 lsl Misc.log2 n ->
+  | Iop(Iintop Imulh) when !arch < ARMv6 ->
       [| phys_reg 8 |]              (* r12 destroyed *)
   | Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _)) ->
       [| phys_reg 107 |]            (* d7 (s14-s15) destroyed *)
diff --git a/asmcomp/arm/scheduling.ml b/asmcomp/arm/scheduling.ml
index 9e2d65bc6..0d6618ab9 100644
--- a/asmcomp/arm/scheduling.ml
+++ b/asmcomp/arm/scheduling.ml
@@ -31,8 +31,8 @@ method oper_latency = function
   | Ifloatofint       (* mcr/mrc count as memory access *)
   | Iintoffloat -> 2
   (* Multiplys have a latency of two cycles *)
-  | Iintop Imul
-  | Ispecific(Imuladd | Imulsub) -> 2
+  | Iintop (Imul | Imulh)
+  | Ispecific(Imuladd | Imulsub | Imulhadd) -> 2
   (* VFP instructions *)
   | Iaddf
   | Isubf
@@ -58,10 +58,8 @@ method oper_issue_cycles = function
   | Iintop(Icheckbound)
   | Iintop_imm(Icheckbound, _) -> 2
   | Ispecific(Ishiftcheckbound _) -> 3
-  | Iintop_imm(Idiv, _) -> 4
-  | Iintop_imm(Imod, _) -> 6
-  | Iintop Imul
-  | Ispecific(Imuladd | Imulsub) -> 2
+  | Iintop(Imul | Imulh)
+  | Ispecific(Imuladd | Imulsub | Imulhadd) -> 2
   (* VFP instructions *)
   | Iaddf
   | Isubf -> 7
diff --git a/asmcomp/arm/selection.ml b/asmcomp/arm/selection.ml
index 474f1a391..9cd6090cd 100644
--- a/asmcomp/arm/selection.ml
+++ b/asmcomp/arm/selection.ml
@@ -57,15 +57,11 @@ let pseudoregs_for_operation op arg res =
      is also a result of the mul / mla operation. *)
     Iintop Imul | Ispecific Imuladd when !arch < ARMv6 ->
       (arg, [| res.(0); arg.(0) |])
-  (* For integer division by a constant, which is not a power of 2, on ARMv6
-     and later, the result and argument registers must be different. We deal
-     with this by pretending that the argument value is also a result of the
-     operation. For modulus we also require a scratch register (r12) that
-     is different from both the result and argument registers. *)
-  | Iintop_imm(Idiv, n) when !arch >= ARMv6 && n <> 1 lsl Misc.log2 n->
+  (* For smull rdlo,rdhi,rn,rm (pre-ARMv6) the registers rdlo, rdhi and rn
+     must be different. We deal with this by  pretending that rn is also a
+     result of the smull operation. *)
+  | Iintop Imulh when !arch < ARMv6 ->
       (arg, [| res.(0); arg.(0) |])
-  | Iintop_imm(Imod, n) when !arch >= ARMv6 && n <> 1 lsl Misc.log2 n->
-      (arg, [| res.(0); arg.(0); r12 |])
   (* Soft-float Iabsf and Inegf: arg.(0) and res.(0) must be the same *)
   | Iabsf | Inegf when !fpu = Soft ->
       ([|res.(0); arg.(1)|], res)
@@ -132,8 +128,17 @@ method select_shift_arith op arithop arithrevop args =
       (Ispecific(Ishiftarith(arithrevop, select_shiftop op, n)), [arg2; arg1])
   | args ->
       begin match super#select_operation op args with
+      (* Recognize multiply high and add *)
+        (Iintop Iadd, [Cop(Cmulhi, args); arg3])
+      | (Iintop Iadd, [arg3; Cop(Cmulhi, args)]) as op_args
+        when !arch >= ARMv6 ->
+          begin match self#select_operation Cmulhi args with
+            (Iintop Imulh, [arg1; arg2]) ->
+              (Ispecific Imulhadd, [arg1; arg2; arg3])
+          | _ -> op_args
+          end
       (* Recognize multiply and add *)
-        (Iintop Iadd, [Cop(Cmuli, args); arg3])
+      | (Iintop Iadd, [Cop(Cmuli, args); arg3])
       | (Iintop Iadd, [arg3; Cop(Cmuli, args)]) as op_args ->
           begin match self#select_operation Cmuli args with
             (Iintop Imul, [arg1; arg2]) ->
@@ -179,15 +184,11 @@ method! select_operation op args =
   (* ARM does not support immediate operands for multiplication *)
   | (Cmuli, args) ->
       (Iintop Imul, args)
+  | (Cmulhi, args) ->
+      (Iintop Imulh, args)
   (* Turn integer division/modulus into runtime ABI calls *)
-  | (Cdivi, [arg; Cconst_int n]) when n > 0 && (!arch >= ARMv6
-                                                || n = 1 lsl Misc.log2 n) ->
-      (Iintop_imm(Idiv, n), [arg])
   | (Cdivi, args) ->
       (Iextcall("__aeabi_idiv", false), args)
-  | (Cmodi, [arg; Cconst_int n]) when n > 0 && (!arch >= ARMv6
-                                                || n = 1 lsl Misc.log2 n) ->
-      (Iintop_imm(Imod, n), [arg])
   | (Cmodi, args) ->
       (* See above for fix up of return register *)
       (Iextcall("__aeabi_idivmod", false), args)
diff --git a/asmcomp/arm64/arch.ml b/asmcomp/arm64/arch.ml
index a53251f51..bfbe183fb 100644
--- a/asmcomp/arm64/arch.ml
+++ b/asmcomp/arm64/arch.ml
@@ -143,4 +143,3 @@ let print_specific_operation printreg op ppf arg =
   | Ibswap n ->
       fprintf ppf "bswap%i %a" n
         printreg arg.(0)
-
diff --git a/asmcomp/arm64/emit.mlp b/asmcomp/arm64/emit.mlp
index 207d5b8ef..bc03c5d52 100644
--- a/asmcomp/arm64/emit.mlp
+++ b/asmcomp/arm64/emit.mlp
@@ -454,41 +454,8 @@ let emit_instr i =
     | Lop(Iintop Imod) ->
         `	sdiv	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
         `	msub	{emit_reg i.res.(0)}, {emit_reg reg_tmp1}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}\n`
-    | Lop(Iintop_imm(Idiv, n)) ->
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          `	asr	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, #63\n`;
-          `	add	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, {emit_reg reg_tmp1}, lsr {emit_int (64-l)}\n`;
-          `	asr	{emit_reg i.res.(0)}, {emit_reg reg_tmp1}, {emit_int l}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          emit_intconst reg_tmp1 m;
-          `	smulh	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, {emit_reg reg_tmp1}\n`;
-          if m < 0n then
-            `	add	{emit_reg reg_tmp1}, {emit_reg reg_tmp1}, {emit_reg i.arg.(0)}\n`;
-          if p > 0 then
-            `	asr	{emit_reg reg_tmp1}, #{emit_int p}\n`;
-          `	add	{emit_reg i.res.(0)}, {emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, lsr 63\n`
-        end
-    | Lop(Iintop_imm(Imod, n)) ->
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          `	asr	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, #63\n`;
-          `	add	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, {emit_reg reg_tmp1}, lsr {emit_int (64-l)}\n`;
-          `	asr	{emit_reg reg_tmp1}, {emit_reg reg_tmp1}, {emit_int l}\n`;
-          `	sub	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg reg_tmp1}, lsl {emit_int l}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          emit_intconst reg_tmp1 m;
-          `	smulh	{emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, {emit_reg reg_tmp1}\n`;
-          if m < 0n then
-            `	add	{emit_reg reg_tmp1}, {emit_reg reg_tmp1}, {emit_reg i.arg.(0)}\n`;
-          if p > 0 then
-            `	asr	{emit_reg reg_tmp1}, #{emit_int p}\n`;
-          `	add	{emit_reg reg_tmp1}, {emit_reg reg_tmp1}, {emit_reg i.arg.(0)}, lsr 63\n`;
-          emit_intconst reg_tmp2 (Nativeint.of_int n);
-          `	msub	{emit_reg i.res.(0)}, {emit_reg reg_tmp1}, {emit_reg reg_tmp2}, {emit_reg i.arg.(0)}\n`
-        end
+    | Lop(Iintop Imulh) ->
+        `	smulh	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
     | Lop(Iintop op) ->
         let instr = name_for_int_operation op in
         `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
diff --git a/asmcomp/arm64/selection.ml b/asmcomp/arm64/selection.ml
index 6af158f70..36b401894 100644
--- a/asmcomp/arm64/selection.ml
+++ b/asmcomp/arm64/selection.ml
@@ -184,31 +184,9 @@ method! select_operation op args =
   (* Integer multiplication *)
   (* ARM does not support immediate operands for multiplication *)
   | Cmuli ->
-      begin match args with
-      | [arg; Cconst_int n] | [Cconst_int n; arg] ->
-          let l = Misc.log2 n in
-          if n = 1 lsl l
-          then (Iintop_imm(Ilsl, l), [arg])
-          else (Iintop Imul, args)
-      | _ ->
-          (Iintop Imul, args)
-      end
-  (* Division and modulus *)
-  (* Recognize (x / cst) and (x % cst) only if cst is > 0. *)
-  | Cdivi ->
-      begin match args with
-      | [arg; Cconst_int n] when n > 0 ->
-          ((if n = 1 then Imove else Iintop_imm(Idiv, n)), [arg])
-      | _ ->
-          (Iintop Idiv, args)
-      end
-  | Cmodi ->
-      begin match args with
-      | [arg; Cconst_int n] when n > 0 ->
-          ((if n = 1 then Iconst_int 0n else Iintop_imm(Imod, n)), [arg])
-      | _ ->
-          (Iintop Imod, args)
-      end
+      (Iintop Imul, args)
+  | Cmulhi ->
+      (Iintop Imulh, args)
   (* Bitwise logical operations have a different range of immediate
      operands than the other instructions *)
   | Cand -> self#select_logical Iand args
diff --git a/asmcomp/cmm.ml b/asmcomp/cmm.ml
index fc763c8ae..cdb833896 100644
--- a/asmcomp/cmm.ml
+++ b/asmcomp/cmm.ml
@@ -70,7 +70,7 @@ type operation =
   | Cload of memory_chunk
   | Calloc
   | Cstore of memory_chunk
-  | Caddi | Csubi | Cmuli | Cdivi | Cmodi
+  | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi
   | Cand | Cor | Cxor | Clsl | Clsr | Casr
   | Ccmpi of comparison
   | Cadda | Csuba
diff --git a/asmcomp/cmm.mli b/asmcomp/cmm.mli
index 9a6ebd1e4..2ae9eb658 100644
--- a/asmcomp/cmm.mli
+++ b/asmcomp/cmm.mli
@@ -56,7 +56,7 @@ type operation =
   | Cload of memory_chunk
   | Calloc
   | Cstore of memory_chunk
-  | Caddi | Csubi | Cmuli | Cdivi | Cmodi
+  | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi
   | Cand | Cor | Cxor | Clsl | Clsr | Casr
   | Ccmpi of comparison
   | Cadda | Csuba
diff --git a/asmcomp/cmmgen.ml b/asmcomp/cmmgen.ml
index 6a1deeda1..591822f56 100644
--- a/asmcomp/cmmgen.ml
+++ b/asmcomp/cmmgen.ml
@@ -136,26 +136,6 @@ let mul_int c1 c2 =
   | (c1, c2) ->
       Cop(Cmuli, [c1; c2])
 
-let tag_int = function
-    Cconst_int n -> int_const n
-  | c -> Cop(Caddi, [Cop(Clsl, [c; Cconst_int 1]); Cconst_int 1])
-
-let force_tag_int = function
-    Cconst_int n -> int_const n
-  | c -> Cop(Cor, [Cop(Clsl, [c; Cconst_int 1]); Cconst_int 1])
-
-let untag_int = function
-    Cconst_int n -> Cconst_int(n asr 1)
-  | Cop(Caddi, [Cop(Clsl, [c; Cconst_int 1]); Cconst_int 1]) -> c
-  | Cop(Cor, [Cop(Casr, [c; Cconst_int n]); Cconst_int 1])
-    when n > 0 && n < size_int * 8 ->
-      Cop(Casr, [c; Cconst_int (n+1)])
-  | Cop(Cor, [Cop(Clsr, [c; Cconst_int n]); Cconst_int 1])
-    when n > 0 && n < size_int * 8 ->
-      Cop(Clsr, [c; Cconst_int (n+1)])
-  | Cop(Cor, [c; Cconst_int 1]) -> Cop(Casr, [c; Cconst_int 1])
-  | c -> Cop(Casr, [c; Cconst_int 1])
-
 let lsl_int c1 c2 =
   match (c1, c2) with
     (Cop(Clsl, [c; Cconst_int n1]), Cconst_int n2)
@@ -172,64 +152,251 @@ let ignore_low_bit_int = function
 
 let lsr_int c1 c2 =
   match c2 with
-    (Cconst_int n) when n > 0 ->
-    Cop(Clsr, [ignore_low_bit_int c1; c2])
+    Cconst_int 0 ->
+      c1
+  | Cconst_int n when n > 0 ->
+      Cop(Clsr, [ignore_low_bit_int c1; c2])
   | _ ->
-    Cop(Clsr, [c1; c2])
+      Cop(Clsr, [c1; c2])
 
 let asr_int c1 c2 =
   match c2 with
-    (Cconst_int n) when n > 0 ->
-    Cop(Casr, [ignore_low_bit_int c1; c2])
+    Cconst_int 0 ->
+      c1
+  | Cconst_int n when n > 0 ->
+      Cop(Casr, [ignore_low_bit_int c1; c2])
   | _ ->
-    Cop(Casr, [c1; c2])
+      Cop(Casr, [c1; c2])
 
-(* Division or modulo on tagged integers.  The overflow case min_int / -1
-   cannot occur, but we must guard against division by zero. *)
+let tag_int = function
+    Cconst_int n ->
+      int_const n
+  | Cop(Casr, [c; Cconst_int n]) when n > 0 ->
+      Cop(Cor, [asr_int c (Cconst_int (n - 1)); Cconst_int 1])
+  | c ->
+      incr_int (lsl_int c (Cconst_int 1))
 
-let is_different_from x = function
-    Cconst_int n -> n <> x
-  | Cconst_natint n -> n <> Nativeint.of_int x
-  | _ -> false
+let force_tag_int = function
+    Cconst_int n ->
+      int_const n
+  | Cop(Casr, [c; Cconst_int n]) when n > 0 ->
+      Cop(Cor, [asr_int c (Cconst_int (n - 1)); Cconst_int 1])
+  | c ->
+      Cop(Cor, [lsl_int c (Cconst_int 1); Cconst_int 1])
 
-let safe_divmod op c1 c2 dbg =
-  if !Clflags.fast || is_different_from 0 c2 then
-    Cop(op, [c1; c2])
-  else
-    bind "divisor" c2 (fun c2 ->
-      Cifthenelse(c2,
-                  Cop(op, [c1; c2]),
-                  Cop(Craise (Raise_regular, dbg),
-                      [Cconst_symbol "caml_exn_Division_by_zero"])))
+let untag_int = function
+    Cconst_int n -> Cconst_int(n asr 1)
+  | Cop(Caddi, [Cop(Clsl, [c; Cconst_int 1]); Cconst_int 1]) -> c
+  | Cop(Cor, [Cop(Casr, [c; Cconst_int n]); Cconst_int 1])
+    when n > 0 && n < size_int * 8 ->
+      Cop(Casr, [c; Cconst_int (n+1)])
+  | Cop(Cor, [Cop(Clsr, [c; Cconst_int n]); Cconst_int 1])
+    when n > 0 && n < size_int * 8 ->
+      Cop(Clsr, [c; Cconst_int (n+1)])
+  | Cop(Cor, [c; Cconst_int 1]) -> Cop(Casr, [c; Cconst_int 1])
+  | c -> Cop(Casr, [c; Cconst_int 1])
+
+(* Turning integer divisions into multiply-high then shift.
+   The [division_parameters] function is used in module Emit for
+   those target platforms that support this optimization. *)
+
+(* Unsigned comparison between native integers. *)
+
+let ucompare x y = Nativeint.(compare (add x min_int) (add y min_int))
+
+(* Unsigned division and modulus at type nativeint.
+   Algorithm: Hacker's Delight section 9.3 *)
+
+let udivmod n d = Nativeint.(
+  if d < 0n then
+    if ucompare n d < 0 then (0n, n) else (1n, sub n d)
+  else begin
+    let q = shift_left (div (shift_right_logical n 1) d) 1 in
+    let r = sub n (mul q d) in
+    if ucompare r d >= 0 then (succ q, sub r d) else (q, r)
+  end)
+
+(* Compute division parameters.
+   Algorithm: Hacker's Delight chapter 10, fig 10-1. *)
+
+let divimm_parameters d = Nativeint.(
+  assert (d > 0n);
+  let twopsm1 = min_int in (* 2^31 for 32-bit archs, 2^63 for 64-bit archs *)
+  let nc = sub (pred twopsm1) (snd (udivmod twopsm1 d)) in
+  let rec loop p (q1, r1) (q2, r2) =
+    let p = p + 1 in
+    let q1 = shift_left q1 1 and r1 = shift_left r1 1 in
+    let (q1, r1) =
+      if ucompare r1 nc >= 0 then (succ q1, sub r1 nc) else (q1, r1) in
+    let q2 = shift_left q2 1 and r2 = shift_left r2 1 in
+    let (q2, r2) =
+      if ucompare r2 d >= 0 then (succ q2, sub r2 d) else (q2, r2) in
+    let delta = sub d r2 in
+    if ucompare q1 delta < 0 || (q1 = delta && r1 = 0n)
+    then loop p (q1, r1) (q2, r2)
+    else (succ q2, p - size)
+  in loop (size - 1) (udivmod twopsm1 nc) (udivmod twopsm1 d))
+
+(* The result [(m, p)] of [divimm_parameters d] satisfies the following
+   inequality:
+
+      2^(wordsize + p) < m * d <= 2^(wordsize + p) + 2^(p + 1)    (i)
+
+   from which it follows that
+
+      floor(n / d) = floor(n * m / 2^(wordsize+p))
+                              if 0 <= n < 2^(wordsize-1)
+      ceil(n / d) = floor(n * m / 2^(wordsize+p)) + 1
+                              if -2^(wordsize-1) <= n < 0
+
+   The correctness condition (i) above can be checked by the code below.
+   It was exhaustively tested for values of d from 2 to 10^9 in the
+   wordsize = 64 case.
+
+let add2 (xh, xl) (yh, yl) =
+  let zl = add xl yl and zh = add xh yh in
+  ((if ucompare zl xl < 0 then succ zh else zh), zl)
+
+let shl2 (xh, xl) n =
+  assert (0 < n && n < size + size);
+  if n < size
+  then (logor (shift_left xh n) (shift_right_logical xl (size - n)),
+        shift_left xl n)
+  else (shift_left xl (n - size), 0n)
+
+let mul2 x y =
+  let halfsize = size / 2 in
+  let halfmask = pred (shift_left 1n halfsize) in
+  let xl = logand x halfmask and xh = shift_right_logical x halfsize in
+  let yl = logand y halfmask and yh = shift_right_logical y halfsize in
+  add2 (mul xh yh, 0n)
+    (add2 (shl2 (0n, mul xl yh) halfsize)
+       (add2 (shl2 (0n, mul xh yl) halfsize)
+          (0n, mul xl yl)))
+
+let ucompare2 (xh, xl) (yh, yl) =
+  let c = ucompare xh yh in if c = 0 then ucompare xl yl else c
+
+let validate d m p =
+  let md = mul2 m d in
+  let one2 = (0n, 1n) in
+  let twoszp = shl2 one2 (size + p) in
+  let twop1 = shl2 one2 (p + 1) in
+  ucompare2 twoszp md < 0 && ucompare2 md (add2 twoszp twop1) <= 0
+*)
+
+let rec div_int c1 c2 dbg =
+  match (c1, c2) with
+    (c1, Cconst_int 0) ->
+      Csequence(c1, Cop(Craise (Raise_regular, dbg),
+                        [Cconst_symbol "caml_exn_Division_by_zero"]))
+  | (c1, Cconst_int 1) ->
+      c1
+  | (Cconst_int 0 as c1, c2) ->
+      Csequence(c2, c1)
+  | (Cconst_int n1, Cconst_int n2) ->
+      Cconst_int (n1 / n2)
+  | (c1, Cconst_int n) when n <> min_int ->
+      let l = Misc.log2 n in
+      if n = 1 lsl l then
+        (* Algorithm:
+              t = shift-right-signed(c1, l - 1)
+              t = shift-right(t, W - l)
+              t = c1 + t
+              res = shift-right-signed(c1 + t, l)
+        *)
+        Cop(Casr, [bind "dividend" c1 (fun c1 ->
+                     let t = asr_int c1 (Cconst_int (l - 1)) in
+                     let t = lsr_int t (Cconst_int (Nativeint.size - l)) in
+                     add_int c1 t);
+                   Cconst_int l])
+      else if n < 0 then
+        sub_int (Cconst_int 0) (div_int c1 (Cconst_int (-n)) dbg)
+      else begin
+        let (m, p) = divimm_parameters (Nativeint.of_int n) in
+        (* Algorithm:
+              t = multiply-high-signed(c1, m)
+              if m < 0, t = t + c1
+              if p > 0, t = shift-right-signed(t, p)
+              res = t + sign-bit(c1)
+        *)
+        bind "dividend" c1 (fun c1 ->
+          let t = Cop(Cmulhi, [c1; Cconst_natint m]) in
+          let t = if m < 0n then Cop(Caddi, [t; c1]) else t in
+          let t = if p > 0 then Cop(Casr, [t; Cconst_int p]) else t in
+          add_int t (lsr_int c1 (Cconst_int (Nativeint.size - 1))))
+      end
+  | (c1, c2) when !Clflags.fast ->
+      Cop(Cdivi, [c1; c2])
+  | (c1, c2) ->
+      bind "divisor" c2 (fun c2 ->
+        Cifthenelse(c2,
+                    Cop(Cdivi, [c1; c2]),
+                    Cop(Craise (Raise_regular, dbg),
+                        [Cconst_symbol "caml_exn_Division_by_zero"])))
+
+let mod_int c1 c2 dbg =
+  match (c1, c2) with
+    (c1, Cconst_int 0) ->
+      Csequence(c1, Cop(Craise (Raise_regular, dbg),
+                        [Cconst_symbol "caml_exn_Division_by_zero"]))
+  | (c1, Cconst_int 1) ->
+      c1
+  | (Cconst_int(0 | 1) as c1, c2) ->
+      Csequence(c2, c1)
+  | (Cconst_int n1, Cconst_int n2) ->
+      Cconst_int (n1 mod n2)
+  | (c1, (Cconst_int n as c2)) when n <> min_int ->
+      let l = Misc.log2 n in
+      if n = 1 lsl l then
+        (* Algorithm:
+              t = shift-right-signed(c1, l - 1)
+              t = shift-right(t, W - l)
+              t = c1 + t
+              t = bit-and(t, -n)
+              res = c1 - t
+         *)
+        bind "dividend" c1 (fun c1 ->
+          let t = asr_int c1 (Cconst_int (l - 1)) in
+          let t = lsr_int t (Cconst_int (Nativeint.size - l)) in
+          let t = add_int c1 t in
+          let t = Cop(Cand, [t; Cconst_int (-n)]) in
+          sub_int c1 t)
+      else
+        bind "dividend" c1 (fun c1 ->
+          sub_int c1 (mul_int (div_int c1 c2 dbg) c2))
+  | (c1, c2) when !Clflags.fast ->
+      Cop(Cmodi, [c1; c2])
+  | (c1, c2) ->
+      bind "divisor" c2 (fun c2 ->
+        Cifthenelse(c2,
+                    Cop(Cmodi, [c1; c2]),
+                    Cop(Craise (Raise_regular, dbg),
+                        [Cconst_symbol "caml_exn_Division_by_zero"])))
 
 (* Division or modulo on boxed integers.  The overflow case min_int / -1
    can occur, in which case we force x / -1 = -x and x mod -1 = 0. (PR#5513). *)
 
+let is_different_from x = function
+    Cconst_int n -> n <> x
+  | Cconst_natint n -> n <> Nativeint.of_int x
+  | _ -> false
+
 let safe_divmod_bi mkop mkm1 c1 c2 bi dbg =
   bind "dividend" c1 (fun c1 ->
   bind "divisor" c2 (fun c2 ->
-    let c3 =
-      if Arch.division_crashes_on_overflow
-      && (size_int = 4 || bi <> Pint32)
-      && not (is_different_from (-1) c2)
-      then
-        Cifthenelse(Cop(Ccmpi Cne, [c2; Cconst_int(-1)]), mkop c1 c2, mkm1 c1)
-      else
-        mkop c1 c2 in
-    if !Clflags.fast || is_different_from 0 c2 then
-      c3
-    else
-      Cifthenelse(c2, c3,
-                  Cop(Craise (Raise_regular, dbg),
-                      [Cconst_symbol "caml_exn_Division_by_zero"]))))
+    let c = mkop c1 c2 dbg in
+    if Arch.division_crashes_on_overflow
+    && (size_int = 4 || bi <> Pint32)
+    && not (is_different_from (-1) c2)
+    then Cifthenelse(Cop(Ccmpi Cne, [c2; Cconst_int(-1)]), c, mkm1 c1)
+    else c))
 
 let safe_div_bi =
-  safe_divmod_bi (fun c1 c2 -> Cop(Cdivi, [c1;c2]))
-                 (fun c1 -> Cop(Csubi, [Cconst_int 0; c1]))
+  safe_divmod_bi div_int (fun c1 -> Cop(Csubi, [Cconst_int 0; c1]))
 
 let safe_mod_bi =
-  safe_divmod_bi (fun c1 c2 -> Cop(Cmodi, [c1;c2]))
-                 (fun c1 -> Cconst_int 0)
+  safe_divmod_bi mod_int (fun c1 -> Cconst_int 0)
 
 (* Bool *)
 
@@ -1426,11 +1593,9 @@ and transl_prim_2 p arg1 arg2 dbg =
   | Pmulint ->
       incr_int(mul_int (decr_int(transl arg1)) (untag_int(transl arg2)))
   | Pdivint ->
-      tag_int(safe_divmod Cdivi (untag_int(transl arg1))
-                          (untag_int(transl arg2)) dbg)
+      tag_int(div_int (untag_int(transl arg1)) (untag_int(transl arg2)) dbg)
   | Pmodint ->
-      tag_int(safe_divmod Cmodi (untag_int(transl arg1))
-                          (untag_int(transl arg2)) dbg)
+      tag_int(mod_int (untag_int(transl arg1)) (untag_int(transl arg2)) dbg)
   | Pandint ->
       Cop(Cand, [transl arg1; transl arg2])
   | Porint ->
diff --git a/asmcomp/i386/emit.mlp b/asmcomp/i386/emit.mlp
index a199e9fb5..3c77529ab 100644
--- a/asmcomp/i386/emit.mlp
+++ b/asmcomp/i386/emit.mlp
@@ -615,6 +615,8 @@ let emit_instr fallthrough i =
     | Lop(Iintop(Ilsl | Ilsr | Iasr as op)) ->
         (* We have i.arg.(0) = i.res.(0) and i.arg.(1) = %ecx *)
         `	{emit_string(instr_for_intop op)}	%cl, {emit_reg i.res.(0)}\n`
+    | Lop(Iintop Imulh) ->
+        `	imull	{emit_reg i.arg.(1)}\n`
     | Lop(Iintop op) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.arg.(1)}, {emit_reg i.res.(0)}\n`
@@ -624,62 +626,6 @@ let emit_instr fallthrough i =
         `	incl	{emit_reg i.res.(0)}\n`
     | Lop(Iintop_imm(Iadd, -1) | Iintop_imm(Isub, 1)) ->
         `	decl	{emit_reg i.res.(0)}\n`
-    | Lop(Iintop_imm(Idiv, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = ecx  (cf. selection.ml) *)
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          let lbl = new_label() in
-          output_test_zero i.arg.(0);
-          `	jge	{emit_label lbl}\n`;
-          `	addl	${emit_int(n-1)}, {emit_reg i.arg.(0)}\n`;
-          `{emit_label lbl}:	sarl	${emit_int l}, {emit_reg i.arg.(0)}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Algorithm:
-                t = multiply-high-signed(arg, m)
-                if m < 0, t = t + m
-                t = shift-right-signed(t, p)
-                res = t + sign-bit(arg) 
-          *)
-          `	movl	${emit_nativeint m}, %edx\n`;
-          `	movl	%ecx, %eax\n`;
-          `	imull	%edx\n`; (* edx = high 32 bits of arg * m, signed *)
-          if m < 0n then
-            `	addl	%ecx, %edx\n`;
-          if p > 0 then
-            `	sarl	${emit_int p}, %edx\n`;
-          `	shrl	$31, %ecx\n`;
-          `	addl	%edx, %ecx\n`
-        end
-    | Lop(Iintop_imm(Imod, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = rcx  (cf. selection.ml) *)
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          let lbl = new_label() in
-          `	movl	{emit_reg i.arg.(0)}, %eax\n`;
-          `	testl	%eax, %eax\n`;
-          `	jge	{emit_label lbl}\n`;
-          `	addl	${emit_int(n-1)}, %eax\n`;
-          `{emit_label lbl}:	andl	${emit_int(-n)}, %eax\n`;
-          `	subl	%eax, {emit_reg i.arg.(0)}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Compute quotient as in the Idiv immediate case, but
-             compute it in edx, preserving ecx *)
-          `	movl	${emit_nativeint m}, %edx\n`;
-          `	movl	%ecx, %eax\n`;
-          `	imull	%edx\n`;
-          if m < 0n then
-            `	addl	%ecx, %edx\n`;
-          if p > 0 then
-            `	sarl	${emit_int p}, %edx\n`;
-          `	movl	%ecx, %eax\n`;
-          `	shrl	$31, %eax\n`;
-          `	addl	%eax, %edx\n`;
-          (* Compute remainder via Euclid's equality *)
-          `	imull	${emit_int n}, %edx\n`;
-          `	subl	%edx, %ecx\n`
-        end
     | Lop(Iintop_imm(op, n)) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	${emit_int n}, {emit_reg i.res.(0)}\n`
diff --git a/asmcomp/i386/emit_nt.mlp b/asmcomp/i386/emit_nt.mlp
index 79a127812..145241d95 100644
--- a/asmcomp/i386/emit_nt.mlp
+++ b/asmcomp/i386/emit_nt.mlp
@@ -565,6 +565,8 @@ let emit_instr i =
     | Lop(Iintop(Ilsl | Ilsr | Iasr as op)) ->
         (* We have i.arg.(0) = i.res.(0) and i.arg.(1) = %ecx *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.res.(0)}, cl\n`
+    | Lop(Iintop Imulh) ->
+        `	imul	{emit_reg i.arg.(1)}\n`
     | Lop(Iintop op) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}\n`
@@ -574,61 +576,6 @@ let emit_instr i =
         `	inc	{emit_reg i.res.(0)}\n`
     | Lop(Iintop_imm(Iadd, -1) | Iintop_imm(Isub, 1)) ->
         `	dec	{emit_reg i.res.(0)}\n`
-    | Lop(Iintop_imm(Idiv, n)) ->
-        (* Note: i.arg.(0) = i.res.(0) = ecx  (cf. selection.ml) *)
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          let lbl = new_label() in
-          output_test_zero i.arg.(0);
-          `	jge	{emit_label lbl}\n`;
-          `	add	{emit_reg i.arg.(0)}, {emit_int(n-1)}\n`;
-          `{emit_label lbl}:	sar	{emit_reg i.arg.(0)}, {emit_int l}\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Algorithm:
-                t = multiply-high-signed(arg, m)
-                if m < 0, t = t + m
-                t = shift-right-signed(t, p)
-                res = t + sign-bit(arg) 
-          *)
-          `	mov	edx, {emit_nativeint m}\n`;
-          `	mov	eax, ecx\n`;
-          `	imul	edx\n`; (* edx = high 32 bits of arg * m, signed *)
-          if m < 0n then
-            `	add	edx, ecx\n`;
-          if p > 0 then
-            `	sar	edx, {emit_int p}\n`;
-          `	shr	ecx, 31\n`;
-          `	add	ecx, edx\n`
-        end
-    | Lop(Iintop_imm(Imod, n)) ->
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          let lbl = new_label() in
-          `	mov	eax, {emit_reg i.arg.(0)}\n`;
-          `	test	eax, eax\n`;
-          `	jge	{emit_label lbl}\n`;
-          `	add	eax, {emit_int(n-1)}\n`;
-          `{emit_label lbl}:	and	eax, {emit_int(-n)}\n`;
-          `	sub	{emit_reg i.arg.(0)}, eax\n`
-        end else begin
-          let (m, p) = Selectgen.divimm_parameters (Nativeint.of_int n) in
-          (* Compute quotient as in the Idiv immediate case, but
-             compute it in edx, preserving ecx *)
-          `	mov	edx, {emit_nativeint m}\n`;
-          `	mov	eax, ecx\n`;
-          `	imul	edx\n`; (* edx = high 32 bits of arg * m, signed *)
-          if m < 0n then
-            `	add	edx, ecx\n`;
-          if p > 0 then
-            `	sar	edx, {emit_int p}\n`;
-          `	mov	eax, ecx\n`;
-          `	shr	eax, 31\n`;
-          `	add	edx, eax\n`;
-          (* Compute remainder via Euclid's equality *)
-          `	imul	edx, {emit_int n}\n`;
-          `	sub	ecx, edx\n`
-        end
     | Lop(Iintop_imm(op, n)) ->
         (* We have i.arg.(0) = i.res.(0) *)
         `	{emit_string(instr_for_intop op)}	{emit_reg i.res.(0)}, {emit_int n}\n`
diff --git a/asmcomp/i386/proc.ml b/asmcomp/i386/proc.ml
index eba767c29..d80d18208 100644
--- a/asmcomp/i386/proc.ml
+++ b/asmcomp/i386/proc.ml
@@ -162,9 +162,8 @@ let destroyed_at_c_call =               (* ebx, esi, edi, ebp preserved *)
 let destroyed_at_oper = function
     Iop(Icall_ind | Icall_imm _ | Iextcall(_, true)) -> all_phys_regs
   | Iop(Iextcall(_, false)) -> destroyed_at_c_call
-  | Iop(Iintop(Idiv | Imod))
-  | Iop(Iintop_imm((Idiv | Imod), _)) -> [| eax; edx |]
-  | Iop(Ialloc _) -> [| eax |]
+  | Iop(Iintop(Idiv | Imod)) -> [| eax; edx |]
+  | Iop(Ialloc _ | Iintop Imulh) -> [| eax |]
   | Iop(Iintop(Icomp _) | Iintop_imm(Icomp _, _)) -> [| eax |]
   | Iop(Iintoffloat) -> [| eax |]
   | Iifthenelse(Ifloattest(_, _), _, _) -> [| eax |]
diff --git a/asmcomp/i386/reload.ml b/asmcomp/i386/reload.ml
index 623d12a84..bc1b08f59 100644
--- a/asmcomp/i386/reload.ml
+++ b/asmcomp/i386/reload.ml
@@ -57,9 +57,11 @@ method! reload_operation op arg res =
       if stackp arg.(0)
       then let r = self#makereg arg.(0) in ([|r|], [|r|])
       else (arg, res)
-  | Iintop(Ilsl|Ilsr|Iasr) | Iintop_imm(_, _) | Ifloatofint | Iintoffloat |
-    Ispecific(Ipush) ->
+  | Iintop(Imulh | Ilsl | Ilsr | Iasr) | Iintop_imm(_, _)
+  | Ifloatofint | Iintoffloat | Ispecific(Ipush) ->
       (* The argument(s) can be either in register or on stack *)
+      (* Note: Imulh: arg(0 and res(0) already forced in regs
+               Ilsl, Ilsr, Iasr: arg(1) already forced in regs *)
       (arg, res)
   | _ -> (* Other operations: all args and results in registers *)
       super#reload_operation op arg res
diff --git a/asmcomp/i386/selection.ml b/asmcomp/i386/selection.ml
index ed2c367f0..473499f36 100644
--- a/asmcomp/i386/selection.ml
+++ b/asmcomp/i386/selection.ml
@@ -112,6 +112,10 @@ let pseudoregs_for_operation op arg res =
   (* Two-address unary operations *)
   | Iintop_imm((Iadd|Isub|Imul|Iand|Ior|Ixor|Ilsl|Ilsr|Iasr), _) ->
       (res, res, false)
+  (* For imull, first arg must be in eax, eax is clobbered, and result is in
+     edx. *)
+  | Iintop(Imulh) ->
+      ([| eax; arg.(1) |], [| edx |], true)
   (* For shifts with variable shift count, second arg must be in ecx *)
   | Iintop(Ilsl|Ilsr|Iasr) ->
       ([|res.(0); ecx|], res, false)
@@ -122,10 +126,6 @@ let pseudoregs_for_operation op arg res =
       ([| eax; ecx |], [| eax |], true)
   | Iintop(Imod) ->
       ([| eax; ecx |], [| edx |], true)
-  (* For div and mod with immediate operand, arg must not be in eax nor edx.
-     Keep it simple, force it in ecx. *)
-  | Iintop_imm((Idiv|Imod), _) ->
-      ([| ecx |], [| ecx |], true)
   (* For floating-point operations and floating-point loads,
      the result is always left at the top of the floating-point stack *)
   | Iconst_float _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
@@ -202,19 +202,6 @@ method! select_operation op args =
       | (Iindexed2 0, _) -> super#select_operation op args
       | (addr, arg) -> (Ispecific(Ilea addr), [arg])
       end
-  (* Recognize (x / cst) and (x % cst) only if cst is > 0. *)
-  | Cdivi ->
-      begin match args with
-        [arg1; Cconst_int n] when n > 0 ->
-          (Iintop_imm(Idiv, n), [arg1])
-      | _ -> (Iintop Idiv, args)
-      end
-  | Cmodi ->
-      begin match args with
-        [arg1; Cconst_int n] when n > 0 ->
-          (Iintop_imm(Imod, n), [arg1])
-      | _ -> (Iintop Imod, args)
-      end
   (* Recognize float arithmetic with memory.
      In passing, apply Ershov's algorithm to reduce stack usage *)
   | Caddf ->
@@ -241,6 +228,9 @@ method! select_operation op args =
   | Cextcall(fn, ty_res, false, dbg)
     when !fast_math && List.mem fn inline_float_ops ->
       (Ispecific(Ifloatspecial fn), args)
+  (* i386 does not support immediate operands for multiply high signed *)
+  | Cmulhi ->
+      (Iintop Imulh, args)
   (* Default *)
   | _ -> super#select_operation op args
 
diff --git a/asmcomp/mach.ml b/asmcomp/mach.ml
index 5cddd1676..58d0c1076 100644
--- a/asmcomp/mach.ml
+++ b/asmcomp/mach.ml
@@ -17,7 +17,7 @@ type integer_comparison =
   | Iunsigned of Cmm.comparison
 
 type integer_operation =
-    Iadd | Isub | Imul | Idiv | Imod
+    Iadd | Isub | Imul | Imulh | Idiv | Imod
   | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
   | Icomp of integer_comparison
   | Icheckbound
diff --git a/asmcomp/mach.mli b/asmcomp/mach.mli
index 837fd7d71..03028b2ca 100644
--- a/asmcomp/mach.mli
+++ b/asmcomp/mach.mli
@@ -17,7 +17,7 @@ type integer_comparison =
   | Iunsigned of Cmm.comparison
 
 type integer_operation =
-    Iadd | Isub | Imul | Idiv | Imod
+    Iadd | Isub | Imul | Imulh | Idiv | Imod
   | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
   | Icomp of integer_comparison
   | Icheckbound
diff --git a/asmcomp/power/emit.mlp b/asmcomp/power/emit.mlp
index e669c9005..e82f5ff79 100644
--- a/asmcomp/power/emit.mlp
+++ b/asmcomp/power/emit.mlp
@@ -266,15 +266,16 @@ let name_for_int_comparison = function
 (* Names for various instructions *)
 
 let name_for_intop = function
-    Iadd -> "add"
-  | Imul -> if ppc64 then "mulld" else "mullw"
-  | Idiv -> if ppc64 then "divd" else "divw"
-  | Iand -> "and"
-  | Ior  -> "or"
-  | Ixor -> "xor"
-  | Ilsl -> if ppc64 then "sld" else "slw"
-  | Ilsr -> if ppc64 then "srd" else "srw"
-  | Iasr -> if ppc64 then "srad" else "sraw"
+    Iadd  -> "add"
+  | Imul - > if ppc64 then "mulld" else "mullw"
+  | Imulh -> if ppc64 then "mulhd" else "mulhw"
+  | Idiv  -> if ppc64 then "divd" else "divw"
+  | Iand  -> "and"
+  | Ior   -> "or"
+  | Ixor  -> "xor"
+  | Ilsl  -> if ppc64 then "sld" else "slw"
+  | Ilsr  -> if ppc64 then "srd" else "srw"
+  | Iasr  -> if ppc64 then "srad" else "sraw"
   | _ -> Misc.fatal_error "Emit.Intop"
 
 let name_for_intop_imm = function
@@ -344,8 +345,6 @@ let instr_size = function
   | Lop(Iintop Imod) -> 3
   | Lop(Iintop(Icomp cmp)) -> 4
   | Lop(Iintop op) -> 1
-  | Lop(Iintop_imm(Idiv, n)) -> 2
-  | Lop(Iintop_imm(Imod, n)) -> 4
   | Lop(Iintop_imm(Icomp cmp, n)) -> 4
   | Lop(Iintop_imm(op, n)) -> 1
   | Lop(Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf) -> 1
@@ -605,16 +604,6 @@ let rec emit_instr i dslot =
         `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
     | Lop(Iintop_imm(Isub, n)) ->
         `	addi	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_int(-n)}\n`
-    | Lop(Iintop_imm(Idiv, n)) ->       (* n is guaranteed to be a power of 2 *)
-        let l = Misc.log2 n in
-        `	{emit_string sragi}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_int l}\n`;
-        `	addze	{emit_reg i.res.(0)}, {emit_reg i.res.(0)}\n`
-    | Lop(Iintop_imm(Imod, n)) ->       (* n is guaranteed to be a power of 2 *)
-        let l = Misc.log2 n in
-        `	{emit_string sragi}	{emit_gpr 0}, {emit_reg i.arg.(0)}, {emit_int l}\n`;
-        `	addze	{emit_gpr 0}, {emit_gpr 0}\n`;
-        `	{emit_string slgi}	{emit_gpr 0}, {emit_gpr 0}, {emit_int l}\n`;
-        `	subfc	{emit_reg i.res.(0)}, {emit_gpr 0}, {emit_reg i.arg.(0)}\n`
     | Lop(Iintop_imm(Icomp cmp, n)) ->
         begin match cmp with
           Isigned c ->
diff --git a/asmcomp/power/scheduling.ml b/asmcomp/power/scheduling.ml
index e4a575e0a..6e594f028 100644
--- a/asmcomp/power/scheduling.ml
+++ b/asmcomp/power/scheduling.ml
@@ -26,7 +26,7 @@ method oper_latency = function
   | Iload(_, _) -> 2
   | Iconst_float _ -> 2 (* turned into a load *)
   | Iconst_symbol _ -> 1
-  | Iintop Imul -> 9
+  | Iintop(Imul | Imulh) -> 9
   | Iintop_imm(Imul, _) -> 5
   | Iintop(Idiv | Imod) -> 36
   | Iaddf | Isubf -> 4
@@ -48,8 +48,6 @@ method oper_issue_cycles = function
   | Ialloc _ -> 4
   | Iintop(Imod) -> 40 (* assuming full stall *)
   | Iintop(Icomp _) -> 4
-  | Iintop_imm(Idiv, _) -> 2
-  | Iintop_imm(Imod, _) -> 4
   | Iintop_imm(Icomp _, _) -> 4
   | Ifloatofint -> 9
   | Iintoffloat -> 4
diff --git a/asmcomp/power/selection.ml b/asmcomp/power/selection.ml
index a68c63fcc..86aea05f4 100644
--- a/asmcomp/power/selection.ml
+++ b/asmcomp/power/selection.ml
@@ -61,16 +61,8 @@ method select_addressing chunk exp =
 
 method! select_operation op args =
   match (op, args) with
-  (* Prevent the recognition of (x / cst) and (x % cst) when cst is not
-     a power of 2, which do not correspond to an instruction. *)
-    (Cdivi, [arg; Cconst_int n]) when n = 1 lsl (Misc.log2 n) ->
-      (Iintop_imm(Idiv, n), [arg])
-  | (Cdivi, _) ->
-      (Iintop Idiv, args)
-  | (Cmodi, [arg; Cconst_int n]) when n = 1 lsl (Misc.log2 n) ->
-      (Iintop_imm(Imod, n), [arg])
-  | (Cmodi, _) ->
-      (Iintop Imod, args)
+  (* PowerPC does not support immediate operands for multiply high *)
+    (Cmulhi, _) -> (Iintop Imulh, args)
   (* The and, or and xor instructions have a different range of immediate
      operands than the other instructions *)
   | (Cand, _) -> self#select_logical Iand args
diff --git a/asmcomp/printcmm.ml b/asmcomp/printcmm.ml
index b89937a29..f1c9243a0 100644
--- a/asmcomp/printcmm.ml
+++ b/asmcomp/printcmm.ml
@@ -60,6 +60,7 @@ let operation = function
   | Caddi -> "+"
   | Csubi -> "-"
   | Cmuli -> "*"
+  | Cmulhi -> "*h"
   | Cdivi -> "/"
   | Cmodi -> "mod"
   | Cand -> "and"
diff --git a/asmcomp/printmach.ml b/asmcomp/printmach.ml
index 395272d45..f260c3df7 100644
--- a/asmcomp/printmach.ml
+++ b/asmcomp/printmach.ml
@@ -70,6 +70,7 @@ let intop = function
   | Iadd -> " + "
   | Isub -> " - "
   | Imul -> " * "
+  | Imulh -> " *h "
   | Idiv -> " div "
   | Imod -> " mod "
   | Iand -> " & "
diff --git a/asmcomp/selectgen.ml b/asmcomp/selectgen.ml
index 5e47efe51..8f1277a17 100644
--- a/asmcomp/selectgen.ml
+++ b/asmcomp/selectgen.ml
@@ -33,7 +33,7 @@ let oper_result_type = function
       end
   | Calloc -> typ_addr
   | Cstore c -> typ_void
-  | Caddi | Csubi | Cmuli | Cdivi | Cmodi |
+  | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi |
     Cand | Cor | Cxor | Clsl | Clsr | Casr |
     Ccmpi _ | Ccmpa _ | Ccmpf _ -> typ_int
   | Cadda | Csuba -> typ_addr
@@ -231,19 +231,10 @@ method select_operation op args =
   | (Calloc, _) -> (Ialloc 0, args)
   | (Caddi, _) -> self#select_arith_comm Iadd args
   | (Csubi, _) -> self#select_arith Isub args
-  | (Cmuli, [arg1; Cconst_int n]) ->
-      let l = Misc.log2 n in
-      if n = 1 lsl l
-      then (Iintop_imm(Ilsl, l), [arg1])
-      else self#select_arith_comm Imul args
-  | (Cmuli, [Cconst_int n; arg1]) ->
-      let l = Misc.log2 n in
-      if n = 1 lsl l
-      then (Iintop_imm(Ilsl, l), [arg1])
-      else self#select_arith_comm Imul args
   | (Cmuli, _) -> self#select_arith_comm Imul args
-  | (Cdivi, _) -> self#select_arith Idiv args
-  | (Cmodi, _) -> self#select_arith_comm Imod args
+  | (Cmulhi, _) -> self#select_arith_comm Imulh args
+  | (Cdivi, _) -> (Iintop Idiv, args)
+  | (Cmodi, _) -> (Iintop Imod, args)
   | (Cand, _) -> self#select_arith_comm Iand args
   | (Cor, _) -> self#select_arith_comm Ior args
   | (Cxor, _) -> self#select_arith_comm Ixor args
@@ -835,92 +826,3 @@ let is_tail_call nargs =
 
 let _ =
   Simplif.is_tail_native_heuristic := is_tail_call
-
-(* Turning integer divisions into multiply-high then shift.
-   The [division_parameters] function is used in module Emit for
-   those target platforms that support this optimization. *)
-
-(* Unsigned comparison between native integers. *)
-
-let ucompare x y = Nativeint.(compare (add x min_int) (add y min_int))
-
-(* Unsigned division and modulus at type nativeint.
-   Algorithm: Hacker's Delight section 9.3 *)
-
-let udivmod n d = Nativeint.(
-  if d < 0n then
-    if ucompare n d < 0 then (0n, n) else (1n, sub n d)
-  else begin
-    let q = shift_left (div (shift_right_logical n 1) d) 1 in
-    let r = sub n (mul q d) in
-    if ucompare r d >= 0 then (succ q, sub r d) else (q, r)
-  end)
-
-(* Compute division parameters.  
-   Algorithm: Hacker's Delight chapter 10, fig 10-1. *)
-
-let divimm_parameters d = Nativeint.(
-  assert (d > 0n);
-  let twopsm1 = min_int in (* 2^31 for 32-bit archs, 2^63 for 64-bit archs *)
-  let nc = sub (pred twopsm1) (snd (udivmod twopsm1 d)) in
-  let rec loop p (q1, r1) (q2, r2) =
-    let p = p + 1 in
-    let q1 = shift_left q1 1 and r1 = shift_left r1 1 in
-    let (q1, r1) =
-      if ucompare r1 nc >= 0 then (succ q1, sub r1 nc) else (q1, r1) in
-    let q2 = shift_left q2 1 and r2 = shift_left r2 1 in
-    let (q2, r2) =
-      if ucompare r2 d >= 0 then (succ q2, sub r2 d) else (q2, r2) in
-    let delta = sub d r2 in
-    if ucompare q1 delta < 0 || (q1 = delta && r1 = 0n)
-    then loop p (q1, r1) (q2, r2)
-    else (succ q2, p - size)
-  in loop (size - 1) (udivmod twopsm1 nc) (udivmod twopsm1 d))
-
-(* The result [(m, p)] of [divimm_parameters d] satisfies the following
-   inequality:
-
-      2^(wordsize + p) < m * d <= 2^(wordsize + p) + 2^(p + 1)    (i)
-
-   from which it follows that
-
-      floor(n / d) = floor(n * m / 2^(wordsize+p))
-                              if 0 <= n < 2^(wordsize-1)
-      ceil(n / d) = floor(n * m / 2^(wordsize+p)) + 1
-                              if -2^(wordsize-1) <= n < 0
-
-   The correctness condition (i) above can be checked by the code below.
-   It was exhaustively tested for values of d from 2 to 10^9 in the
-   wordsize = 64 case.
-
-let add2 (xh, xl) (yh, yl) =
-  let zl = add xl yl and zh = add xh yh in
-  ((if ucompare zl xl < 0 then succ zh else zh), zl)
-
-let shl2 (xh, xl) n =
-  assert (0 < n && n < size + size);
-  if n < size
-  then (logor (shift_left xh n) (shift_right_logical xl (size - n)),
-        shift_left xl n)
-  else (shift_left xl (n - size), 0n)
-
-let mul2 x y =
-  let halfsize = size / 2 in
-  let halfmask = pred (shift_left 1n halfsize) in
-  let xl = logand x halfmask and xh = shift_right_logical x halfsize in
-  let yl = logand y halfmask and yh = shift_right_logical y halfsize in
-  add2 (mul xh yh, 0n)
-    (add2 (shl2 (0n, mul xl yh) halfsize)
-       (add2 (shl2 (0n, mul xh yl) halfsize)
-          (0n, mul xl yl)))
-
-let ucompare2 (xh, xl) (yh, yl) =
-  let c = ucompare xh yh in if c = 0 then ucompare xl yl else c
-
-let validate d m p =
-  let md = mul2 m d in
-  let one2 = (0n, 1n) in
-  let twoszp = shl2 one2 (size + p) in
-  let twop1 = shl2 one2 (p + 1) in
-  ucompare2 twoszp md < 0 && ucompare2 md (add2 twoszp twop1) <= 0
-*)
diff --git a/asmcomp/selectgen.mli b/asmcomp/selectgen.mli
index 6fa6b3e1f..11af7c1ff 100644
--- a/asmcomp/selectgen.mli
+++ b/asmcomp/selectgen.mli
@@ -76,5 +76,3 @@ class virtual selector_generic : object
     (Ident.t, Reg.t array) Tbl.t -> Cmm.expression -> Reg.t array option
   method emit_tail : (Ident.t, Reg.t array) Tbl.t -> Cmm.expression -> unit
 end
-
-val divimm_parameters: nativeint -> nativeint * int
diff --git a/asmcomp/sparc/emit.mlp b/asmcomp/sparc/emit.mlp
index 6c7e86da8..1d0699fb4 100644
--- a/asmcomp/sparc/emit.mlp
+++ b/asmcomp/sparc/emit.mlp
@@ -443,36 +443,15 @@ let rec emit_instr i dslot =
         `	sra	{emit_reg i.arg.(0)}, 31, %g1\n`;
         `	wr	%g1, %y\n`;
         `	sdiv	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.res.(0)}\n`
+    | Lop(Iintop Imulh) ->
+        `	smul	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, %g1\n`;
+        `	rd	%y, {emit_reg i.res.(0)}\n`
     | Lop(Iintop op) ->
         let instr = name_for_int_operation op in
         `	{emit_string instr}	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.res.(0)}\n`
     | Lop(Iintop_imm(Ilsl, 1)) ->
         (* UltraSPARC has two add units but only one shifter. *)
         `	add	{emit_reg i.arg.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.res.(0)}\n`
-    | Lop(Iintop_imm(Idiv, n)) ->
-        let l = Misc.log2 n in
-        if n = 1 lsl l then begin
-          let lbl = new_label() in
-          `	cmp	{emit_reg i.arg.(0)}, 0\n`;
-          `	bge	{emit_label lbl}\n`;
-          `	mov	{emit_reg i.arg.(0)}, %g1\n`; (* in delay slot *)
-          `	add	%g1, {emit_int (n-1)}, %g1\n`;
-          `{emit_label lbl}:\n`;
-          `	sra	%g1, {emit_int l}, {emit_reg i.res.(0)}\n`
-        end else begin
-          `	sra	{emit_reg i.arg.(0)}, 31, %g1\n`;
-          `	wr	%g1, %y\n`;
-          `	sdiv	{emit_reg i.arg.(0)}, {emit_int n}, {emit_reg i.res.(0)}\n`
-        end
-    | Lop(Iintop_imm(Imod, n)) ->       (* n is a power of 2 *)
-        let lbl = new_label() in
-        `	tst	{emit_reg i.arg.(0)}\n`;
-        `	bge	{emit_label lbl}\n`;
-        `	andcc	{emit_reg i.arg.(0)}, {emit_int (n-1)}, {emit_reg i.res.(0)}\n`; (* in delay slot *)
-        `	be	{emit_label lbl}\n`;
-        `	nop\n`;
-        `	sub	{emit_reg i.res.(0)}, {emit_int n}, {emit_reg i.res.(0)}\n`;
-        `{emit_label lbl}:\n`
     | Lop(Iintop_imm(Icomp cmp, n)) ->
         `	cmp	{emit_reg i.arg.(0)}, {emit_int n}\n`;
         if !arch_version = SPARC_V9 then begin
@@ -496,6 +475,9 @@ let rec emit_instr i dslot =
           `	bleu	{emit_label !range_check_trap}\n`;
           `	nop\n`                  (* delay slot *)
         end
+    | Lop(Iintop_imm(Imulh, n)) ->
+        `	smul	{emit_reg i.arg.(0)}, {emit_int n}, %g1\n`;
+        `	rd	%y, {emit_reg i.res.(0)}\n`
     | Lop(Iintop_imm(op, n)) ->
         let instr = name_for_int_operation op in
         `	{emit_string instr}	{emit_reg i.arg.(0)}, {emit_int n}, {emit_reg i.res.(0)}\n`
@@ -618,7 +600,7 @@ and fill_delay_slot = function
    that does not branch. *)
 
 let is_one_instr_op = function
-    Idiv | Imod | Icomp _ | Icheckbound -> false
+    Imulh | Idiv | Imod | Icomp _ | Icheckbound -> false
   | _ -> true
 
 let is_one_instr i =
diff --git a/asmcomp/sparc/scheduling.ml b/asmcomp/sparc/scheduling.ml
index 048880abd..497722bbc 100644
--- a/asmcomp/sparc/scheduling.ml
+++ b/asmcomp/sparc/scheduling.ml
@@ -47,8 +47,6 @@ method oper_issue_cycles = function
   | Ialloc _ -> 6
   | Iintop(Icomp _) -> 4
   | Iintop(Icheckbound) -> 2
-  | Iintop_imm(Idiv, _) -> 5
-  | Iintop_imm(Imod, _) -> 5
   | Iintop_imm(Icomp _, _) -> 4
   | Iintop_imm(Icheckbound, _) -> 2
   | Inegf -> 2
diff --git a/asmcomp/sparc/selection.ml b/asmcomp/sparc/selection.ml
index 055b78f19..d938c1eff 100644
--- a/asmcomp/sparc/selection.ml
+++ b/asmcomp/sparc/selection.ml
@@ -38,23 +38,13 @@ method select_addressing chunk = function
 method! select_operation op args =
   match (op, args) with
   (* For SPARC V7 multiplication, division and modulus are turned into
-     calls to C library routines, except if the dividend is a power of 2.
+     calls to C library routines.
      For SPARC V8 and V9, use hardware multiplication and division,
      but C library routine for modulus. *)
-    (Cmuli, [arg; Cconst_int n]) when n = 1 lsl (Misc.log2 n) ->
-      (Iintop_imm(Ilsl, Misc.log2 n), [arg])
-  | (Cmuli, [Cconst_int n; arg]) when n = 1 lsl (Misc.log2 n) ->
-      (Iintop_imm(Ilsl, Misc.log2 n), [arg])
-  | (Cmuli, _) when !arch_version = SPARC_V7 ->
+    (Cmuli, _) when !arch_version = SPARC_V7 ->
       (Iextcall(".umul", false), args)
-  | (Cdivi, [arg; Cconst_int n])
-    when self#is_immediate n && n = 1 lsl (Misc.log2 n) ->
-      (Iintop_imm(Idiv, n), [arg])
   | (Cdivi, _) when !arch_version = SPARC_V7 ->
       (Iextcall(".div", false), args)
-  | (Cmodi, [arg; Cconst_int n])
-    when self#is_immediate n && n = 1 lsl (Misc.log2 n) ->
-      (Iintop_imm(Imod, n), [arg])
   | (Cmodi, _) ->
       (Iextcall(".rem", false), args)
   | _ ->