4 files changed, 170 insertions, 195 deletions
diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp
index 586d477bd..1e0c91526 100644
--- a/asmcomp/arm/emit.mlp
+++ b/asmcomp/arm/emit.mlp
@@ -42,39 +42,31 @@ let emit_symbol s =
 
 let emit_reg r =
   match r.loc with
-    Reg r -> emit_string (register_name r)
+  | Reg r -> emit_string (register_name r)
   | _ -> fatal_error "Emit_arm.emit_reg"
 
-(* Output the next register after the given pseudo-register *)
-
-let emit_next_reg r =
-  match r.loc with
-    Reg r -> emit_string (register_name(r + 1))
-  | _ -> fatal_error "Emit_arm.emit_next_reg"
-
 (* Layout of the stack frame *)
 
 let stack_offset = ref 0
 
 let frame_size () =
-  !stack_offset +
-  4 * num_stack_slots.(0) + 8 * num_stack_slots.(1) +
-  (if !contains_calls then 4 else 0)
+  let sz =
+    !stack_offset +
+    4 * num_stack_slots.(0) +
+    (if !contains_calls then 4 else 0)
+  in Misc.align sz 8
 
 let slot_offset loc cl =
   match loc with
     Incoming n -> frame_size() + n
-  | Local n ->
-      if cl = 0
-      then !stack_offset + num_stack_slots.(1) * 8 + n * 4
-      else !stack_offset + n * 8
+  | Local n -> !stack_offset + n * 4
   | Outgoing n -> n
 
 (* Output a stack reference *)
 
 let emit_stack r =
   match r.loc with
-    Stack s ->
+  | Stack s ->
       let ofs = slot_offset s (register_class r) in `[sp, #{emit_int ofs}]`
   | _ -> fatal_error "Emit_arm.emit_stack"
 
@@ -158,17 +150,6 @@ let name_for_shift_int_operation = function
   | Ishiftsub -> "sub"
   | Ishiftsubrev -> "rsb"
 
-let name_for_float_operation = function
-    Inegf -> "mnfd"
-  | Iabsf -> "absd"
-  | Iaddf -> "adfd"
-  | Isubf -> "sufd"
-  | Imulf -> "mufd"
-  | Idivf -> "dvfd"
-  | Ifloatofint -> "fltd"
-  | Iintoffloat -> "fixz"
-  | _ -> assert false
-
 (* Recognize immediate operands *)
 
 (* Immediate operands are 8-bit immediate values, zero-extended, and rotated
@@ -239,8 +220,6 @@ let symbol_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t)
 let float_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t)
 (* Total space (in word) occupied by pending literals *)
 let num_literals = ref 0
-(* True if we've at least one pending float literal *)
-let pending_float = ref false
 
 (* Label a symbol or float constant *)
 let label_constant tbl s size =
@@ -265,8 +244,7 @@ let emit_constants () =
     float_constants;
   Hashtbl.clear symbol_constants;
   Hashtbl.clear float_constants;
-  num_literals := 0;
-  pending_float := false
+  num_literals := 0
 
 (* Output the assembly code for an instruction *)
 
@@ -279,19 +257,10 @@ let emit_instr i =
           match (src, dst) with
             {loc = Reg rs; typ = Int|Addr}, {loc = Reg rd; typ = Int|Addr} ->
               `	mov	{emit_reg dst}, {emit_reg src}\n`; 1
-          | {loc = Reg rs; typ = Float}, {loc = Reg rd; typ = Float} ->
-              `	mvfd	{emit_reg dst}, {emit_reg src}\n`; 1
-          | {loc = Reg rs; typ = Float}, {loc = Reg rd; typ = Int|Addr} ->
-              `	stfd	{emit_reg src}, [sp, #-8]!\n`;
-              `	ldmfd	sp!, \{{emit_reg dst}, {emit_next_reg dst}}\n`; 2
           | {loc = Reg rs; typ = Int|Addr}, {loc = Stack sd} ->
               `	str	{emit_reg src}, {emit_stack dst}\n`; 1
-          | {loc = Reg rs; typ = Float}, {loc = Stack sd} ->
-              `	stfd	{emit_reg src}, {emit_stack dst}\n`; 1
           | {loc = Stack ss; typ = Int|Addr}, {loc = Reg rd} ->
               `	ldr	{emit_reg dst}, {emit_stack src}\n`; 1
-          | {loc = Stack ss; typ = Float}, {loc = Reg rd} ->
-              `	ldfd	{emit_reg dst}, {emit_stack src}\n`; 1
           | _ ->
               assert false
         end
@@ -305,15 +274,19 @@ let emit_instr i =
         end else
           emit_complex_intconst r n
     | Lop(Iconst_float s) ->
-        begin match Int64.bits_of_float (float_of_string s) with
-        | 0x0000_0000_0000_0000L ->       (* +0.0 *)
-          `	mvfd	{emit_reg i.res.(0)}, #0.0\n`
-        | _ ->
+        let bits = Int64.bits_of_float (float_of_string s) in
+        let high_bits = Int64.to_nativeint (Int64.shift_right_logical bits 32)
+        and low_bits = Int64.to_nativeint bits in
+        if is_immediate low_bits && is_immediate high_bits then begin
+          `	mov	{emit_reg i.res.(0)}, #{emit_nativeint low_bits} @ {emit_string s}\n`;
+          `	mov	{emit_reg i.res.(1)}, #{emit_nativeint high_bits}\n`;
+          2
+        end else begin      
           let lbl = label_constant float_constants s 2 in
-          pending_float := true;
-          `	ldfd	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string s}\n`
-        end;
-        1
+          `	ldr	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string s}\n`;
+          `	ldr	{emit_reg i.res.(1)}, {emit_label lbl} + 4\n`;
+          2
+        end
     | Lop(Iconst_symbol s) ->
         let lbl = label_constant symbol_constants s 1 in
         `	ldr	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_symbol s}\n`; 1
@@ -326,8 +299,9 @@ let emit_instr i =
         let n = frame_size() in
         if !contains_calls then
           `	ldr	lr, [sp, #{emit_int (n-4)}]\n`;
-        ignore (emit_stack_adjustment "add" n);
-        `	mov	pc, {emit_reg i.arg.(0)}\n`; 3
+        let ninstr = emit_stack_adjustment "add" n in
+        `	mov	pc, {emit_reg i.arg.(0)}\n`;
+        2 + ninstr
     | Lop(Itailcall_imm s) ->
         if s = !function_name then begin
           `	b	{emit_label !tailrec_entry_point}\n`; 1
@@ -335,28 +309,35 @@ let emit_instr i =
           let n = frame_size() in
           if !contains_calls then
             `	ldr	lr, [sp, #{emit_int (n-4)}]\n`;
-          ignore (emit_stack_adjustment "add" n);
-          `	b	{emit_symbol s}\n`; 3
+          let ninstr = emit_stack_adjustment "add" n in
+          `	b	{emit_symbol s}\n`;
+          2 + ninstr
         end
     | Lop(Iextcall(s, alloc)) ->
         if alloc then begin
           let lbl = label_constant symbol_constants s 1 in
           `	ldr	r10, {emit_label lbl} @ {emit_symbol s}\n`;
-          `{record_frame i.live}        bl      caml_c_call\n`; 2
+          `{record_frame i.live}	bl	caml_c_call\n`; 2
         end else begin
           `	bl	{emit_symbol s}\n`; 1
         end
     | Lop(Istackoffset n) ->
+        assert (n mod 8 = 0);
         let ninstr =
           if n >= 0
           then emit_stack_adjustment "sub" n
           else emit_stack_adjustment "add" (-n) in
         stack_offset := !stack_offset + n;
         ninstr
-    | Lop(Iload(Single, addr)) ->
-        let r = i.res.(0) in
-        `	ldfs	{emit_reg	r}, {emit_addressing addr i.arg 0}\n`;
-        `	mvfd	{emit_reg	r}, {emit_reg r}\n`;
+    | Lop(Iload((Double | Double_u), addr)) ->
+        let addr' = offset_addressing addr 4 in
+        if i.res.(0).loc <> i.arg.(0).loc then begin
+          `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`;
+          `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`
+        end else begin
+          `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`;
+          `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
+        end;
         2
     | Lop(Iload(size, addr)) ->
         let r = i.res.(0) in
@@ -366,14 +347,13 @@ let emit_instr i =
           | Byte_signed -> "ldrsb"
           | Sixteen_unsigned -> "ldrh"
           | Sixteen_signed -> "ldrsh"
-          | Double | Double_u -> "ldfd"
-          | _ (* Word | Thirtytwo_signed | Thirtytwo_unsigned *) -> "ldr" in
+          | _ (* 32-bit quantities *) -> "ldr" in
         `	{emit_string	instr}     {emit_reg r}, {emit_addressing addr i.arg 0}\n`;
         1
-    | Lop(Istore(Single, addr)) ->
-        let r = i.arg.(0) in
-        `	mvfs	f7,	{emit_reg r}\n`;
-        `	stfs	f7,	{emit_addressing addr i.arg 1}\n`;
+    | Lop(Istore((Double | Double_u), addr)) ->
+        let addr' = offset_addressing addr 4 in
+        `	str	{emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`;
+        `	str	{emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`;
         2
     | Lop(Istore(size, addr)) ->
         let r = i.arg.(0) in
@@ -381,8 +361,7 @@ let emit_instr i =
           match size with
             Byte_unsigned | Byte_signed -> "strb"
           | Sixteen_unsigned | Sixteen_signed -> "strh"
-          | Double | Double_u -> "stfd"
-          | _ (* Word | Thirtytwo_signed | Thirtytwo_unsigned *) -> "str" in
+          | _ (* 32-bit quantities *) -> "str" in
         `	{emit_string	instr}     {emit_reg r}, {emit_addressing addr i.arg 1}\n`;
         1
     | Lop(Ialloc n) ->
@@ -390,11 +369,11 @@ let emit_instr i =
           `	ldr	r10, [alloc_limit, #0]\n`;
           let ni = emit_alloc_decrement n in
           `	cmp	alloc_ptr, r10\n`;
-          `{record_frame i.live}        blcc    caml_call_gc\n`;
+          `{record_frame i.live}	blcc    caml_call_gc\n`;
           `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
           4 + ni
         end else if n = 8 || n = 12 || n = 16 then begin
-            `{record_frame i.live}      bl      caml_alloc{emit_int ((n-4)/4)}\n`;
+            `{record_frame i.live}	bl	caml_alloc{emit_int ((n-4)/4)}\n`;
             `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`; 2
         end else begin
             let nn = Nativeint.of_int n in
@@ -403,7 +382,7 @@ let emit_instr i =
                 `	mov	r10, #{emit_int n}\n`; 1
               end else
                 emit_complex_intconst (phys_reg 8 (*r10*)) nn in
-            `{record_frame i.live}      bl      caml_allocN\n`;
+            `{record_frame i.live}	bl	caml_allocN\n`;
             `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
             2 + ni
         end
@@ -458,12 +437,12 @@ let emit_instr i =
     | Lop(Iintop_imm(op, n)) ->
         let instr = name_for_int_operation op in
         `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
-    | Lop(Inegf | Iabsf | Ifloatofint | Iintoffloat as op) ->
-        let instr = name_for_float_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`; 1
-    | Lop(Iaddf | Isubf | Imulf | Idivf as op) ->
-        let instr = name_for_float_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
+    | Lop(Inegf) -> (* argument and result in (r0, r1) *)
+        `	eor     r1, r1, #0x80000000\n`; 1
+    | Lop(Iabsf) -> (* argument and result in (r0, r1) *)
+        `	bic     r1, r1, #0x80000000\n`; 1
+    | Lop(Ifloatofint | Iintoffloat | Iaddf | Isubf | Imulf | Idivf) ->
+        assert false
     | Lop(Ispecific(Ishiftarith(op, shift))) ->
         let instr = name_for_shift_int_operation op in
         `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`;
@@ -480,9 +459,9 @@ let emit_instr i =
         let n = frame_size() in
         `	ldr	lr, [sp, #{emit_int(n-4)}]\n`; 1
     | Lreturn ->
-        let n = frame_size() in
-        ignore(emit_stack_adjustment "add" n);
-        `	mov	pc, lr\n`; 2
+        let ninstr = emit_stack_adjustment "add" (frame_size()) in
+        `	mov	pc, lr\n`;
+        ninstr + 1
     | Llabel lbl ->
         `{emit_label lbl}:\n`; 0
     | Lbranch lbl ->
@@ -498,20 +477,13 @@ let emit_instr i =
         | Iinttest cmp ->
             `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
             let comp = name_for_comparison cmp in
-            `	b{emit_string	comp}     {emit_label lbl}\n`
+            `	b{emit_string comp}	{emit_label lbl}\n`
         | Iinttest_imm(cmp, n) ->
             `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
             let comp = name_for_comparison cmp in
-            `	b{emit_string	comp}     {emit_label lbl}\n`
+            `	b{emit_string comp}	{emit_label lbl}\n`
         | Ifloattest(cmp, neg) ->
-            begin match cmp with
-              Ceq | Cne ->
-                `	cmf	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
-            | _ ->
-                `	cmfe	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
-            end;
-            let comp = name_for_float_comparison cmp neg in
-            `	b{emit_string	comp}     {emit_label lbl}\n`
+            assert false
         | Ioddtest ->
             `	tst	{emit_reg i.arg.(0)}, #1\n`;
             `	bne	{emit_label lbl}\n`
@@ -569,8 +541,8 @@ let rec emit_all ninstr i =
   if i.desc = Lend then () else begin
     let n = emit_instr i in
     let ninstr' = ninstr + n in
-    let limit = (if !pending_float then 127 else 511) - !num_literals in
-    if ninstr' >= limit - 32 && no_fallthrough i.desc then begin
+    let limit = 511 - !num_literals in
+    if ninstr' >= limit - 64 && no_fallthrough i.desc then begin
       emit_constants();
       emit_all 0 i.next
     end else
@@ -594,7 +566,7 @@ let fundecl fundecl =
   Hashtbl.clear symbol_constants;
   Hashtbl.clear float_constants;
   `	.text\n`;
-  `	.align	0\n`;
+  `	.align	2\n`;
   `	.global	{emit_symbol fundecl.fun_name}\n`;
   `{emit_symbol fundecl.fun_name}:\n`;
   let n = frame_size() in
@@ -632,7 +604,7 @@ let emit_item = function
   | Clabel_address lbl ->
       `	.word	{emit_label (10000 + lbl)}\n`
   | Cstring s ->
-      emit_string_directive "   .ascii  " s
+      emit_string_directive "	.ascii  " s
   | Cskip n ->
       if n > 0 then `	.space	{emit_int n}\n`
   | Calign n ->
diff --git a/asmcomp/arm/proc.ml b/asmcomp/arm/proc.ml
index 5e8318ba9..0916a8323 100644
--- a/asmcomp/arm/proc.ml
+++ b/asmcomp/arm/proc.ml
@@ -36,33 +36,21 @@ let word_addressed = false
     r13                         stack pointer
     r14                         return address
     r15                         program counter
-
-    f0 - f6                     general purpose (f4 - f6 preserved by C)
-    f7                          temporary
 *)
 
 let int_reg_name = [|
   "r0"; "r1"; "r2"; "r3"; "r4"; "r5"; "r6"; "r7"; "r10"; "r12"
 |]
 
-let float_reg_name = [|
-  "f0"; "f1"; "f2"; "f3"; "f4"; "f5"; "f6"
-|]
-
-let num_register_classes = 2
+let num_register_classes = 1
 
-let register_class r =
-  match r.typ with
-    Int -> 0
-  | Addr -> 0
-  | Float -> 1
+let register_class r = assert (r.typ <> Float); 0
 
-let num_available_registers = [| 10; 7 |]
+let num_available_registers = [| 10 |]
 
-let first_available_register = [| 0; 100 |]
+let first_available_register = [| 0 |]
 
-let register_name r =
-  if r < 100 then int_reg_name.(r) else float_reg_name.(r - 100)
+let register_name r = int_reg_name.(r)
 
 let rotate_registers = true
 
@@ -73,27 +61,22 @@ let hard_int_reg =
   for i = 0 to 9 do v.(i) <- Reg.at_location Int (Reg i) done;
   v
 
-let hard_float_reg =
-  let v = Array.create 7 Reg.dummy in
-  for i = 0 to 6 do v.(i) <- Reg.at_location Float (Reg(100 + i)) done;
-  v
-
-let all_phys_regs =
-  Array.append hard_int_reg hard_float_reg
+let all_phys_regs = hard_int_reg
 
-let phys_reg n =
-  if n < 100 then hard_int_reg.(n) else hard_float_reg.(n - 100)
+let phys_reg n = all_phys_regs.(n)
 
 let stack_slot slot ty =
+  assert (ty <> Float);
   Reg.at_location ty (Stack slot)
 
 (* Calling conventions *)
 
-let calling_conventions first_int last_int first_float last_float
-                        make_stack arg =
+(* XXX float types have already been expanded into pairs of integers.
+   So we cannot align these floats.  See if that causes a problem. *)
+
+let calling_conventions first_int last_int make_stack arg =
   let loc = Array.create (Array.length arg) Reg.dummy in
   let int = ref first_int in
-  let float = ref first_float in
   let ofs = ref 0 in
   for i = 0 to Array.length arg - 1 do
     match arg.(i).typ with
@@ -106,64 +89,32 @@ let calling_conventions first_int last_int first_float last_float
           ofs := !ofs + size_int
         end
     | Float ->
-        if !float <= last_float then begin
-          loc.(i) <- phys_reg !float;
-          incr float
-        end else begin
-          loc.(i) <- stack_slot (make_stack !ofs) Float;
-          ofs := !ofs + size_float
-        end
+        assert false
   done;
-  (loc, !ofs)
+  (loc, Misc.align !ofs 8)
 
 let incoming ofs = Incoming ofs
 let outgoing ofs = Outgoing ofs
 let not_supported ofs = fatal_error "Proc.loc_results: cannot call"
 
 let loc_arguments arg =
-  calling_conventions 0 7 100 103 outgoing arg
+  calling_conventions 0 7 outgoing arg
 let loc_parameters arg =
-  let (loc, ofs) = calling_conventions 0 7 100 103 incoming arg in loc
+  let (loc, ofs) = calling_conventions 0 7 incoming arg in loc
 let loc_results res =
-  let (loc, ofs) = calling_conventions 0 7 100 103 not_supported res in loc
-
-(* Calling conventions for C are as for Caml, except that float arguments
-   are passed in pairs of integer registers. *)
+  let (loc, ofs) = calling_conventions 0 7 not_supported res in loc
 
 let loc_external_arguments arg =
-  let loc = Array.create (Array.length arg) Reg.dummy in
-  let reg = ref 0 in
-  let ofs = ref 0 in
-  for i = 0 to Array.length arg - 1 do
-    match arg.(i).typ with
-      Int | Addr as ty ->
-        if !reg <= 3 then begin
-          loc.(i) <- phys_reg !reg;
-          incr reg
-        end else begin
-          loc.(i) <- stack_slot (outgoing !ofs) ty;
-          ofs := !ofs + size_int
-        end
-    | Float ->
-        if !reg <= 2 then begin
-          loc.(i) <- phys_reg !reg;
-          reg := !reg + 2
-        end else begin
-          loc.(i) <- stack_slot (outgoing !ofs) Float;
-          ofs := !ofs + size_float
-        end
-  done;
-  (loc, !ofs)
-
+  calling_conventions 0 3 outgoing arg
 let loc_external_results res =
-  let (loc, ofs) = calling_conventions 0 0 100 100 not_supported res in loc
+  let (loc, ofs) = calling_conventions 0 1 not_supported res in loc
 
 let loc_exn_bucket = phys_reg 0
 
 (* Registers destroyed by operations *)
 
-let destroyed_at_c_call =               (* r4-r9, f4-f6 preserved *)
-  Array.of_list(List.map phys_reg [0;1;2;3;8;9; 100;101;102;103])
+let destroyed_at_c_call =               (* r4-r9 preserved *)
+  Array.of_list(List.map phys_reg [0;1;2;3;8;9])
 
 let destroyed_at_oper = function
     Iop(Icall_ind | Icall_imm _ | Iextcall(_, true)) -> all_phys_regs
@@ -177,14 +128,14 @@ let destroyed_at_raise = all_phys_regs
 
 let safe_register_pressure = function
     Iextcall(_, _) -> 4
-  | _ -> 7
+  | _ -> 10
 let max_register_pressure = function
-    Iextcall(_, _) -> [| 4; 4 |]
-  | _ -> [| 10; 7 |]
+    Iextcall(_, _) -> [| 4 |]
+  | _ -> [| 10 |]
 
 (* Layout of the stack *)
 
-let num_stack_slots = [| 0; 0 |]
+let num_stack_slots = [| 0 |]
 let contains_calls = ref false
 
 (* Calling the assembler *)
diff --git a/asmcomp/arm/selection.ml b/asmcomp/arm/selection.ml
index e34093acb..3fdbd640a 100644
--- a/asmcomp/arm/selection.ml
+++ b/asmcomp/arm/selection.ml
@@ -18,6 +18,7 @@ open Misc
 open Cmm
 open Reg
 open Arch
+open Proc
 open Mach
 
 (* Immediate operands are 8-bit immediate values, zero-extended, and rotated
@@ -39,11 +40,32 @@ let is_offset n = n < 256 && n > -256
 
 let is_intconst = function Cconst_int n -> true | _ -> false
 
+(* Soft emulation of float comparisons *)
+
+let float_comparison_function = function
+  | Ceq -> "__eqdf2"
+  | Cne -> "__nedf2"
+  | Clt -> "__ltdf2"
+  | Cle -> "__ledf2"
+  | Cgt -> "__gtdf2"
+  | Cge -> "__gedf2"
+
 (* Instruction selection *)
 class selector = object(self)
 
 inherit Selectgen.selector_generic as super
 
+method regs_for tyv =
+  (* Expand floats into pairs of integer registers *)
+  let nty = Array.length tyv in
+  let rec expand i =
+    if i >= nty then [] else begin
+      match tyv.(i) with
+      | Float -> Int :: Int :: expand (i+1)
+      | ty -> ty :: expand (i+1)
+    end in
+  Reg.createv (Array.of_list (expand 0))
+
 method is_immediate n =
   n land 0xFF = n || is_immed n 2
 
@@ -114,17 +136,64 @@ method select_operation op args =
       | _ ->
         super#select_operation op args
       end
+  (* Turn floating-point operations into library function calls *)
+  | Caddf -> (Iextcall("__adddf3", false), args)
+  | Csubf -> (Iextcall("__subdf3", false), args)
+  | Cmulf -> (Iextcall("__muldf3", false), args)
+  | Cdivf -> (Iextcall("__divdf3", false), args)
+  | Cfloatofint -> (Iextcall("__floatsidf", false), args)
+  | Cintoffloat -> (Iextcall("__fixdfsi", false), args)
+  | Ccmpf comp ->
+      (Iintop_imm(Icomp(Isigned comp), 0),
+       [Cop(Cextcall(float_comparison_function comp,
+                     typ_int, false, Debuginfo.none),
+            args)])
+  (* Add coercions around loads and stores of 32-bit floats *)
+  | Cload Single ->
+      (Iextcall("__extendsdfd2", false), [Cop(Cload Word, args)])
+  | Cstore Single ->
+      begin match args with
+      | [arg1; arg2] ->
+          let arg2' =
+            Cop(Cextcall("__truncdfsd2", typ_int, false, Debuginfo.none),
+                [arg2]) in
+          self#select_operation (Cstore Word) [arg1; arg2']
+      | _ -> assert false
+      end
+  (* Other operations are regular *)
   | _ -> super#select_operation op args
 
-(* In mul rd, rm, rs,  the registers rm and rd must be different.
+method select_condition = function
+  | Cop(Ccmpf cmp, args) ->
+      (Iinttest_imm(Isigned cmp, 0),
+       Cop(Cextcall(float_comparison_function cmp, 
+                    typ_int, false, Debuginfo.none),
+           args))
+  | expr ->
+      super#select_condition expr
+
+(* Deal with some register irregularities:
+
+1- In mul rd, rm, rs,  the registers rm and rd must be different.
    We deal with this by pretending that rm is also a result of the mul
-   operation. *)
+   operation.
+
+2- For Inegf and Iabsf, force arguments and results in (r0, r1);
+   this simplifies code generation later.
+*)
 
 method insert_op_debug op dbg rs rd =
-  if op = Iintop(Imul) then begin
-    self#insert_debug (Iop op) dbg rs [| rd.(0); rs.(0) |]; rd
-  end else
-    super#insert_op_debug op dbg rs rd
+  match op with
+  | Iintop(Imul) ->
+      self#insert_debug (Iop op) dbg rs [| rd.(0); rs.(0) |]; rd
+  | Iabsf | Inegf ->
+      let r = [| phys_reg 0; phys_reg 1 |] in
+      self#insert_moves rs r;
+      self#insert_debug (Iop op) dbg r r;
+      self#insert_moves r rd;
+      rd
+  | _ ->
+      super#insert_op_debug op dbg rs rd
 
 end
 
diff --git a/asmrun/arm.S b/asmrun/arm.S
index 98fdfcfe3..674c99de6 100644
--- a/asmrun/arm.S
+++ b/asmrun/arm.S
@@ -18,9 +18,6 @@
 trap_ptr        .req    r11
 alloc_ptr       .req    r8
 alloc_limit     .req    r9
-sp              .req    r13
-lr              .req    r14
-pc              .req    r15
 
         .text
 
@@ -106,15 +103,11 @@ caml_allocN:
         ldr     r10, .Lcaml_bottom_of_stack
         str     sp, [r10, #0]
     /* Save integer registers and return address on stack */
+        sub     sp, sp, #4     /* preserve 8-alignment */
         stmfd   sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r10,r12,lr}
     /* Store pointer to saved integer registers in caml_gc_regs */
         ldr     r10, .Lcaml_gc_regs
         str     sp, [r10, #0]
-    /* Save non-callee-save float registers */
-        stfd    f0, [sp, #-8]!
-        stfd    f1, [sp, #-8]!
-        stfd    f2, [sp, #-8]!
-        stfd    f3, [sp, #-8]!
     /* Save current allocation pointer for debugging purposes */
         ldr     r10, .Lcaml_young_ptr
         str     alloc_ptr, [r10, #0]
@@ -124,10 +117,6 @@ caml_allocN:
     /* Call the garbage collector */
         bl      caml_garbage_collection
     /* Restore the registers from the stack */
-        ldfd    f4, [sp], #8
-        ldfd    f5, [sp], #8
-        ldfd    f6, [sp], #8
-        ldfd    f7, [sp], #8
         ldmfd   sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r10,r12}
     /* Reload return address */
         ldr     r10, .Lcaml_last_return_address
@@ -140,7 +129,9 @@ caml_allocN:
         ldr     alloc_ptr, [r10, #0]
         ldr     alloc_limit, .Lcaml_young_limit
     /* Return to caller */
-        ldmfd   sp!, {pc}
+        ldr     r10, [sp, #0]
+        add     sp, sp, #8
+        mov     pc, r10
 
 /* Call a C function from Caml */
 /* Function to call is in r10 */
@@ -182,13 +173,9 @@ caml_start_program:
 
 .Ljump_to_caml:
     /* Save return address and callee-save registers */
-        stmfd   sp!, {r4,r5,r6,r7,r8,r9,r11,lr}
-        stfd    f7, [sp, #-8]!
-        stfd    f6, [sp, #-8]!
-        stfd    f5, [sp, #-8]!
-        stfd    f4, [sp, #-8]!
+        stmfd   sp!, {r4,r5,r6,r7,r8,r9,r11,lr} /* 8-alignment */
     /* Setup a callback link on the stack */
-        sub     sp, sp, #4*3
+        sub     sp, sp, #4*4                    /* 8-alignment */
         ldr     r4, .Lcaml_bottom_of_stack
         ldr     r4, [r4, #0]
         str     r4, [sp, #0]
@@ -234,15 +221,11 @@ caml_start_program:
         ldr     r4, .Lcaml_gc_regs
         ldr     r5, [sp, #8]
         str     r5, [r4, #0]
-        add     sp, sp, #4*3
+        add     sp, sp, #4*4
     /* Update allocation pointer */
         ldr     r4, .Lcaml_young_ptr
         str     alloc_ptr, [r4, #0]
     /* Reload callee-save registers and return */
-        ldfd    f4, [sp], #8
-        ldfd    f5, [sp], #8
-        ldfd    f6, [sp], #8
-        ldfd    f7, [sp], #8
         ldmfd   sp!, {r4,r5,r6,r7,r8,r9,r11,pc}
 
     /* The trap handler */