From a373fddbd3b129f3f95474533e74f0a52744ff8c Mon Sep 17 00:00:00 2001
From: Mike Pall <mike>
Date: Mon, 30 Jul 2012 18:59:13 +0200
Subject: [PATCH] ARM: Add VFP and hard-float ABI variants to interpreter.

---
 src/lj_frame.h      |   4 +
 src/lj_target_arm.h |  11 +-
 src/vm_arm.dasc     | 447 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 434 insertions(+), 28 deletions(-)
diff --git a/src/lj_frame.h b/src/lj_frame.h
index b8429c2a..b8af2349 100644
--- a/src/lj_frame.h
+++ b/src/lj_frame.h
@@ -97,7 +97,11 @@ enum {
 #define CFRAME_OFS_L		12
 #define CFRAME_OFS_PC		8
 #define CFRAME_OFS_MULTRES	4
+#if LJ_ARCH_HASFPU
+#define CFRAME_SIZE		128
+#else
 #define CFRAME_SIZE		64
+#endif
 #define CFRAME_SHIFT_MULTRES	3
 #elif LJ_TARGET_PPC
 #if LJ_ARCH_PPC64
diff --git a/src/lj_target_arm.h b/src/lj_target_arm.h
index a24fc819..20e8ad36 100644
--- a/src/lj_target_arm.h
+++ b/src/lj_target_arm.h
@@ -14,7 +14,9 @@
 #if LJ_SOFTFP
 #define FPRDEF(_)
 #else
-#error "NYI: hard-float support for ARM"
+#define FPRDEF(_) \
+  _(D0) _(D1) _(D2) _(D3) _(D4) _(D5) _(D6) _(D7) \
+  _(D8) _(D9) _(D10) _(D11) _(D12) _(D13) _(D14) _(D15)
 #endif
 #define VRIDDEF(_)
 
@@ -45,7 +47,7 @@ enum {
 #if LJ_SOFTFP
   RID_MAX_FPR = RID_MIN_FPR,
 #else
-#error "NYI: VFP support for ARM"
+  RID_MAX_FPR = RID_D15+1,
 #endif
   RID_NUM_GPR = RID_MAX_GPR - RID_MIN_GPR,
   RID_NUM_FPR = RID_MAX_FPR - RID_MIN_FPR
@@ -68,7 +70,8 @@ enum {
 #define RSET_FPR		0
 #define RSET_ALL		RSET_GPR
 #else
-#error "NYI: VFP support for ARM"
+#define RSET_FPR		(RSET_RANGE(RID_MIN_FPR, RID_MAX_FPR))
+#define RSET_ALL		(RSET_GPR|RSET_FPR)
 #endif
 #define RSET_INIT		RSET_ALL
 
@@ -82,7 +85,7 @@ enum {
 #if LJ_SOFTFP
 #define RSET_SCRATCH_FPR	0
 #else
-#error "NYI: VFP support for ARM"
+#define RSET_SCRATCH_FPR	(RSET_RANGE(RID_D0, RID_D7+1))
 #endif
 #define RSET_SCRATCH		(RSET_SCRATCH_GPR|RSET_SCRATCH_FPR)
 #define REGARG_FIRSTGPR		RID_R0
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index 8ddce49e..26f97aa3 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
@@ -46,6 +46,7 @@
 |.define CRET2,		r1
 |
 |// Stack layout while in interpreter. Must match with lj_frame.h.
+|.define SAVE_R4,	[sp, #28]
 |.define CFRAME_SPACE,	#28
 |.define SAVE_ERRF,	[sp, #24]
 |.define SAVE_NRES,	[sp, #20]
@@ -60,6 +61,20 @@
 |.define TMPD,		[sp]
 |.define TMPDp,		sp
 |
+|.if FPU
+|.macro saveregs		// FPU variant: save callee-saved GPRs and d8-d15.
+|  push {r5, r6, r7, r8, r9, r10, r11, lr}	// r4 is stored separately below.
+|  vpush {d8-d15}
+|  sub sp, sp, CFRAME_SPACE+4	// One extra word above CFRAME_SPACE for r4.
+|  str r4, SAVE_R4		// SAVE_R4 is [sp, #28], i.e. the extra word.
+|.endmacro
+|.macro restoreregs_ret	// FPU variant: exact inverse of saveregs, then return.
+|  ldr r4, SAVE_R4		// Reload r4 before sp is moved.
+|  add sp, sp, CFRAME_SPACE+4
+|  vpop {d8-d15}
+|  pop {r5, r6, r7, r8, r9, r10, r11, pc}	// Popping pc returns to the caller.
+|.endmacro
+|.else
 |.macro saveregs
 |  push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 |  sub sp, sp, CFRAME_SPACE
@@ -68,6 +83,7 @@
 |  add sp, sp, CFRAME_SPACE
 |  pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 |.endmacro
+|.endif
 |
 |// Type definitions. Some of these are only used for documentation.
 |.type L,		lua_State,	LREG
@@ -875,6 +891,29 @@ static void build_subroutines(BuildCtx *ctx)
   |  bhs ->fff_fallback
   |.endmacro
   |
+  |.macro .ffunc_d, name	// Fast function prologue: one FP arg in d0.
+  |  .ffunc name
+  |  ldr CARG2, [BASE, #4]	// Word at BASE+4 of arg 1, type-checked below.
+  |   cmp NARGS8:RC, #8
+  |  vldr d0, [BASE]		// Load arg 1 as a double before the checks resolve.
+  |   blo ->fff_fallback	// RC < 8: argument missing.
+  |  checktp CARG2, LJ_TISNUM
+  |  bhs ->fff_fallback	// Only the "lo" outcome continues with d0.
+  |.endmacro
+  |
+  |.macro .ffunc_dd, name	// Fast function prologue: two FP args in d0/d1.
+  |  .ffunc name
+  |  ldr CARG2, [BASE, #4]	// Word at BASE+4 of arg 1, type-checked below.
+  |  ldr CARG4, [BASE, #12]	// Word at BASE+12 of arg 2, type-checked below.
+  |   cmp NARGS8:RC, #16
+  |  vldr d0, [BASE]		// Load both args as doubles before the checks resolve.
+  |  vldr d1, [BASE, #8]
+  |   blo ->fff_fallback	// RC < 16: fewer than two arguments.
+  |  checktp CARG2, LJ_TISNUM
+  |  cmnlo CARG4, #-LJ_TISNUM	// Check arg 2 only if arg 1 passed ("lo").
+  |  bhs ->fff_fallback	// Either check not "lo": fall back.
+  |.endmacro
+  |
   |// Inlined GC threshold check. Caveat: uses CARG1 and CARG2.
   |.macro ffgccheck
   |  ldr CARG1, [DISPATCH, #DISPATCH_GL(gc.total)]
@@ -1327,8 +1366,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  movmi CARG1, #0x80000000
   |  bmi <1
   |4:
+  |.if HFABI
+  |  vmov d0, CARG1, CARG2
+  |  bl ->vm_..func.._hf
+  |  b ->fff_resd
+  |.else
   |  bl ->vm_..func
   |  b ->fff_restv
+  |.endif
   |.endmacro
   |
   |  math_round floor
@@ -1381,22 +1426,48 @@ static void build_subroutines(BuildCtx *ctx)
   |  b <5
   |
   |.macro math_extern, func
+  |.if HFABI
+  |  .ffunc_d math_ .. func
+  |.else
   |  .ffunc_n math_ .. func
+  |.endif
   |  .IOS mov RA, BASE
   |  bl extern func
   |  .IOS mov BASE, RA
+  |.if HFABI
+  |  b ->fff_resd
+  |.else
   |  b ->fff_restv
+  |.endif
   |.endmacro
   |
   |.macro math_extern2, func
+  |.if HFABI
+  |  .ffunc_dd math_ .. func
+  |.else
   |  .ffunc_nn math_ .. func
+  |.endif
   |  .IOS mov RA, BASE
   |  bl extern func
   |  .IOS mov BASE, RA
+  |.if HFABI
+  |  b ->fff_resd
+  |.else
   |  b ->fff_restv
+  |.endif
   |.endmacro
   |
+  |.if FPU
+  |  .ffunc_d math_sqrt
+  |  vsqrt.f64 d0, d0
+  |->fff_resd:
+  |  ldr PC, [BASE, FRAME_PC]
+  |  vstr d0, [BASE, #-8]
+  |  b ->fff_res1
+  |.else
   |  math_extern sqrt
+  |.endif
+  |
   |  math_extern log
   |  math_extern log10
   |  math_extern exp
@@ -1414,11 +1485,34 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern2 fmod
   |
   |->ff_math_deg:
-  |.ffunc_n math_rad
+  |.if FPU
+  |  .ffunc_d math_rad
+  |  vldr d1, CFUNC:CARG3->upvalue[0]
+  |  vmul.f64 d0, d0, d1
+  |  b ->fff_resd
+  |.else
+  |  .ffunc_n math_rad
   |  ldrd CARG34, CFUNC:CARG3->upvalue[0]
   |  bl extern __aeabi_dmul
   |  b ->fff_restv
+  |.endif
   |
+  |.if HFABI
+  |  .ffunc math_ldexp
+  |  ldr CARG4, [BASE, #4]
+  |  ldrd CARG12, [BASE, #8]
+  |   cmp NARGS8:RC, #16
+  |   blo ->fff_fallback
+  |  vldr d0, [BASE]
+  |  checktp CARG4, LJ_TISNUM
+  |  bhs ->fff_fallback
+  |  checktp CARG2, LJ_TISNUM
+  |  bne ->fff_fallback
+  |  .IOS mov RA, BASE
+  |  bl extern ldexp			// (double x, int exp)
+  |  .IOS mov BASE, RA
+  |  b ->fff_resd
+  |.else
   |.ffunc_2 math_ldexp
   |  checktp CARG2, LJ_TISNUM
   |  bhs ->fff_fallback
@@ -1428,7 +1522,22 @@ static void build_subroutines(BuildCtx *ctx)
   |  bl extern ldexp			// (double x, int exp)
   |  .IOS mov BASE, RA
   |  b ->fff_restv
+  |.endif
   |
+  |.if HFABI
+  |.ffunc_d math_frexp
+  |  mov CARG1, sp
+  |  .IOS mov RA, BASE
+  |  bl extern frexp
+  |  .IOS mov BASE, RA
+  |   ldr CARG3, [sp]
+  |   mvn CARG4, #~LJ_TISNUM
+  |    ldr PC, [BASE, FRAME_PC]
+  |  vstr d0, [BASE, #-8]
+  |    mov RC, #(2+1)*8
+  |   strd CARG34, [BASE]
+  |  b ->fff_res
+  |.else
   |.ffunc_n math_frexp
   |  mov CARG3, sp
   |  .IOS mov RA, BASE
@@ -1441,7 +1550,19 @@ static void build_subroutines(BuildCtx *ctx)
   |    mov RC, #(2+1)*8
   |   strd CARG34, [BASE]
   |  b ->fff_res
+  |.endif
   |
+  |.if HFABI
+  |.ffunc_d math_modf
+  |  sub CARG1, BASE, #8
+  |   ldr PC, [BASE, FRAME_PC]
+  |  .IOS mov RA, BASE
+  |  bl extern modf
+  |  .IOS mov BASE, RA
+  |   mov RC, #(2+1)*8
+  |  vstr d0, [BASE]
+  |  b ->fff_res
+  |.else
   |.ffunc_n math_modf
   |  sub CARG3, BASE, #8
   |   ldr PC, [BASE, FRAME_PC]
@@ -1451,8 +1572,56 @@ static void build_subroutines(BuildCtx *ctx)
   |   mov RC, #(2+1)*8
   |  strd CARG12, [BASE]
   |  b ->fff_res
+  |.endif
   |
   |.macro math_minmax, name, cond, fcond
+  |.if FPU
+  |  .ffunc_1 name
+  |   add RB, BASE, RC
+  |  checktp CARG2, LJ_TISNUM
+  |   add RA, BASE, #8
+  |  bne >4
+  |1:  // Handle integers.
+  |  ldrd CARG34, [RA]
+  |   cmp RA, RB
+  |   bhs ->fff_restv
+  |  checktp CARG4, LJ_TISNUM
+  |  bne >3
+  |  cmp CARG1, CARG3
+  |   add RA, RA, #8
+  |  mov..cond CARG1, CARG3
+  |  b <1
+  |3:  // Convert intermediate result to number and continue below.
+  |  vmov s4, CARG1
+  |  bhi ->fff_fallback
+  |  vldr d1, [RA]
+  |  vcvt.f64.s32 d0, s4
+  |  b >6
+  |
+  |4:
+  |  vldr d0, [BASE]
+  |  bhi ->fff_fallback
+  |5:  // Handle numbers.
+  |  ldrd CARG34, [RA]
+  |  vldr d1, [RA]
+  |   cmp RA, RB
+  |   bhs ->fff_resd
+  |  checktp CARG4, LJ_TISNUM
+  |  bhs >7
+  |6:
+  |  vcmp.f64 d0, d1
+  |  vmrs
+  |   add RA, RA, #8
+  |  vmov..fcond.f64 d0, d1
+  |  b <5
+  |7:  // Convert integer to number and continue above.
+  |  vmov s4, CARG3
+  |  bhi ->fff_fallback
+  |  vcvt.f64.s32 d1, s4
+  |  b <6
+  |
+  |.else
+  |
   |  .ffunc_1 name
   |  checktp CARG2, LJ_TISNUM
   |   mov RA, #8
@@ -1467,9 +1636,8 @@ static void build_subroutines(BuildCtx *ctx)
   |   add RA, RA, #8
   |  mov..cond CARG1, CARG3
   |  b <1
-  |3:
+  |3:  // Convert intermediate result to number and continue below.
   |  bhi ->fff_fallback
-  |  // Convert intermediate result to number and continue below.
   |  bl extern __aeabi_i2d
   |  ldrd CARG34, [BASE, RA]
   |  b >6
@@ -1495,6 +1663,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  bl extern __aeabi_i2d
   |  ldrd CARG34, TMPD
   |  b <6
+  |.endif
   |.endmacro
   |
   |  math_minmax math_min, gt, hi
@@ -1959,6 +2128,9 @@ static void build_subroutines(BuildCtx *ctx)
   |  ldr CARG2, [CARG1, #-4]!	// Get exit instruction.
   |   str CARG1, [sp, #56]	// Store exit pc in RID_LR and RID_PC.
   |   str CARG1, [sp, #60]
+  |.if FPU
+  |  vpush {d0-d15}
+  |.endif
   |  lsl CARG2, CARG2, #8
   |  add CARG1, CARG1, CARG2, asr #6
   |   ldr CARG2, [lr, #4]	// Load exit stub group offset.
@@ -2025,8 +2197,53 @@ static void build_subroutines(BuildCtx *ctx)
   |// FP value rounding. Called from JIT code.
   |//
   |// double lj_vm_floor/ceil/trunc(double x);
-  |.macro vm_round, func
-  |->vm_ .. func:
+  |.macro vm_round, func, hf
+  |.if FPU
+  |.if hf == 0
+  |  vmov d0, CARG1, CARG2
+  |  vldr d2, <8			// 2^52
+  |.else
+  |  vldr d2, <8			// 2^52
+  |  vmov CARG1, CARG2, d0
+  |.endif
+  |  vabs.f64 d1, d0
+  |  vcmp.f64 d1, d2			// |x| >= 2^52 or NaN?
+  |  vmrs
+  |.if "func" == "trunc"
+  |  bxpl lr				// Return argument unchanged (d0 still holds x).
+  |  vadd.f64 d0, d1, d2		// Only clobber d0 after the early return.
+  |  vsub.f64 d0, d0, d2		// (|x| + 2^52) - 2^52
+  |  vldr d2, <9			// +1.0
+  |  vcmp.f64 d1, d0			// |x| < result: subtract +1.0
+  |  vmrs
+  |  vsubmi.f64 d0, d0, d2		// Adjust the rounded result, not |x| itself.
+  |  cmp CARG2, #0			// CARG2 was filled from d0 by the vmov above.
+  |  vnegmi.f64 d0, d0			// Merge sign bit back in.
+  |.else
+  |  vadd.f64 d1, d1, d2
+  |  bxpl lr				// Return argument unchanged.
+  |  cmp CARG2, #0
+  |  vsub.f64 d1, d1, d2		// (|x| + 2^52) - 2^52
+  |  vldr d2, <9			// +1.0
+  |  vnegmi.f64 d1, d1			// Merge sign bit back in.
+  |.if "func" == "floor"
+  |  vcmp.f64 d0, d1			// x < result: subtract +1.0.
+  |  vmrs
+  |  vsubmi.f64 d0, d1, d2
+  |.else
+  |  vcmp.f64 d1, d0			// x > result: add +1.0.
+  |  vmrs
+  |  vaddmi.f64 d0, d1, d2
+  |.endif
+  |  vmovpl.f64 d0, d1
+  |.endif
+  |.if hf == 0
+  |  vmov CARG1, CARG2, d0
+  |.endif
+  |  bx lr
+  |
+  |.else
+  |
   |  lsl CARG3, CARG2, #1
   |  adds RB, CARG3, #0x00200000
   |  bpl >2				// |x| < 1?
@@ -2069,15 +2286,40 @@ static void build_subroutines(BuildCtx *ctx)
   |  ldrne CARG4, <9			// hi = sign(x) | (iszero ? 0.0 : 1.0)
   |  orrne CARG2, CARG2, CARG4
   |  bx lr
+  |.endif
   |.endmacro
   |
+  |.if FPU
+  |.align 8
   |9:
-  |  .long 0x3ff00000			// hiword(1.0)
-  |  vm_round floor
-  |  vm_round ceil
+  |  .long 0, 0x3ff00000		// +1.0
+  |8:
+  |  .long 0, 0x43300000		// 2^52
+  |.else
+  |9:
+  |  .long 0x3ff00000			// hiword(+1.0)
+  |.endif
+  |
+  |->vm_floor:
+  |.if not HFABI
+  |  vm_round floor, 0
+  |.endif
+  |->vm_floor_hf:
+  |.if FPU
+  |  vm_round floor, 1
+  |.endif
+  |
+  |->vm_ceil:
+  |.if not HFABI
+  |  vm_round ceil, 0
+  |.endif
+  |->vm_ceil_hf:
+  |.if FPU
+  |  vm_round ceil, 1
+  |.endif
   |
   |->vm_trunc:
-  |.if JIT
+  |.if JIT and not HFABI
   |  lsl CARG3, CARG2, #1
   |  adds RB, CARG3, #0x00200000
   |  andpl CARG2, CARG2, #0x80000000	// |x| < 1? hi = sign(x), lo = 0.
@@ -2093,8 +2335,23 @@ static void build_subroutines(BuildCtx *ctx)
   |  bx lr
   |.endif
   |
+  |->vm_trunc_hf:
+  |.if JIT and FPU
+  |  vm_round trunc, 1
+  |.endif
+  |
   |  // double lj_vm_mod(double dividend, double divisor);
   |->vm_mod:
+  |.if FPU
+  |  // Special calling convention: d6 % d7 -> d6. Also, RC (r11) is not preserved.
+  |  vdiv.f64 d0, d6, d7		// d0 = d6 / d7
+  |   mov RC, lr			// Keep return address across the nested call.
+  |  bl ->vm_floor_hf		// d0 = floor(d0), argument and result in d0.
+  |  vmul.f64 d0, d0, d7		// d0 = floor(d6 / d7) * d7
+  |   mov lr, RC
+  |  vsub.f64 d6, d6, d0		// d6 = d6 - floor(d6 / d7) * d7
+  |  bx lr
+  |.else
   |  push {r0, r1, r2, r3, r4, lr}
   |  bl extern __aeabi_ddiv
   |  bl ->vm_floor
@@ -2105,6 +2362,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  bl extern __aeabi_dadd
   |  add sp, sp, #20
   |  pop {pc}
+  |.endif
   |
   |  // int lj_vm_modi(int dividend, int divisor);
   |->vm_modi:
@@ -2266,6 +2524,39 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_next
     |
     |3: // CARG12 is not an integer.
+    |.if FPU
+    |   vldr d0, [RA]
+    |  bhi ->vmeta_comp
+    |  // d0 is a number.
+    |  checktp CARG4, LJ_TISNUM
+    |   vldr d1, [RC]
+    |  blo >5
+    |  bhi ->vmeta_comp		// Second operand is neither number nor integer.
+    |  // d0 is a number, CARG3 is an integer.
+    |  vmov s4, CARG3
+    |  vcvt.f64.s32 d1, s4
+    |  b >5
+    |4:  // CARG1 is an integer, CARG34 is not an integer.
+    |   vldr d1, [RC]
+    |  bhi ->vmeta_comp
+    |  // CARG1 is an integer, d1 is a number.
+    |  vmov s4, CARG1
+    |  vcvt.f64.s32 d0, s4
+    |5:  // d0 and d1 are numbers.
+    |  vcmp.f64 d0, d1
+    |  vmrs
+    |  // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
+    if (op == BC_ISLT) {
+      |  sublo PC, RB, #0x20000
+    } else if (op == BC_ISGE) {
+      |  subhs PC, RB, #0x20000
+    } else if (op == BC_ISLE) {
+      |  subls PC, RB, #0x20000
+    } else {
+      |  subhi PC, RB, #0x20000
+    }
+    |  b <1
+    |.else
     |  bhi ->vmeta_comp
     |  // CARG12 is a number.
     |  checktp CARG4, LJ_TISNUM
@@ -2282,7 +2572,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  b >5
     |4:  // CARG1 is an integer, CARG34 is not an integer.
     |  bhi ->vmeta_comp
-    |  // CARG1 is an integer, CARG34 is a number
+    |  // CARG1 is an integer, CARG34 is a number.
     |  mov RA, RB			// Save RB.
     |  bl extern __aeabi_i2d
     |  ldrd CARG34, [RC]		// Restore second operand.
@@ -2299,6 +2589,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  subhi PC, RA, #0x20000
     }
     |  b <1
+    |.endif
     break;
 
   case BC_ISEQV: case BC_ISNEV:
@@ -2439,6 +2730,27 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     }
     |  bhi <2
     |.endif
+    |.if FPU
+    |  checktp CARG4, LJ_TISNUM
+    |  vmov s4, CARG3
+    |   vldr d0, [RA]
+    |  vldrlo d1, [RC]
+    |  vcvths.f64.s32 d1, s4
+    |  b >5
+    |4:  // CARG1 is an integer, d1 is a number.
+    |  vmov s4, CARG1
+    |   vldr d1, [RC]
+    |  vcvt.f64.s32 d0, s4
+    |5:  // d0 and d1 are numbers.
+    |  vcmp.f64 d0, d1
+    |  vmrs
+    if (vk) {
+      |  subeq PC, RB, #0x20000
+    } else {
+      |  subne PC, RB, #0x20000
+    }
+    |  b <2
+    |.else
     |  // CARG12 is a number.
     |  checktp CARG4, LJ_TISNUM
     |  movlo RA, RB			// Save RB.
@@ -2458,6 +2770,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  subne PC, RA, #0x20000
     }
     |  b <2
+    |.endif
     |
     |.if FFI
     |7:
@@ -2617,20 +2930,55 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
     ||switch (vk) {
     ||case 0:
+    |   .if FPU
+    |   ldrd CARG12, [RB, BASE]!
+    |    ldrd CARG34, [RC, KBASE]!
+    |   .else
     |   ldrd CARG12, [BASE, RB]
     |    ldrd CARG34, [KBASE, RC]
+    |   .endif
     ||  break;
     ||case 1:
+    |   .if FPU
+    |   ldrd CARG34, [RB, BASE]!
+    |    ldrd CARG12, [RC, KBASE]!
+    |   .else
     |   ldrd CARG34, [BASE, RB]
     |    ldrd CARG12, [KBASE, RC]
+    |   .endif
     ||  break;
     ||default:
+    |   .if FPU
+    |   ldrd CARG12, [RB, BASE]!
+    |    ldrd CARG34, [RC, BASE]!
+    |   .else
     |   ldrd CARG12, [BASE, RB]
     |    ldrd CARG34, [BASE, RC]
+    |   .endif
     ||  break;
     ||}
     |.endmacro
     |
+    |.macro ins_arithpre_fpu, reg1, reg2
+    |.if FPU
+    ||if (vk == 1) {
+    |  vldr reg2, [RB]
+    |  vldr reg1, [RC]
+    ||} else {
+    |  vldr reg1, [RB]
+    |  vldr reg2, [RC]
+    ||}
+    |.endif
+    |.endmacro
+    |
+    |.macro ins_arithpost_fpu, reg
+    |   ins_next1
+    |  add RA, BASE, RA
+    |   ins_next2
+    |  vstr reg, [RA]
+    |   ins_next3
+    |.endmacro
+    |
     |.macro ins_arithfallback, ins
     ||switch (vk) {
     ||case 0:
@@ -2645,9 +2993,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     ||}
     |.endmacro
     |
-    |.macro ins_arithdn, intins, fpcall
+    |.macro ins_arithdn, intins, fpins, fpcall
     |  ins_arithpre
-    |.if "intins" ~= "vm_modi"
+    |.if "intins" ~= "vm_modi" and not FPU
     |   ins_next1
     |.endif
     |  ins_arithcheck_int >5
@@ -2665,57 +3013,74 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_arithfallback bvs
     |.endif
     |4:
-    |.if "intins" == "vm_modi"
+    |.if "intins" == "vm_modi" or FPU
     |   ins_next1
     |.endif
     |   ins_next2
     |  strd CARG12, [BASE, RA]
     |   ins_next3
     |5:  // FP variant.
+    |  ins_arithpre_fpu d6, d7
     |  ins_arithfallback ins_arithcheck_num
+    |.if FPU
     |.if "intins" == "vm_modi"
     |  bl fpcall
     |.else
+    |  fpins d6, d6, d7
+    |.endif
+    |  ins_arithpost_fpu d6
+    |.else
     |  bl fpcall
-    |   ins_next1
+    |.if "intins" ~= "vm_modi"
+    |  ins_next1
     |.endif
     |  b <4
+    |.endif
     |.endmacro
     |
-    |.macro ins_arithfp, fpcall
+    |.macro ins_arithfp, fpins, fpcall
     |  ins_arithpre
+    |.if "fpins" ~= "extern" or HFABI
+    |  ins_arithpre_fpu d0, d1
+    |.endif
     |  ins_arithfallback ins_arithcheck_num
-    |.if "fpcall" == "extern pow"
+    |.if "fpins" == "extern"
     |  .IOS mov RC, BASE
     |  bl fpcall
     |  .IOS mov BASE, RC
+    |.elif FPU
+    |  fpins d0, d0, d1
     |.else
     |  bl fpcall
     |.endif
+    |.if ("fpins" ~= "extern" or HFABI) and FPU
+    |  ins_arithpost_fpu d0
+    |.else
     |   ins_next1
     |   ins_next2
     |  strd CARG12, [BASE, RA]
     |   ins_next3
+    |.endif
     |.endmacro
 
   case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
-    |  ins_arithdn adds, extern __aeabi_dadd
+    |  ins_arithdn adds, vadd.f64, extern __aeabi_dadd
     break;
   case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
-    |  ins_arithdn subs, extern __aeabi_dsub
+    |  ins_arithdn subs, vsub.f64, extern __aeabi_dsub
     break;
   case BC_MULVN: case BC_MULNV: case BC_MULVV:
-    |  ins_arithdn smull, extern __aeabi_dmul
+    |  ins_arithdn smull, vmul.f64, extern __aeabi_dmul
     break;
   case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
-    |  ins_arithfp extern __aeabi_ddiv
+    |  ins_arithfp vdiv.f64, extern __aeabi_ddiv
     break;
   case BC_MODVN: case BC_MODNV: case BC_MODVV:
-    |  ins_arithdn vm_modi, ->vm_mod
+    |  ins_arithdn vm_modi, vm_mod, ->vm_mod
     break;
   case BC_POW:
     |  // NYI: (partial) integer arithmetic.
-    |  ins_arithfp extern pow
+    |  ins_arithfp extern, extern pow
     break;
 
   case BC_CAT:
@@ -3775,20 +4140,46 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  cmnlo CARG4, #-LJ_TISNUM
       |  cmnlo RB, #-LJ_TISNUM
       |  bhs ->vmeta_for
+      |.if FPU
+      |  vldr d0, FOR_IDX
+      |  vldr d1, FOR_STOP
+      |  cmp RB, #0
+      |  vstr d0, FOR_EXT
+      |.else
       |  cmp RB, #0
-      |   strd CARG12, FOR_IDX
       |   strd CARG12, FOR_EXT
       |  blt >8
+      |.endif
     } else {
+      |.if FPU
+      |  vldr d0, FOR_IDX
+      |  vldr d2, FOR_STEP
+      |  vldr d1, FOR_STOP
+      |  cmp CARG4, #0
+      |  vadd.f64 d0, d0, d2
+      |.else
       |  cmp CARG4, #0
       |  blt >8
       |  bl extern __aeabi_dadd
       |   strd CARG12, FOR_IDX
       |  ldrd CARG34, FOR_STOP
       |   strd CARG12, FOR_EXT
+      |.endif
     }
     |6:
+    |.if FPU
+    |  vcmpge.f64 d0, d1
+    |  vcmplt.f64 d1, d0
+    |  vmrs
+    |.else
     |  bl extern __aeabi_cdcmple
+    |.endif
+    if (vk) {
+      |.if FPU
+      |  vstr d0, FOR_IDX
+      |  vstr d0, FOR_EXT
+      |.endif
+    }
     if (op == BC_FORI) {
       |  subhi PC, RC, #0x20000
     } else if (op == BC_JFORI) {
@@ -3804,6 +4195,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_next2
     |  b <3
     |
+    |.if not FPU
     |8:  // Invert check for negative step.
     if (vk) {
       |  bl extern __aeabi_dadd
@@ -3814,6 +4206,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  mov CARG4, CARG2
     |  ldrd CARG12, FOR_STOP
     |  b <6
+    |.endif
     break;
 
   case BC_ITERL:
@@ -4048,8 +4441,14 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.byte 0xe\n\t.uleb128 %d\n"		/* def_cfa_offset */
 	"\t.byte 0x8e\n\t.uleb128 1\n",		/* offset lr */
 	fcofs, CFRAME_SIZE);
-    for (i = 11; i >= 4; i--)  /* offset r4-r11 */
+    for (i = 11; i >= (LJ_ARCH_HASFPU ? 5 : 4); i--)  /* offset r5-r11 (+r4 w/o FPU) */
       fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 2+(11-i));
+#if LJ_ARCH_HASFPU
+    for (i = 15; i >= 8; i--)  /* offset d8-d15 */
+      fprintf(ctx->fp, "\t.byte 5\n\t.uleb128 %d, %d\n",
+	64+2*i, 10+2*(15-i));
+    fprintf(ctx->fp, "\t.byte 0x84\n\t.uleb128 %d\n", 25);  /* offset r4 */
+#endif
     fprintf(ctx->fp,
 	"\t.align 2\n"
 	".LEFDE0:\n\n");