ARM: Drop hard-fp variants of floor/ceil/trunc.

Soft-fp variants are faster on a Cortex-A9. Duh.
This commit is contained in:
Mike Pall 2012-10-15 16:53:03 +02:00
parent 894d2d6ef4
commit 2621617a92
3 changed files with 57 additions and 88 deletions

View File

@@ -1574,15 +1574,23 @@ static void asm_callid(ASMState *as, IRIns *ir, IRCallID id)
static void asm_callround(ASMState *as, IRIns *ir, int id) static void asm_callround(ASMState *as, IRIns *ir, int id)
{ {
/* The modified regs must match with the *.dasc implementation. */ /* The modified regs must match with the *.dasc implementation. */
RegSet drop = RID2RSET(RID_D0)|RID2RSET(RID_D1)|RID2RSET(RID_D2)| RegSet drop = RID2RSET(RID_R0)|RID2RSET(RID_R1)|RID2RSET(RID_R2)|
RID2RSET(RID_R0)|RID2RSET(RID_R1); RID2RSET(RID_R3)|RID2RSET(RID_R12);
if (ra_hasreg(ir->r)) rset_clear(drop, ir->r); RegSet of;
Reg dest, src;
ra_evictset(as, drop); ra_evictset(as, drop);
ra_destreg(as, ir, RID_FPRET); dest = ra_dest(as, ir, RSET_FPR);
emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_hf : emit_dnm(as, ARMI_VMOV_D_RR, RID_RETLO, RID_RETHI, (dest & 15));
id == IRFPM_CEIL ? (void *)lj_vm_ceil_hf : emit_call(as, id == IRFPM_FLOOR ? (void *)lj_vm_floor_sf :
(void *)lj_vm_trunc_hf); id == IRFPM_CEIL ? (void *)lj_vm_ceil_sf :
ra_leftov(as, RID_D0, ir->op1); (void *)lj_vm_trunc_sf);
/* Workaround to protect argument GPRs from being used for remat. */
of = as->freeset;
as->freeset &= ~RSET_RANGE(RID_R0, RID_R1+1);
as->cost[RID_R0] = as->cost[RID_R1] = REGCOST(~0u, ASMREF_L);
src = ra_alloc1(as, ir->op1, RSET_FPR); /* May alloc GPR to remat FPR. */
as->freeset |= (of & RSET_RANGE(RID_R0, RID_R1+1));
emit_dnm(as, ARMI_VMOV_RR_D, RID_R0, RID_R1, (src & 15));
} }
#endif #endif

View File

@@ -56,8 +56,8 @@ LJ_ASMF void lj_vm_exit_interp(void);
LJ_ASMF double lj_vm_floor(double); LJ_ASMF double lj_vm_floor(double);
LJ_ASMF double lj_vm_ceil(double); LJ_ASMF double lj_vm_ceil(double);
#if LJ_TARGET_ARM #if LJ_TARGET_ARM
LJ_ASMF double lj_vm_floor_hf(double); LJ_ASMF double lj_vm_floor_sf(double);
LJ_ASMF double lj_vm_ceil_hf(double); LJ_ASMF double lj_vm_ceil_sf(double);
#endif #endif
#endif #endif
#if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64 #if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64
@@ -81,7 +81,7 @@ LJ_ASMF void lj_vm_powi_sse(void);
#else #else
LJ_ASMF double lj_vm_trunc(double); LJ_ASMF double lj_vm_trunc(double);
#if LJ_TARGET_ARM #if LJ_TARGET_ARM
LJ_ASMF double lj_vm_trunc_hf(double); LJ_ASMF double lj_vm_trunc_sf(double);
#endif #endif
#endif #endif
LJ_ASMF double lj_vm_powi(double, int32_t); LJ_ASMF double lj_vm_powi(double, int32_t);

View File

@@ -1368,14 +1368,8 @@ static void build_subroutines(BuildCtx *ctx)
| movmi CARG1, #0x80000000 | movmi CARG1, #0x80000000
| bmi <1 | bmi <1
|4: |4:
|.if HFABI | bl ->vm_..func.._sf
| vmov d0, CARG1, CARG2
| bl ->vm_..func.._hf
| b ->fff_resd
|.else
| bl ->vm_..func
| b ->fff_restv | b ->fff_restv
|.endif
|.endmacro |.endmacro
| |
| math_round floor | math_round floor
@@ -2221,52 +2215,9 @@ static void build_subroutines(BuildCtx *ctx)
|// |//
|// double lj_vm_floor/ceil/trunc(double x); |// double lj_vm_floor/ceil/trunc(double x);
|.macro vm_round, func, hf |.macro vm_round, func, hf
|.if FPU |.if hf == 1
|.if hf == 0
| vmov d0, CARG1, CARG2
| vldr d2, <8 // 2^52
|.else
| vldr d2, <8 // 2^52
| vmov CARG1, CARG2, d0 | vmov CARG1, CARG2, d0
|.endif |.endif
| vabs.f64 d1, d0
| vcmp.f64 d1, d2 // |x| >= 2^52 or NaN?
| vmrs
|.if "func" == "trunc"
| bxpl lr // Return argument unchanged.
| vadd.f64 d0, d1, d2
| vsub.f64 d0, d0, d2 // (|x| + 2^52) - 2^52
| vldr d2, <9 // +1.0
| vcmp.f64 d1, d0 // |x| < result: subtract +1.0
| vmrs
| vsubmi.f64 d0, d0, d2
| cmp CARG2, #0
| vnegmi.f64 d0, d0 // Merge sign bit back in.
|.else
| vadd.f64 d1, d1, d2
| bxpl lr // Return argument unchanged.
| cmp CARG2, #0
| vsub.f64 d1, d1, d2 // (|x| + 2^52) - 2^52
| vldr d2, <9 // +1.0
| vnegmi.f64 d1, d1 // Merge sign bit back in.
|.if "func" == "floor"
| vcmp.f64 d0, d1 // x < result: subtract +1.0.
| vmrs
| vsubmi.f64 d0, d1, d2
|.else
| vcmp.f64 d1, d0 // x > result: add +1.0.
| vmrs
| vaddmi.f64 d0, d1, d2
|.endif
| vmovpl.f64 d0, d1
|.endif
|.if hf == 0
| vmov CARG1, CARG2, d0
|.endif
| bx lr
|
|.else
|
| lsl CARG3, CARG2, #1 | lsl CARG3, CARG2, #1
| adds RB, CARG3, #0x00200000 | adds RB, CARG3, #0x00200000
| bpl >2 // |x| < 1? | bpl >2 // |x| < 1?
@@ -2286,6 +2237,9 @@ static void build_subroutines(BuildCtx *ctx)
|.else |.else
| bics CARG3, CARG3, CARG2, asr #31 // iszero = ((ztest & ~signmask) == 0) | bics CARG3, CARG3, CARG2, asr #31 // iszero = ((ztest & ~signmask) == 0)
|.endif |.endif
|.if hf == 1
| vmoveq d0, CARG1, CARG2
|.endif
| bxeq lr // iszero: done. | bxeq lr // iszero: done.
| mvn CARG4, #1 | mvn CARG4, #1
| cmp RB, #0 | cmp RB, #0
@@ -2294,6 +2248,9 @@ static void build_subroutines(BuildCtx *ctx)
| add RB, RB, #32 | add RB, RB, #32
| subs CARG1, CARG1, CARG4, lsl RB // lo = lo-lomask | subs CARG1, CARG1, CARG4, lsl RB // lo = lo-lomask
| sbc CARG2, CARG2, CARG3 // hi = hi-himask+carry | sbc CARG2, CARG2, CARG3 // hi = hi-himask+carry
|.if hf == 1
| vmov d0, CARG1, CARG2
|.endif
| bx lr | bx lr
| |
|2: // |x| < 1: |2: // |x| < 1:
@@ -2308,45 +2265,41 @@ static void build_subroutines(BuildCtx *ctx)
| and CARG2, CARG2, #0x80000000 | and CARG2, CARG2, #0x80000000
| ldrne CARG4, <9 // hi = sign(x) | (iszero ? 0.0 : 1.0) | ldrne CARG4, <9 // hi = sign(x) | (iszero ? 0.0 : 1.0)
| orrne CARG2, CARG2, CARG4 | orrne CARG2, CARG2, CARG4
| bx lr |.if hf == 1
| vmov d0, CARG1, CARG2
|.endif |.endif
| bx lr
|.endmacro |.endmacro
| |
|.if FPU
|.align 8
|9:
| .long 0, 0x3ff00000 // +1.0
|8:
| .long 0, 0x43300000 // 2^52
|.else
|9: |9:
| .long 0x3ff00000 // hiword(+1.0) | .long 0x3ff00000 // hiword(+1.0)
|.endif
| |
|->vm_floor: |->vm_floor:
|.if not HFABI |.if HFABI
| vm_round floor, 0
|.endif
|->vm_floor_hf:
|.if FPU
| vm_round floor, 1 | vm_round floor, 1
|.endif |.endif
|->vm_floor_sf:
| vm_round floor, 0
| |
|->vm_ceil: |->vm_ceil:
|.if not HFABI |.if HFABI
| vm_round ceil, 0
|.endif
|->vm_ceil_hf:
|.if FPU
| vm_round ceil, 1 | vm_round ceil, 1
|.endif |.endif
|->vm_ceil_sf:
| vm_round ceil, 0
| |
|->vm_trunc: |.macro vm_trunc, hf
|.if JIT and not HFABI |.if JIT
|.if hf == 1
| vmov CARG1, CARG2, d0
|.endif
| lsl CARG3, CARG2, #1 | lsl CARG3, CARG2, #1
| adds RB, CARG3, #0x00200000 | adds RB, CARG3, #0x00200000
| andpl CARG2, CARG2, #0x80000000 // |x| < 1? hi = sign(x), lo = 0. | andpl CARG2, CARG2, #0x80000000 // |x| < 1? hi = sign(x), lo = 0.
| movpl CARG1, #0 | movpl CARG1, #0
|.if hf == 1
| vmovpl d0, CARG1, CARG2
|.endif
| bxpl lr | bxpl lr
| mvn CARG4, #0x3cc | mvn CARG4, #0x3cc
| subs RB, CARG4, RB, asr #21 // 2^0: RB = 51, 2^51: RB = 0. | subs RB, CARG4, RB, asr #21 // 2^0: RB = 51, 2^51: RB = 0.
@@ -2355,13 +2308,19 @@ static void build_subroutines(BuildCtx *ctx)
| and CARG1, CARG1, CARG4, lsl RB // lo &= lomask | and CARG1, CARG1, CARG4, lsl RB // lo &= lomask
| subs RB, RB, #32 | subs RB, RB, #32
| andpl CARG2, CARG2, CARG4, lsl RB // |x| <= 2^20: hi &= himask | andpl CARG2, CARG2, CARG4, lsl RB // |x| <= 2^20: hi &= himask
|.if hf == 1
| vmov d0, CARG1, CARG2
|.endif
| bx lr | bx lr
|.endif |.endif
|.endmacro
| |
|->vm_trunc_hf: |->vm_trunc:
|.if JIT and FPU |.if HFABI
| vm_round trunc, 1 | vm_trunc 1
|.endif |.endif
|->vm_trunc_sf:
| vm_trunc 0
| |
| // double lj_vm_mod(double dividend, double divisor); | // double lj_vm_mod(double dividend, double divisor);
|->vm_mod: |->vm_mod:
@@ -2369,7 +2328,9 @@ static void build_subroutines(BuildCtx *ctx)
| // Special calling convention. Also, RC (r11) is not preserved. | // Special calling convention. Also, RC (r11) is not preserved.
| vdiv.f64 d0, d6, d7 | vdiv.f64 d0, d6, d7
| mov RC, lr | mov RC, lr
| bl ->vm_floor_hf | vmov CARG1, CARG2, d0
| bl ->vm_floor_sf
| vmov d0, CARG1, CARG2
| vmul.f64 d0, d0, d7 | vmul.f64 d0, d0, d7
| mov lr, RC | mov lr, RC
| vsub.f64 d6, d6, d0 | vsub.f64 d6, d6, d0
@@ -2377,7 +2338,7 @@ static void build_subroutines(BuildCtx *ctx)
|.else |.else
| push {r0, r1, r2, r3, r4, lr} | push {r0, r1, r2, r3, r4, lr}
| bl extern __aeabi_ddiv | bl extern __aeabi_ddiv
| bl ->vm_floor | bl ->vm_floor_sf
| ldrd CARG34, [sp, #8] | ldrd CARG34, [sp, #8]
| bl extern __aeabi_dmul | bl extern __aeabi_dmul
| ldrd CARG34, [sp] | ldrd CARG34, [sp]