From ad03eba715e5e0d0bd0f3c0ddef4b8f5bbb0c626 Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Mon, 8 Dec 2014 02:02:34 +0100 Subject: [PATCH] x86/x64: Drop internal x87 math functions. Use libm functions. --- src/lj_arch.h | 6 +- src/lj_asm.c | 4 - src/lj_asm_x86.h | 82 +++------ src/lj_ircall.h | 24 +-- src/lj_vm.h | 12 +- src/lj_vmmath.c | 16 +- src/vm_x86.dasc | 427 +++++++---------------------------------------- 7 files changed, 115 insertions(+), 456 deletions(-) diff --git a/src/lj_arch.h b/src/lj_arch.h index da16a193..36b38886 100644 --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -426,11 +426,11 @@ #define LJ_TARGET_UNALIGNED 0 #endif -/* Various workarounds for embedded operating systems. */ -#if (defined(__ANDROID__) && !defined(LJ_TARGET_X86ORX64)) || defined(__symbian__) || LJ_TARGET_XBOX360 +/* Various workarounds for embedded operating systems or weak C runtimes. */ +#if (defined(__ANDROID__) && !defined(LJ_TARGET_X86ORX64)) || defined(__symbian__) || LJ_TARGET_XBOX360 || LJ_TARGET_WINDOWS #define LUAJIT_NO_LOG2 #endif -#if defined(__symbian__) +#if defined(__symbian__) || LJ_TARGET_WINDOWS #define LUAJIT_NO_EXP2 #endif diff --git a/src/lj_asm.c b/src/lj_asm.c index 0b6738da..aaab3255 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1262,9 +1262,6 @@ static void asm_call(ASMState *as, IRIns *ir) } #if !LJ_SOFTFP -static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref); - -#if !LJ_TARGET_X86ORX64 static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref) { const CCallInfo *ci = &lj_ir_callinfo[IRCALL_pow]; @@ -1274,7 +1271,6 @@ static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref) asm_setupresult(as, ir, ci); asm_gencall(as, ci, args); } -#endif static int asm_fpjoin_pow(ASMState *as, IRIns *ir) { diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index 8b541250..bd97764f 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -1593,26 +1593,9 @@ static void asm_x87load(ASMState *as, IRRef ref) } } -static void asm_fppow(ASMState *as, IRIns *ir, IRRef lref, IRRef rref) -{ - /* The modified regs must match with the *.dasc implementation. */ - RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX); - IRIns *irx; - if (ra_hasreg(ir->r)) - rset_clear(drop, ir->r); /* Dest reg handled below. */ - ra_evictset(as, drop); - ra_destreg(as, ir, RID_XMM0); - emit_call(as, lj_vm_pow_sse); - irx = IR(lref); - if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1) - irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */ - ra_left(as, RID_XMM0, lref); - ra_left(as, RID_XMM1, rref); -} - static void asm_fpmath(ASMState *as, IRIns *ir) { - IRFPMathOp fpm = ir->o == IR_FPMATH ? (IRFPMathOp)ir->op2 : IRFPM_OTHER; + IRFPMathOp fpm = (IRFPMathOp)ir->op2; if (fpm == IRFPM_SQRT) { Reg dest = ra_dest(as, ir, RSET_FPR); Reg left = asm_fuseload(as, ir->op1, RSET_FPR); @@ -1645,53 +1628,28 @@ static void asm_fpmath(ASMState *as, IRIns *ir) } } else if (fpm == IRFPM_EXP2 && asm_fpjoin_pow(as, ir)) { /* Rejoined to pow(). */ - } else { /* Handle x87 ops. */ - int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ - Reg dest = ir->r; - if (ra_hasreg(dest)) { - ra_free(as, dest); - ra_modified(as, dest); - emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs); - } - emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); - switch (fpm) { /* st0 = lj_vm_*(st0) */ - case IRFPM_EXP: emit_call(as, lj_vm_exp_x87); break; - case IRFPM_EXP2: emit_call(as, lj_vm_exp2_x87); break; - case IRFPM_SIN: emit_x87op(as, XI_FSIN); break; - case IRFPM_COS: emit_x87op(as, XI_FCOS); break; - case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break; - case IRFPM_LOG: case IRFPM_LOG2: case IRFPM_LOG10: - /* Note: the use of fyl2xp1 would be pointless here. When computing - ** log(1.0+eps) the precision is already lost after 1.0 is added. - ** Subtracting 1.0 won't recover it. OTOH math.log1p would make sense. - */ - emit_x87op(as, XI_FYL2X); break; - case IRFPM_OTHER: - switch (ir->o) { - case IR_ATAN2: - emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break; - case IR_LDEXP: - emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break; - default: lua_assert(0); break; - } - break; - default: lua_assert(0); break; - } - asm_x87load(as, ir->op1); - switch (fpm) { - case IRFPM_LOG: emit_x87op(as, XI_FLDLN2); break; - case IRFPM_LOG2: emit_x87op(as, XI_FLD1); break; - case IRFPM_LOG10: emit_x87op(as, XI_FLDLG2); break; - case IRFPM_OTHER: - if (ir->o == IR_LDEXP) asm_x87load(as, ir->op2); - break; - default: break; - } + } else { + asm_callid(as, ir, IRCALL_lj_vm_floor + fpm); } } -#define asm_atan2(as, ir) asm_fpmath(as, ir) -#define asm_ldexp(as, ir) asm_fpmath(as, ir) +#define asm_atan2(as, ir) asm_callid(as, ir, IRCALL_atan2) + +static void asm_ldexp(ASMState *as, IRIns *ir) +{ + int32_t ofs = sps_scale(ir->s); /* Use spill slot or temp slots. */ + Reg dest = ir->r; + if (ra_hasreg(dest)) { + ra_free(as, dest); + ra_modified(as, dest); + emit_rmro(as, XO_MOVSD, dest, RID_ESP, ofs); + } + emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs); + emit_x87op(as, XI_FPOP1); + emit_x87op(as, XI_FSCALE); + asm_x87load(as, ir->op1); + asm_x87load(as, ir->op2); +} static void asm_fppowi(ASMState *as, IRIns *ir) { diff --git a/src/lj_ircall.h b/src/lj_ircall.h index 9bf46918..e71f0432 100644 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -169,18 +169,18 @@ typedef struct CCallInfo { _(FPMATH, lj_vm_ceil, 1, N, NUM, XA_FP) \ _(FPMATH, lj_vm_trunc, 1, N, NUM, XA_FP) \ _(FPMATH, sqrt, 1, N, NUM, XA_FP) \ - _(FPMATH, exp, 1, N, NUM, XA_FP) \ - _(FPMATH, lj_vm_exp2, 1, N, NUM, XA_FP) \ - _(FPMATH, log, 1, N, NUM, XA_FP) \ - _(FPMATH, lj_vm_log2, 1, N, NUM, XA_FP) \ - _(FPMATH, log10, 1, N, NUM, XA_FP) \ - _(FPMATH, sin, 1, N, NUM, XA_FP) \ - _(FPMATH, cos, 1, N, NUM, XA_FP) \ - _(FPMATH, tan, 1, N, NUM, XA_FP) \ - _(FPMATH, lj_vm_powi, 2, N, NUM, XA_FP) \ - _(FPMATH, pow, 2, N, NUM, XA2_FP) \ - _(FPMATH, atan2, 2, N, NUM, XA2_FP) \ - _(FPMATH, ldexp, 2, N, NUM, XA_FP) \ + _(ANY, exp, 1, N, NUM, XA_FP) \ + _(ANY, lj_vm_exp2, 1, N, NUM, XA_FP) \ + _(ANY, log, 1, N, NUM, XA_FP) \ + _(ANY, lj_vm_log2, 1, N, NUM, XA_FP) \ + _(ANY, log10, 1, N, NUM, XA_FP) \ + _(ANY, sin, 1, N, NUM, XA_FP) \ + _(ANY, cos, 1, N, NUM, XA_FP) \ + _(ANY, tan, 1, N, NUM, XA_FP) \ + _(ANY, lj_vm_powi, 2, N, NUM, XA_FP) \ + _(ANY, pow, 2, N, NUM, XA2_FP) \ + _(ANY, atan2, 2, N, NUM, XA2_FP) \ + _(ANY, ldexp, 2, N, NUM, XA_FP) \ _(SOFTFP, lj_vm_tobit, 2, N, INT, 0) \ _(SOFTFP, softfp_add, 4, N, NUM, 0) \ _(SOFTFP, softfp_sub, 4, N, NUM, 0) \ diff --git a/src/lj_vm.h b/src/lj_vm.h index 83883e2c..a69d699f 100644 --- a/src/lj_vm.h +++ b/src/lj_vm.h @@ -55,15 +55,13 @@ LJ_ASMF void lj_vm_exit_interp(void); #define lj_vm_ceil ceil #else LJ_ASMF double lj_vm_floor(double); -#if !LJ_TARGET_X86ORX64 LJ_ASMF double lj_vm_ceil(double); -#endif #if LJ_TARGET_ARM LJ_ASMF double lj_vm_floor_sf(double); LJ_ASMF double lj_vm_ceil_sf(double); #endif #endif -#if defined(LUAJIT_NO_LOG2) || LJ_TARGET_X86ORX64 +#ifdef LUAJIT_NO_LOG2 LJ_ASMF double lj_vm_log2(double); #else #define lj_vm_log2 log2 @@ -74,11 +72,11 @@ LJ_ASMF double lj_vm_log2(double); LJ_ASMF void lj_vm_floor_sse(void); LJ_ASMF void lj_vm_ceil_sse(void); LJ_ASMF void lj_vm_trunc_sse(void); -LJ_ASMF void lj_vm_exp_x87(void); -LJ_ASMF void lj_vm_exp2_x87(void); -LJ_ASMF void lj_vm_pow_sse(void); LJ_ASMF void lj_vm_powi_sse(void); +#define lj_vm_powi NULL #else +LJ_ASMF double lj_vm_powi(double, int32_t); +#endif #if LJ_TARGET_PPC #define lj_vm_trunc trunc #else @@ -87,13 +85,11 @@ LJ_ASMF double lj_vm_trunc(double); LJ_ASMF double lj_vm_trunc_sf(double); #endif #endif -LJ_ASMF double lj_vm_powi(double, int32_t); #ifdef LUAJIT_NO_EXP2 LJ_ASMF double lj_vm_exp2(double); #else #define lj_vm_exp2 exp2 #endif -#endif LJ_ASMF int32_t LJ_FASTCALL lj_vm_modi(int32_t, int32_t); #if LJ_HASFFI LJ_ASMF int lj_vm_errno(void); diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c index b60858b2..6ea99d15 100644 --- a/src/lj_vmmath.c +++ b/src/lj_vmmath.c @@ -17,14 +17,25 @@ #if LJ_TARGET_X86 && __ELF__ && __PIC__ /* Wrapper functions to deal with the ELF/x86 PIC disaster. */ +LJ_FUNCA double lj_wrap_log(double x) { return log(x); } +LJ_FUNCA double lj_wrap_log10(double x) { return log10(x); } +LJ_FUNCA double lj_wrap_exp(double x) { return exp(x); } +LJ_FUNCA double lj_wrap_sin(double x) { return sin(x); } +LJ_FUNCA double lj_wrap_cos(double x) { return cos(x); } +LJ_FUNCA double lj_wrap_tan(double x) { return tan(x); } +LJ_FUNCA double lj_wrap_asin(double x) { return asin(x); } +LJ_FUNCA double lj_wrap_acos(double x) { return acos(x); } +LJ_FUNCA double lj_wrap_atan(double x) { return atan(x); } LJ_FUNCA double lj_wrap_sinh(double x) { return sinh(x); } LJ_FUNCA double lj_wrap_cosh(double x) { return cosh(x); } LJ_FUNCA double lj_wrap_tanh(double x) { return tanh(x); } +LJ_FUNCA double lj_wrap_atan2(double x, double y) { return atan2(x, y); } +LJ_FUNCA double lj_wrap_pow(double x, double y) { return pow(x, y); } +LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); } #endif /* -- Helper functions for generated machine code ------------------------- */ -#if !LJ_TARGET_X86ORX64 double lj_vm_foldarith(double x, double y, int op) { switch (op) { @@ -45,7 +56,6 @@ double lj_vm_foldarith(double x, double y, int op) default: return x; } } -#endif #if LJ_HASJIT @@ -109,6 +119,7 @@ double lj_vm_powi(double x, int32_t k) else return 1.0 / lj_vm_powui(x, (uint32_t)-k); } +#endif /* Computes fpm(x) for extended math functions. */ double lj_vm_foldfpm(double x, int fpm) @@ -130,7 +141,6 @@ double lj_vm_foldfpm(double x, int fpm) } return 0; } -#endif #if LJ_HASFFI int lj_vm_errno(void) diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc index cd43afbd..290054dc 100644 --- a/src/vm_x86.dasc +++ b/src/vm_x86.dasc @@ -373,7 +373,6 @@ | fpop |.endmacro | -|.macro fdup; fld st0; .endmacro |.macro fpop1; fstp st1; .endmacro | |// Synthesize SSE FP constants. @@ -1329,19 +1328,6 @@ static void build_subroutines(BuildCtx *ctx) | cmp NARGS:RD, 2+1; jb ->fff_fallback |.endmacro | - |.macro .ffunc_n, name - | .ffunc_1 name - | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback - | fld qword [BASE] - |.endmacro - | - |.macro .ffunc_n, name, op - | .ffunc_1 name - | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback - | op - | fld qword [BASE] - |.endmacro - | |.macro .ffunc_nsse, name, op | .ffunc_1 name | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback @@ -1352,14 +1338,6 @@ static void build_subroutines(BuildCtx *ctx) | .ffunc_nsse name, movsd |.endmacro | - |.macro .ffunc_nn, name - | .ffunc_2 name - | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback - | cmp dword [BASE+12], LJ_TISNUM; jae ->fff_fallback - | fld qword [BASE] - | fld qword [BASE+8] - |.endmacro - | |.macro .ffunc_nnsse, name | .ffunc_2 name | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback @@ -2029,6 +2007,12 @@ static void build_subroutines(BuildCtx *ctx) | mov RAa, -8 // Results start at BASE+RA = BASE-8. | jmp ->vm_return | + |.if X64 + |.define fff_resfp, fff_resxmm0 + |.else + |.define fff_resfp, fff_resn + |.endif + | |.macro math_round, func | .ffunc math_ .. func |.if DUALNUM @@ -2061,22 +2045,14 @@ static void build_subroutines(BuildCtx *ctx) |.ffunc math_log | cmp NARGS:RD, 1+1; jne ->fff_fallback // Exactly one argument. | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback - | fldln2; fld qword [BASE]; fyl2x; jmp ->fff_resn - | - |.ffunc_n math_log10, fldlg2; fyl2x; jmp ->fff_resn - |.ffunc_n math_exp; call ->vm_exp_x87; jmp ->fff_resn - | - |.ffunc_n math_sin; fsin; jmp ->fff_resn - |.ffunc_n math_cos; fcos; jmp ->fff_resn - |.ffunc_n math_tan; fptan; fpop; jmp ->fff_resn - | - |.ffunc_n math_asin - | fdup; fmul st0; fld1; fsubrp st1; fsqrt; fpatan - | jmp ->fff_resn - |.ffunc_n math_acos - | fdup; fmul st0; fld1; fsubrp st1; fsqrt; fxch; fpatan - | jmp ->fff_resn - |.ffunc_n math_atan; fld1; fpatan; jmp ->fff_resn + | movsd xmm0, qword [BASE] + |.if not X64 + | movsd FPARG1, xmm0 + |.endif + | mov RB, BASE + | call extern log + | mov BASE, RB + | jmp ->fff_resfp | |.macro math_extern, func | .ffunc_nsse math_ .. func @@ -2086,18 +2062,36 @@ static void build_subroutines(BuildCtx *ctx) | mov RB, BASE | call extern func | mov BASE, RB - |.if X64 - | jmp ->fff_resxmm0 - |.else - | jmp ->fff_resn - |.endif + | jmp ->fff_resfp |.endmacro | + |.macro math_extern2, func + | .ffunc_nnsse math_ .. func + |.if not X64 + | movsd FPARG1, xmm0 + | movsd FPARG3, xmm1 + |.endif + | mov RB, BASE + | call extern func + | mov BASE, RB + | jmp ->fff_resfp + |.endmacro + | + | math_extern log10 + | math_extern exp + | math_extern sin + | math_extern cos + | math_extern tan + | math_extern asin + | math_extern acos + | math_extern atan | math_extern sinh | math_extern cosh | math_extern tanh + | math_extern2 pow + | math_extern2 atan2 + | math_extern2 fmod | - |.ffunc_nn math_atan2; fpatan; jmp ->fff_resn |.ffunc_nnr math_ldexp; fscale; fpop1; jmp ->fff_resn | |.ffunc_1 math_frexp @@ -2151,13 +2145,6 @@ static void build_subroutines(BuildCtx *ctx) |4: | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0. | - |.ffunc_nnr math_fmod - |1: ; fprem; fnstsw ax; and ax, 0x400; jnz <1 - | fpop1 - | jmp ->fff_resn - | - |.ffunc_nnsse math_pow; call ->vm_pow_sse; jmp ->fff_resxmm0 - | |.macro math_minmax, name, cmovop, sseop | .ffunc name | mov RA, 2 @@ -2899,7 +2886,16 @@ static void build_subroutines(BuildCtx *ctx) | |// FP value rounding. Called by math.floor/math.ceil fast functions |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified. - |.macro vm_round, name, mode + |.macro vm_round, name, mode, cond + |->name: + |.if not X64 and cond + | movsd xmm0, qword [esp+4] + | call ->name .. _sse + | movsd qword [esp+4], xmm0 // Overwrite callee-owned arg. + | fld qword [esp+4] + | ret + |.endif + | |->name .. _sse: | sseconst_abs xmm2, RDa | sseconst_2p52 xmm3, RDa @@ -2936,18 +2932,9 @@ static void build_subroutines(BuildCtx *ctx) | ret |.endmacro | - |->vm_floor: - |.if not X64 - | movsd xmm0, qword [esp+4] - | call ->vm_floor_sse - | movsd qword [esp+4], xmm0 // Overwrite callee-owned arg. - | fld qword [esp+4] - | ret - |.endif - | - | vm_round vm_floor, 0 - | vm_round vm_ceil, 1 - | vm_round vm_trunc, 2 + | vm_round vm_floor, 0, 1 + | vm_round vm_ceil, 1, JIT + | vm_round vm_trunc, 2, JIT | |// FP modulo x%y. Called by BC_MOD* and vm_arith. |->vm_mod: @@ -2979,65 +2966,6 @@ static void build_subroutines(BuildCtx *ctx) | subsd xmm0, xmm1 | ret | - |// FP log2(x). Called by math.log(x, base). - |->vm_log2: - |.if X64WIN - | movsd qword [rsp+8], xmm0 // Use scratch area. - | fld1 - | fld qword [rsp+8] - | fyl2x - | fstp qword [rsp+8] - | movsd xmm0, qword [rsp+8] - |.elif X64 - | movsd qword [rsp-8], xmm0 // Use red zone. - | fld1 - | fld qword [rsp-8] - | fyl2x - | fstp qword [rsp-8] - | movsd xmm0, qword [rsp-8] - |.else - | fld1 - | fld qword [esp+4] - | fyl2x - |.endif - | ret - | - |// FP exponentiation e^x and 2^x. Called by math.exp fast function and - |// from JIT code. Arg/ret on x87 stack. No int/xmm regs modified. - |// Caveat: needs 3 slots on x87 stack! - |->vm_exp_x87: - | fldl2e; fmulp st1 // e^x ==> 2^(x*log2(e)) - |->vm_exp2_x87: - | .if X64WIN - | .define expscratch, dword [rsp+8] // Use scratch area. - | .elif X64 - | .define expscratch, dword [rsp-8] // Use red zone. - | .else - | .define expscratch, dword [esp+4] // Needs 4 byte scratch area. - | .endif - | fst expscratch // Caveat: overwrites ARG1. - | cmp expscratch, 0x7f800000; je >1 // Special case: e^+Inf = +Inf - | cmp expscratch, 0xff800000; je >2 // Special case: e^-Inf = 0 - |->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check. - | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part. - | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int - |1: - | ret - |2: - | fpop; fldz; ret - | - |// Generic power function x^y. Called by BC_POW, math.pow fast function, - |// and vm_arith. - |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified. - |// Needs 16 byte scratch area for x86. Also called from JIT code. - |->vm_pow_sse: - | cvttsd2si eax, xmm1 - | cvtsi2sd xmm2, eax - | ucomisd xmm1, xmm2 - | jnz >8 // Branch for FP exponents. - | jp >9 // Branch for NaN exponent. - | // Fallthrough. - | |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified. |->vm_powi_sse: | cmp eax, 1; jle >6 // i<=1? @@ -3073,246 +3001,6 @@ static void build_subroutines(BuildCtx *ctx) | sseconst_1 xmm0, RDa | ret | - |8: // FP/FP power function x^y. - |.if X64 - | movd rax, xmm1; shl rax, 1 - | rol rax, 12; cmp rax, 0xffe; je >2 // x^+-Inf? - | movd rax, xmm0; shl rax, 1; je >4 // +-0^y? - | rol rax, 12; cmp rax, 0xffe; je >5 // +-Inf^y? - | .if X64WIN - | movsd qword [rsp+16], xmm1 // Use scratch area. - | movsd qword [rsp+8], xmm0 - | fld qword [rsp+16] - | fld qword [rsp+8] - | .else - | movsd qword [rsp-16], xmm1 // Use red zone. - | movsd qword [rsp-8], xmm0 - | fld qword [rsp-16] - | fld qword [rsp-8] - | .endif - |.else - | movsd qword [esp+12], xmm1 // Needs 16 byte scratch area. - | movsd qword [esp+4], xmm0 - | cmp dword [esp+12], 0; jne >1 - | mov eax, [esp+16]; shl eax, 1 - | cmp eax, 0xffe00000; je >2 // x^+-Inf? - |1: - | cmp dword [esp+4], 0; jne >1 - | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y? - | cmp eax, 0xffe00000; je >5 // +-Inf^y? - |1: - | fld qword [esp+12] - | fld qword [esp+4] - |.endif - | fyl2x // y*log2(x) - | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part. - | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int - |.if X64WIN - | fstp qword [rsp+8] // Use scratch area. - | movsd xmm0, qword [rsp+8] - |.elif X64 - | fstp qword [rsp-8] // Use red zone. - | movsd xmm0, qword [rsp-8] - |.else - | fstp qword [esp+4] // Needs 8 byte scratch area. - | movsd xmm0, qword [esp+4] - |.endif - | ret - | - |9: // Handle x^NaN. - | sseconst_1 xmm2, RDa - | ucomisd xmm0, xmm2; je >1 // 1^NaN ==> 1 - | movaps xmm0, xmm1 // x^NaN ==> NaN - |1: - | ret - | - |2: // Handle x^+-Inf. - | sseconst_abs xmm2, RDa - | andpd xmm0, xmm2 // |x| - | sseconst_1 xmm2, RDa - | ucomisd xmm0, xmm2; je <1 // +-1^+-Inf ==> 1 - | movmskpd eax, xmm1 - | xorps xmm0, xmm0 - | mov ah, al; setc al; xor al, ah; jne <1 // |x|<>1, x^+-Inf ==> +Inf/0 - |3: - | sseconst_hi xmm0, RDa, 7ff00000 // +Inf - | ret - | - |4: // Handle +-0^y. - | movmskpd eax, xmm1; test eax, eax; jnz <3 // y < 0, +-0^y ==> +Inf - | xorps xmm0, xmm0 // y >= 0, +-0^y ==> 0 - | ret - | - |5: // Handle +-Inf^y. - | movmskpd eax, xmm1; test eax, eax; jz <3 // y >= 0, +-Inf^y ==> +Inf - | xorps xmm0, xmm0 // y < 0, +-Inf^y ==> 0 - | ret - | - |// Callable from C: double lj_vm_foldfpm(double x, int fpm) - |// Computes fpm(x) for extended math functions. ORDER FPM. - |->vm_foldfpm: - |.if JIT - |.if X64 - | .if X64WIN - | .define fpmop, CARG2d - | .else - | .define fpmop, CARG1d - | .endif - | cmp fpmop, 1; jb ->vm_floor_sse; je ->vm_ceil_sse - | cmp fpmop, 3; jb ->vm_trunc_sse; ja >2 - | sqrtsd xmm0, xmm0; ret - |2: - | .if X64WIN - | movsd qword [rsp+8], xmm0 // Use scratch area. - | fld qword [rsp+8] - | .else - | movsd qword [rsp-8], xmm0 // Use red zone. - | fld qword [rsp-8] - | .endif - | cmp fpmop, 5; ja >2 - | .if X64WIN; pop rax; .endif - | je >1 - | call ->vm_exp_x87 - | .if X64WIN; push rax; .endif - | jmp >7 - |1: - | call ->vm_exp2_x87 - | .if X64WIN; push rax; .endif - | jmp >7 - |2: ; cmp fpmop, 7; je >1; ja >2 - | fldln2; fxch; fyl2x; jmp >7 - |1: ; fld1; fxch; fyl2x; jmp >7 - |2: ; cmp fpmop, 9; je >1; ja >2 - | fldlg2; fxch; fyl2x; jmp >7 - |1: ; fsin; jmp >7 - |2: ; cmp fpmop, 11; je >1; ja >9 - | fcos; jmp >7 - |1: ; fptan; fpop - |7: - | .if X64WIN - | fstp qword [rsp+8] // Use scratch area. - | movsd xmm0, qword [rsp+8] - | .else - | fstp qword [rsp-8] // Use red zone. - | movsd xmm0, qword [rsp-8] - | .endif - | ret - |.else // x86 calling convention. - | .define fpmop, eax - | mov fpmop, [esp+12] - | movsd xmm0, qword [esp+4] - | cmp fpmop, 1; je >1; ja >2 - | call ->vm_floor_sse; jmp >7 - |1: ; call ->vm_ceil_sse; jmp >7 - |2: ; cmp fpmop, 3; je >1; ja >2 - | call ->vm_trunc_sse; jmp >7 - |1: - | sqrtsd xmm0, xmm0 - |7: - | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. - | fld qword [esp+4] - | ret - |2: ; fld qword [esp+4] - | cmp fpmop, 5; jb ->vm_exp_x87; je ->vm_exp2_x87 - |2: ; cmp fpmop, 7; je >1; ja >2 - | fldln2; fxch; fyl2x; ret - |1: ; fld1; fxch; fyl2x; ret - |2: ; cmp fpmop, 9; je >1; ja >2 - | fldlg2; fxch; fyl2x; ret - |1: ; fsin; ret - |2: ; cmp fpmop, 11; je >1; ja >9 - | fcos; ret - |1: ; fptan; fpop; ret - |.endif - |9: ; int3 // Bad fpm. - |.endif - | - |// Callable from C: double lj_vm_foldarith(double x, double y, int op) - |// Compute x op y for basic arithmetic operators (+ - * / % ^ and unary -) - |// and basic math functions. ORDER ARITH - |->vm_foldarith: - |.if X64 - | - | .if X64WIN - | .define foldop, CARG3d - | .else - | .define foldop, CARG1d - | .endif - | cmp foldop, 1; je >1; ja >2 - | addsd xmm0, xmm1; ret - |1: ; subsd xmm0, xmm1; ret - |2: ; cmp foldop, 3; je >1; ja >2 - | mulsd xmm0, xmm1; ret - |1: ; divsd xmm0, xmm1; ret - |2: ; cmp foldop, 5; jb ->vm_mod; je ->vm_pow_sse - | cmp foldop, 7; je >1; ja >2 - | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; ret - |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; ret - |2: ; cmp foldop, 9; ja >2 - |.if X64WIN - | movsd qword [rsp+8], xmm0 // Use scratch area. - | movsd qword [rsp+16], xmm1 - | fld qword [rsp+8] - | fld qword [rsp+16] - |.else - | movsd qword [rsp-8], xmm0 // Use red zone. - | movsd qword [rsp-16], xmm1 - | fld qword [rsp-8] - | fld qword [rsp-16] - |.endif - | je >1 - | fpatan - |7: - |.if X64WIN - | fstp qword [rsp+8] // Use scratch area. - | movsd xmm0, qword [rsp+8] - |.else - | fstp qword [rsp-8] // Use red zone. - | movsd xmm0, qword [rsp-8] - |.endif - | ret - |1: ; fxch; fscale; fpop1; jmp <7 - |2: ; cmp foldop, 11; je >1; ja >9 - | minsd xmm0, xmm1; ret - |1: ; maxsd xmm0, xmm1; ret - |9: ; int3 // Bad op. - | - |.else // x86 calling convention. - | - | .define foldop, eax - | mov foldop, [esp+20] - | movsd xmm0, qword [esp+4] - | movsd xmm1, qword [esp+12] - | cmp foldop, 1; je >1; ja >2 - | addsd xmm0, xmm1 - |7: - | movsd qword [esp+4], xmm0 // Overwrite callee-owned args. - | fld qword [esp+4] - | ret - |1: ; subsd xmm0, xmm1; jmp <7 - |2: ; cmp foldop, 3; je >1; ja >2 - | mulsd xmm0, xmm1; jmp <7 - |1: ; divsd xmm0, xmm1; jmp <7 - |2: ; cmp foldop, 5 - | je >1; ja >2 - | call ->vm_mod; jmp <7 - |1: ; pop edx; call ->vm_pow_sse; push edx; jmp <7 // Writes to scratch area. - |2: ; cmp foldop, 7; je >1; ja >2 - | sseconst_sign xmm1, RDa; xorps xmm0, xmm1; jmp <7 - |1: ; sseconst_abs xmm1, RDa; andps xmm0, xmm1; jmp <7 - |2: ; cmp foldop, 9; ja >2 - | fld qword [esp+4] // Reload from stack - | fld qword [esp+12] - | je >1 - | fpatan; ret - |1: ; fxch; fscale; fpop1; ret - |2: ; cmp foldop, 11; je >1; ja >9 - | minsd xmm0, xmm1; jmp <7 - |1: ; maxsd xmm0, xmm1; jmp <7 - |9: ; int3 // Bad op. - | - |.endif - | |//----------------------------------------------------------------------- |//-- Miscellaneous functions -------------------------------------------- |//----------------------------------------------------------------------- @@ -4107,8 +3795,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) break; case BC_POW: | ins_arithpre movsd, xmm1 - | call ->vm_pow_sse + | mov RB, BASE + |.if not X64 + | movsd FPARG1, xmm0 + | movsd FPARG3, xmm1 + |.endif + | call extern pow + | movzx RA, PC_RA + | mov BASE, RB + |.if X64 | ins_arithpost + |.else + | fstp qword [BASE+RA*8] + |.endif | ins_next break;