diff --git a/src/lj_asm.c b/src/lj_asm.c
index 0b6ebc09..f38ceaef 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -1292,21 +1292,52 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
 {
   RegSet allow = RSET_ALL;
   uint32_t n, nargs = CCI_NARGS(ci);
-  int32_t ofs = 0;
+  int32_t ofs = STACKARG_OFS;
+  uint32_t gprs = REGARG_GPRS;
+#if LJ_64
+  Reg fpr = REGARG_FIRSTFPR;
+#endif
   lua_assert(!(nargs > 2 && (ci->flags&CCI_FASTCALL)));  /* Avoid stack adj. */
   emit_call(as, ci->func);
   for (n = 0; n < nargs; n++) {  /* Setup args. */
-#if LJ_64
-#error "NYI: 64 bit mode call argument setup"
-#endif
     IRIns *ir = IR(args[n]);
+    Reg r;
+#if LJ_64 && defined(_WIN64)
+    /* Windows/x64 argument registers are strictly positional. */
+    r = irt_isnum(ir->t) ? (fpr <= REGARG_LASTFPR ? fpr : 0) : (gprs & 31);
+    fpr++; gprs >>= 5;
+#elif LJ_64
+    /* POSIX/x64 argument registers are used in order of appearance. */
     if (irt_isnum(ir->t)) {
-      if ((ofs & 4) && irref_isk(args[n])) {
+      r = fpr <= REGARG_LASTFPR ? fpr : 0; fpr++;
+    } else {
+      r = gprs & 31; gprs >>= 5;
+    }
+#else
+    if (irt_isnum(ir->t) || !(ci->flags & CCI_FASTCALL)) {
+      r = 0;
+    } else {
+      r = gprs & 31; gprs >>= 5;
+    }
+#endif
+    if (r) {  /* Argument is in a register. */
+      if (args[n] < ASMREF_TMP1) {
+        emit_loadi(as, r, ir->i);
+      } else {
+        lua_assert(rset_test(as->freeset, r));  /* Must have been evicted. */
+        if (ra_hasreg(ir->r)) {
+          ra_noweak(as, ir->r);
+          ra_movrr(as, ir, r, ir->r);
+        } else {
+          ra_allocref(as, args[n], RID2RSET(r));
+        }
+      }
+    } else if (irt_isnum(ir->t)) {  /* FP argument is on stack. */
+      if (!LJ_64 && (ofs & 4) && irref_isk(args[n])) {
         /* Split stores for unaligned FP consts. */
         emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
         emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
       } else {
-        Reg r;
         if ((allow & RSET_FPR) == RSET_EMPTY)
           lj_trace_err(as->J, LJ_TRERR_NYICOAL);
         r = ra_alloc1(as, args[n], allow & RSET_FPR);
@@ -1314,34 +1345,18 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
         emit_rmro(as, XO_MOVSDto, r, RID_ESP, ofs);
       }
       ofs += 8;
-    } else {
-      if ((ci->flags & CCI_FASTCALL) && n < 2) {
-        Reg r = n == 0 ? RID_ECX : RID_EDX;
-        if (args[n] < ASMREF_TMP1) {
-          emit_loadi(as, r, ir->i);
-        } else {
-          lua_assert(rset_test(as->freeset, r));  /* Must have been evicted. */
-          allow &= ~RID2RSET(r);
-          if (ra_hasreg(ir->r)) {
-            ra_noweak(as, ir->r);
-            ra_movrr(as, ir, r, ir->r);
-          } else {
-            ra_allocref(as, args[n], RID2RSET(r));
-          }
-        }
+    } else {  /* Non-FP argument is on stack. */
+      /* NYI: no widening for 64 bit parameters on x64. */
+      if (args[n] < ASMREF_TMP1) {
+        emit_movmroi(as, RID_ESP, ofs, ir->i);
       } else {
-        if (args[n] < ASMREF_TMP1) {
-          emit_movmroi(as, RID_ESP, ofs, ir->i);
-        } else {
-          Reg r;
-          if ((allow & RSET_GPR) == RSET_EMPTY)
-            lj_trace_err(as->J, LJ_TRERR_NYICOAL);
-          r = ra_alloc1(as, args[n], allow & RSET_GPR);
-          allow &= ~RID2RSET(r);
-          emit_movtomro(as, r, RID_ESP, ofs);
-        }
-        ofs += 4;
+        if ((allow & RSET_GPR) == RSET_EMPTY)
+          lj_trace_err(as->J, LJ_TRERR_NYICOAL);
+        r = ra_alloc1(as, args[n], allow & RSET_GPR);
+        allow &= ~RID2RSET(r);
+        emit_movtomro(as, REX_64LU(ir, r), RID_ESP, ofs);
       }
+      ofs += sizeof(intptr_t);
     }
   }
 }
@@ -2561,7 +2576,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
       asm_guardcc(as, cc);
       if (usetest && left != RID_MRM) {
         /* Use test r,r instead of cmp r,0. */
-        emit_rr(as, XO_TEST, left, left);
+        emit_rr(as, XO_TEST, REX_64LU(ir, left), left);
         if (irl+1 == ir)  /* Referencing previous ins? */
           as->testmcp = as->mcp;  /* Set flag to drop test r,r if possible. */
       } else {
@@ -2580,11 +2595,7 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
       Reg left = ra_alloc1(as, lref, RSET_GPR);
       Reg right = asm_fuseload(as, rref, rset_exclude(RSET_GPR, left));
       asm_guardcc(as, cc);
-#if LJ_64
-      if (irt_islightud(ir->t))
-        left |= REX_64;
-#endif
-      emit_mrm(as, XO_CMP, left, right);
+      emit_mrm(as, XO_CMP, REX_64LU(ir, left), right);
     }
   }
 }
@@ -2732,14 +2743,14 @@ static void asm_gc_check(ASMState *as, SnapShot *snap)
   /* We don't know spadj yet, so get the C frame from L->cframe. */
   emit_movmroi(as, tmp, CFRAME_OFS_PC,
               (int32_t)as->T->snapmap[snap->mapofs+snap->nent]);
-  emit_gri(as, XG_ARITHi(XOg_AND), tmp, CFRAME_RAWMASK);
+  emit_gri(as, XG_ARITHi(XOg_AND), tmp|REX_64, CFRAME_RAWMASK);
   lstate = IR(ASMREF_L)->r;
-  emit_rmro(as, XO_MOV, tmp, lstate, offsetof(lua_State, cframe));
+  emit_rmro(as, XO_MOV, tmp|REX_64, lstate, offsetof(lua_State, cframe));
   /* It's ok if lstate is already in a non-scratch reg. But all allocations
   ** in the non-fast path must use a scratch reg. See comment above. */
   base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_SCRATCH & RSET_GPR, lstate));
-  emit_movtomro(as, base, lstate, offsetof(lua_State, base));
+  emit_movtomro(as, base|REX_64, lstate, offsetof(lua_State, base));
   asm_gc_sync(as, snap, base);
   /* BASE/L get restored anyway, better do it inside the slow path. */
   if (as->parent || as->curins == as->loopref)
     ra_restore(as, REF_BASE);
@@ -3447,7 +3458,12 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
     case IR_CALLN: case IR_CALLL: case IR_CALLS: {
       const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
 #if LJ_64
-      /* NYI: add stack slots for calls with more than 4/6 args. */
+      /* NYI: add stack slots for x64 calls with many args. */
+#ifdef _WIN64
+      lua_assert(CCI_NARGS(ci) <= 4);
+#else
+      lua_assert(CCI_NARGS(ci) <= 6);  /* Safe lower bound. */
+#endif
       ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
 #else
       /* NYI: not fastcall-aware, but doesn't matter (yet). */
diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h
index 8e9a8788..83eba0ec 100644
--- a/src/lj_target_x86.h
+++ b/src/lj_target_x86.h
@@ -78,14 +78,27 @@ enum {
 /* Windows x64 ABI. */
 #define RSET_SCRATCH \
   (RSET_ACD|RSET_RANGE(RID_R8D, RID_R11D+1)|RSET_RANGE(RID_XMM0, RID_XMM5+1))
+#define REGARG_GPRS \
+  (RID_ECX|((RID_EDX|((RID_R8D|(RID_R9D<<5))<<5))<<5))
+#define REGARG_FIRSTFPR	RID_XMM0
+#define REGARG_LASTFPR	RID_XMM3
+#define STACKARG_OFS	(4*8)
 #else
 /* The rest of the civilized x64 world has a common ABI. */
 #define RSET_SCRATCH \
   (RSET_ACD|RSET_RANGE(RID_ESI, RID_R11D+1)|RSET_FPR)
+#define REGARG_GPRS \
+  (RID_EDI|((RID_ESI|((RID_EDX|((RID_ECX|((RID_R8D|(RID_R9D \
+   <<5))<<5))<<5))<<5))<<5))
+#define REGARG_FIRSTFPR	RID_XMM0
+#define REGARG_LASTFPR	RID_XMM7
+#define STACKARG_OFS	0
 #endif
 #else
 /* Common x86 ABI. */
 #define RSET_SCRATCH	(RSET_ACD|RSET_FPR)
+#define REGARG_GPRS	(RID_ECX|(RID_EDX<<5))	/* Fastcall only. */
+#define STACKARG_OFS	0
 #endif
 
 #if LJ_64
@@ -96,23 +109,26 @@ enum {
 
 /* -- Spill slots --------------------------------------------------------- */
 
-/* Available fixed spill slots in interpreter frame.
+/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs.
+**
+** SPS_FIXED: Available fixed spill slots in interpreter frame.
 ** This definition must match with the *.dasc file(s).
+**
+** SPS_FIRST: First spill slot for general use. Reserve min. two 32 bit slots.
 */
 #if LJ_64
 #ifdef _WIN64
 #define SPS_FIXED	(5*2)
+#define SPS_FIRST	(4*2)	/* Don't use callee register save area. */
 #else
 #define SPS_FIXED	2
+#define SPS_FIRST	2
 #endif
 #else
 #define SPS_FIXED	6
+#define SPS_FIRST	2
 #endif
 
-/* First spill slot for general use. Reserve one 64 bit slot. */
-#define SPS_FIRST	2
-
-/* Spill slots are 32 bit wide. An even/odd pair is used for FPRs. */
 #define sps_scale(slot)		(4 * (int32_t)(slot))
 
 /* -- Exit state ---------------------------------------------------------- */
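
A note on the REGARG_GPRS encoding (not part of the patch): it packs the ABI's GPR argument order into a uint32_t at 5 bits per register ID, so asm_gencall() can pop the next register with "r = gprs & 31; gprs >>= 5". A result of 0 (RID_EAX, which is never an argument register in any of these conventions) doubles as the "pass this argument on the stack" sentinel. Below is a minimal standalone C sketch of the POSIX/x64 walk; the RID_* values are stand-ins mirroring the standard x86 register numbers, not the actual enum from lj_target_x86.h.

#include <stdio.h>
#include <stdint.h>

/* Stand-ins for the RID_* enum values (the standard x86 register numbers). */
enum { RID_ECX = 1, RID_EDX = 2, RID_ESI = 6, RID_EDI = 7,
       RID_R8D = 8, RID_R9D = 9 };

/* POSIX/x64 GPR argument order, 5 bits per register ID, as in the patch. */
#define REGARG_GPRS \
  (RID_EDI|((RID_ESI|((RID_EDX|((RID_ECX|((RID_R8D|(RID_R9D \
   <<5))<<5))<<5))<<5))<<5))

int main(void)
{
  uint32_t gprs = REGARG_GPRS;
  int n;
  for (n = 0; n < 8; n++) {  /* Same pop/shift pattern as asm_gencall(). */
    unsigned r = gprs & 31;  /* Next argument GPR; 0 once the list is empty. */
    gprs >>= 5;
    printf("arg %d -> RID %u\n", n, r);  /* Prints 7 6 2 1 8 9, then 0 0. */
  }
  return 0;
}

Relatedly, STACKARG_OFS being (4*8) on Windows/x64 matches the 32-byte shadow space the caller must reserve for the four register arguments, so the first true stack argument starts at offset 32. REX_64LU(ir, r) is presumably defined elsewhere in this series as the macro form of the open-coded pattern removed from asm_comp_() (left |= REX_64 for lightuserdata-typed operands), widening the operand to 64 bit where required.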