From f7331e946487dd7b9c02c9db0a2ff8b4a3aa18d3 Mon Sep 17 00:00:00 2001 From: fsfod Date: Tue, 29 Mar 2016 11:14:43 +0100 Subject: [PATCH] Added JIT support for intrinsics. Support for vector registers is NYI. --- src/lj_asm.c | 72 +++++++ src/lj_asm_x86.h | 419 +++++++++++++++++++++++++++++++++++++-- src/lj_crecord.c | 149 +++++++++++++- src/lj_intrinsic.c | 12 +- src/lj_intrinsic.h | 4 + src/lj_ir.h | 3 + src/lj_opt_fold.c | 2 + src/lj_traceerr.h | 1 + tests/intrinsic_spec.lua | 9 + tests/runtests.lua | 20 +- 10 files changed, 655 insertions(+), 36 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index e9d4ed8f..93f2eb2d 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -1305,6 +1305,58 @@ static uint32_t asm_callx_flags(ASMState *as, IRIns *ir) return (nargs | (ir->t.irt << CCI_OTSHIFT)); } +#if LJ_HASINTRINSICS +static RegSet asm_intrinsichints(ASMState *as, IRIns *ir) +{ + CTState *cts = ctype_ctsG(J2G(as->J)); + CIntrinsic* intrins = lj_intrinsic_get(cts, ir->op2); + RegSet mod = intrin_getmodrset(cts, intrins); + IRIns *ira = IR(ir->op1), *irval; + int i; + int dynreg = intrin_regmode(intrins); + + /* Propagate the fixed registers of the arguments to refs passed in for them */ + for (i = intrins->insz-1; i >= 0; i--) { + Reg r = reg_rid(intrins->in[i]); + + if (dynreg && i < intrins->dyninsz) { + /* Dynamic register so no hint needed */ + ira = IR(ira->op1); + continue; + } + + rset_set(mod, r); + + if (!irref_isk(ira->op2)) { + irval = IR(ira->op2); + + /* Back propagate the register to the arguments value if it has no register set */ + if (irval->prev == REGSP_INIT) { + irval->prev = REGSP_HINT(r); + } + } + + ira = IR(ira->op1); + } + + if (intrins->outsz > 0) { + i = intrin_dynrout(intrins) ? 
1 : 0; + + for (; i < intrins->outsz; i++) { + rset_set(mod, reg_rid(intrins->out[i])); /* Same RegSet helper as the input loop; avoids shifting a plain int */ + } + + if (intrin_dynrout(intrins)) { + ir->prev = REGSP_INIT; + } else { + ir->prev = REGSP_HINT(reg_rid(intrins->out[0])); + } + } + + return mod; +} +#endif + static void asm_callid(ASMState *as, IRIns *ir, IRCallID id) { const CCallInfo *ci = &lj_ir_callinfo[id]; @@ -1742,6 +1794,10 @@ static void asm_ir(ASMState *as, IRIns *ir) case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break; case IR_CALLXS: asm_callx(as, ir); break; case IR_CARG: break; + + case IR_INTRN: asm_intrinsic(as, ir, NULL); break; + case IR_ASMEND: asm_intrinsic(as, IR(ir->op2), ir); break; + case IR_ASMRET: break; default: setintV(&as->J->errinfo, ir->o); @@ -2103,6 +2159,22 @@ static void asm_setup_regsp(ASMState *as) as->modset |= RSET_SCRATCH; continue; } +#if LJ_HASINTRINSICS + case IR_INTRN: { + RegSet mod = asm_intrinsichints(as, ir); /* asm_intrinsichints returns a RegSet, not a single Reg */ + if (inloop) + as->modset |= mod; + continue; + } + + case IR_ASMRET: { + Reg r = reg_rid(ir->op2); + ir->prev = REGSP_HINT(r); + if (inloop) + rset_set(as->modset, r); + continue; + } +#endif case IR_CALLN: case IR_CALLA: case IR_CALLL: case IR_CALLS: { const CCallInfo *ci = &lj_ir_callinfo[ir->op2]; ir->prev = asm_setup_call_slots(as, ir, ci); diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h index af54dc7f..fe3cac68 100644 --- a/src/lj_asm_x86.h +++ b/src/lj_asm_x86.h @@ -397,6 +397,31 @@ static Reg asm_fuseloadk64(ASMState *as, IRIns *ir) return RID_MRM; } +static int asm_fusexload(ASMState *as, IRRef ref, RegSet xallow, IRRef skip) +{ + IRIns *ir = IR(ref); + IRRef i = as->curins; + lua_assert(ir->o == IR_XLOAD); + + /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp). + ** Fusing unaligned memory operands is ok on x86 (except for SIMD types). + */ + if (irt_typerange(ir->t, IRT_I8, IRT_U16)) { + return 0; + } + if (i > ref + CONFLICT_SEARCH_LIM) + return 0; /* Give up, ref is too far away. 
*/ + ir = as->ir; + while (--i > ref) { + if (ir[i].o == IR_XSTORE) + return 0; /* Conflict found. */ + else if ((ir[i].op1 == ref || ir[i].op2 == ref) && i != skip) + return 0; + } + asm_fusexref(as, IR(ref)->op1, xallow); + return 1; +} + /* Fuse load into memory operand. ** ** Important caveat: this may emit RIP-relative loads! So don't place any @@ -467,12 +492,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow) return RID_MRM; } } else if (ir->o == IR_XLOAD) { - /* Generic fusion is not ok for 8/16 bit operands (but see asm_comp). - ** Fusing unaligned memory operands is ok on x86 (except for SIMD types). - */ - if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) && - noconflict(as, ref, IR_XSTORE, 0)) { - asm_fusexref(as, ir->op1, xallow); + if (asm_fusexload(as, ref, xallow, REF_NIL)) { return RID_MRM; } } else if (ir->o == IR_VLOAD && !(LJ_GC64 && irt_isaddr(ir->t))) { @@ -642,6 +662,366 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) #endif } +#if LJ_HASINTRINSICS + +typedef struct IntrinsInfo { + CIntrinsic *intrins; + IRIns *asmend; + IRRef args[LJ_INTRINS_MAXREG]; + /* input register list that gets mutated for opcode intrinsics */ + uint8_t inregs[LJ_INTRINS_MAXREG]; + RegSet inset, outset, modset; + /* First CARG ref used as limit for duplicate load checking when fusing */ + IRRef a1; +} IntrinsInfo; + +static int asm_swaprefs(ASMState *as, IRIns *ir, IRRef lref, IRRef rref); + +static void asm_asmsetupargs(ASMState *as, IntrinsInfo *ininfo) +{ + MSize n; + CIntrinsic *intrins = ininfo->intrins; + + /* move or load args into input registers */ + for (n = 0; n < intrins->insz; n++) { + IRRef ref = ininfo->args[n]; + IRIns *ir = IR(ref); + Reg r = reg_rid(ininfo->inregs[n]); + + /* Skip any dynamic registers already setup by opcode intrinsics */ + if (ininfo->inregs[n] == 0xff) { + lua_assert(n < intrins->dyninsz); + continue; + } + + if (!ra_hasreg(ir->r) && r < RID_MAX_GPR && ref < ASMREF_TMP1) { +#if LJ_64 + if 
(ir->o == IR_KINT64) + emit_loadu64(as, r, ir_kint64(ir)->u64); + else +#endif + emit_loadi(as, r, ir->i); + } else { + /* if we have a fixed register it must of been evicted earlier */ + lua_assert(rset_test(as->freeset, r) || regcost_ref(as->cost[r]) == ref || + n < intrins->dyninsz); + + if (ra_hasreg(ir->r)) { + ra_noweak(as, ir->r); + if (r != ir->r) { + lua_assert(n >= intrins->dyninsz); + emit_movrr(as, ir, r, ir->r); + } + } else { + /* Dynamic registers should never end up here */ + lua_assert(!intrin_regmode(intrins) || n >= intrins->dyninsz); + ra_allocref(as, ref, RID2RSET(r)); + } + } + checkmclim(as); + } +} + +static void asm_intrin_opcode(ASMState *as, IRIns *ir, IntrinsInfo *ininfo) +{ + CIntrinsic *intrins = ininfo->intrins; + IRRef *args = ininfo->args; + uint8_t *in = ininfo->inregs; + uint32_t dynreg = intrin_regmode(intrins); + RegSet allow; + IRRef lref = 0, rref = 0; + Reg right, dest = RID_NONE; + int dynrout = intrins->outsz > 0 && intrin_dynrout(intrins); + + /* Swap to refs to native ordering */ + if (dynreg >= DYNREG_SWAPREGS) { + IRRef temp = ininfo->args[0]; + ininfo->args[0] = ininfo->args[1]; ininfo->args[1] = temp; + } + + rref = args[0]; + right = IR(rref)->r; + + if (intrins->dyninsz > 1) { + lref = args[1]; + dest = IR(lref)->r; + + if (ra_hasreg(dest)) + in[1] = reg_setrid(in[1], dest); + } + + as->mrm.idx = as->mrm.base = RID_NONE; + as->mrm.scale = as->mrm.ofs = 0; + + /* Allocate the dynamic output register if we have one */ + if (dynrout) { + allow = reg_torset(intrins->out[0]); + if (ra_hasreg(right)) { + rset_clear(allow, right); + ra_noweak(as, right); + } + dest = ra_dest(as, ir, allow); + if (dynreg == DYNREG_OPEXT) { + /* Set input register the same as the output since the op is destructive */ + right = dest; + } + } + + if (intrins->dyninsz > 1 && dynreg != DYNREG_TWOSTORE) { + if (lref == rref) { + if (dynreg == DYNREG_INOUT) + right = dest; + /* Only load/move the value to register once. 
+ ** ra_left will do the move for INOUT. + */ + in[0] = 0xff; + } else if (ra_noreg(right)) { + if (intrin_iscomm(intrins) && asm_swaprefs(as, ir, lref, rref)) { + IRRef tmp = lref; lref = rref; rref = tmp; + /* Must be same register kinds and RID register type ranges */ + lua_assert(reg_isgpr(in[0]) == reg_isgpr(in[1]) && + reg_kind(in[0]) == reg_kind(in[1])); + + args[0] = rref; + args[1] = lref; + /* lref(now swapped to rref) may already have a register set so update + ** the right register to it in case we don't fuse a load. + */ + right = IR(rref)->r; + } + if (!(intrins->flags & INTRINSFLAG_NOFUSE) && !ra_hasreg(right)) { + RegSet rallow = reg_torset(in[0]); + if (ra_hasreg(dest)) rset_clear(rallow, dest); /* dest may still be unallocated here */ + /* Handle XLOAD directly so we can tell noconflict to skip our IR_CARG + ** that holds the ref of the load we're fusing. + */ + if (IR(rref)->o == IR_XLOAD) { + if (mayfuse(as, rref) && asm_fusexload(as, rref, rallow, ininfo->a1)) { + right = RID_MRM; + } + } else { + right = asm_fuseload(as, rref, rallow); + } + } + } + } else if (intrins->flags & INTRINSFLAG_INDIRECT) { + /* force indirect MODRM mode. 
rref should always be a memory address */ + if (ra_noreg(right)) { + allow = RSET_GPR & ~ininfo->inset; + /* If part of the opcode is encoded in ModRM avoid picking a register that + ** will corrupt it */ + if (dynreg == DYNREG_OPEXT) + rset_clear(allow, RID_EBP); + asm_fusexref(as, rref, allow); + } else { + as->mrm.base = IR(rref)->r; + } + right = RID_MRM; + } + + /* Handle second input reg for any two input dynamic in register modes + ** which isn't DYNREG_INOUT + */ + if (intrins->dyninsz > 1 && ra_noreg(dest)) { + Reg r; + allow = reg_torset(in[1]) & ~ininfo->inset; + if (ra_hasreg(right) && right != RID_MRM) + rset_clear(allow, right); + + r = ra_allocref(as, args[1], allow); + in[1] = reg_setrid(in[1], r); + dest = r; + } + + if (right == RID_MRM) { + /* Skip trying to load what we fused into the instruction */ + in[0] = 0xff; + } else { + if (ra_noreg(right)) { + lua_assert(ra_noreg(IR(rref)->r)); + allow = reg_torset(in[0]); + if (ra_hasreg(dest)) rset_clear(allow, dest); /* dest stays RID_NONE for single-input ops without a dynamic output */ + if(dynreg == DYNREG_OPEXT) + rset_clear(allow, RID_EBP); + right = ra_allocref(as, rref, allow); + } + in[0] = reg_setrid(in[0], right); + } + + lua_assert(ra_hasreg(right) && (ra_hasreg(dest) || intrins->dyninsz < 2)); + emit_intrins(as, intrins, right, dest); + + if (dynreg == DYNREG_INOUT) { + lua_assert(lref); + ra_left(as, dest, lref); + /* no need to load the register since ra_left already did */ + in[1] = 0xff; + } else if (dynreg == DYNREG_OPEXT && dynrout) { + /* Handle destructive ONEOPEXT opcodes */ + lua_assert(rref); + ra_left(as, dest, rref); + in[0] = 0xff; + } + + checkmclim(as); +} + +static void asm_intrin_results(ASMState *as, IRIns *ir, CIntrinsic* intrins, IntrinsInfo* ininfo) +{ + IRRef results[LJ_INTRINS_MAXREG]; + RegSet evict = 0, outset = 0, aout = 0; + int32_t i = intrin_regmode(intrins) ? intrins->dyninsz : 0; + int32_t dynout = intrin_dynrout(intrins) ? 1 : 0; + + /* Gather the output register IR instructions */ + if (intrins->outsz > 0) { + IRIns *irret = ininfo->asmend ? 
IR(ininfo->asmend->op1) : ir; + int32_t n; + + for (n = intrins->outsz-1;; n--) { + lua_assert(n >= 0 && (irret->o == IR_ASMRET || irret->o == IR_INTRN)); + results[n] = (IRRef)(irret - as->ir); + + if (ra_used(irret)) { /* NOTE(review): the check below compares an output's register with the input register list; confirm intrins->out[n] was not intended */ + if (n >= dynout && irret->r == reg_rid(ininfo->inregs[n])) { + rset_set(aout, irret->r); + } + } + + if (irret->o == IR_INTRN) { + break; + } + irret = IR(irret->op1); + } + } + + evict = ininfo->modset; + + /* Check what registers need evicting for fixed input registers */ + i = intrin_regmode(intrins) ? intrins->dyninsz : 0; + for (; i < intrins->insz; i++) { + Reg r = reg_rid(intrins->in[i]); + IRIns *arg = IR(ininfo->args[i]); + + ininfo->inset |= RID2RSET(r); + /* Don't evict if the arg was allocated the correct register */ + if (!rset_test(as->freeset, r) && arg->r != r) { + evict |= RID2RSET(r); + } + } + + for (i = dynout; i < intrins->outsz; i++) { + outset |= RID2RSET(reg_rid(intrins->out[i])); + } + ininfo->outset = outset; + /* Don't evict registers that currently have our output values live in them */ + evict &= ~aout; + + /* Evict any values in input and modified registers and any fixed out registers + ** that are unused or didn't get allocated the same register as their fixed one. 
+ */ + ra_evictset(as, evict); + + /* Handle any fixed output registers */ + if (intrins->outsz > dynout) { + int32_t stop = dynout; + for (i = intrins->outsz-1; i >= stop; i--) { + IRIns *irret = IR(results[i]); + Reg r = reg_rid(intrins->out[i]); /* bare register id, as for outset above */ + + if (!ra_used(irret) || (!rset_test(as->freeset, r) && irret->r != r)) { + ra_evictset(as, RID2RSET(r)); + if (!ra_used(irret)) + continue; + } + + ra_destreg(as, irret, r); + } + } +} + +static void asm_intrinsic(ASMState *as, IRIns *ir, IRIns *asmend) +{ + CTState *cts = ctype_ctsG(J2G(as->J)); + CIntrinsic *intrins = lj_intrinsic_get(cts, ir->op2); + IRIns *ira = ir; + uintptr_t target = 0; + uint32_t n = 0; + IntrinsInfo ininfo; + memset(&ininfo, 0, sizeof(IntrinsInfo)); + ininfo.intrins = intrins; + ininfo.modset = intrin_getmodrset(cts, intrins); + ininfo.asmend = asmend; + memcpy(ininfo.inregs, intrins->in, sizeof(ininfo.inregs)); + + if (!intrins->wrapped) { + /* Last CARG in the chain is the wrapper pointer */ + ira = IR(ira->op1); +#if LJ_64 + if (IR(ira->op2)->o == IR_KINT64) { + target = (uintptr_t)ir_k64(IR(ira->op2))->u64; + } +#endif + if (!target) { + target = (uintptr_t)IR(ira->op2)->i; + } + } else { + target = (uintptr_t)intrins->wrapped; + } + + n = intrins->insz; + /* Collect the input register argument refs */ + while (ira->op1 != REF_NIL) { + ira = IR(ira->op1); + lua_assert(ira->o == IR_CARG); + ininfo.args[--n] = ira->op2; + /* Save the ref of our first CARG so we can use it to skip the arg chain + ** when looking for conflicts when fusing an XLOAD. + */ + if (n == 0) + ininfo.a1 = (IRRef)(ira-as->ir); + } + lua_assert(n == 0); + + asm_intrin_results(as, ir, intrins, &ininfo); + + if (intrin_regmode(intrins)) { + asm_intrin_opcode(as, ir, &ininfo); + } else { + Reg r1 = 0; + + if (intrins->flags & INTRINSFLAG_CALLED) { + AsmHeader *hdr = ((AsmHeader*)target)-1; + MCode *p; + target = intrins->flags & INTRINSFLAG_INDIRECT ? 
+ hdr->target : (target+hdr->asmofs); + p = (MCode*)target; + if (LJ_64 && (p-as->mcp) != (int32_t)(p-as->mcp)) { + r1 = ra_scratch(as, RSET_GPR & ~(ininfo.inset | ininfo.outset)); + } + } + emit_intrins(as, intrins, r1, target); + } + + asm_asmsetupargs(as, &ininfo); + if (ininfo.asmend) { + /* Skip over our IR_INTRN since were emitting from the tail */ + as->curins = (IRRef)(ir - as->ir); + } +} +#else +static void asm_intrinsic(ASMState *as, IRIns *ir, IRIns *asmend) +{ + UNUSED(as); UNUSED(ir); +} + +void asm_asmret(ASMState *as, IRIns *ir) +{ + UNUSED(as); UNUSED(ir); +} +#endif + + /* Setup result reg/sp for call. Evict scratch regs. */ static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci) { @@ -1395,7 +1775,7 @@ static void asm_fxload(ASMState *as, IRIns *ir) asm_fusefref(as, ir, RSET_GPR); else asm_fusexref(as, ir->op1, RSET_GPR); - /* ir->op2 is ignored -- unaligned loads are ok on x86. */ + /* ir->op2 is ignored for non vectors -- unaligned loads are ok on x86. */ switch (irt_type(ir->t)) { case IRT_I8: xo = XO_MOVSXb; break; case IRT_U8: xo = XO_MOVZXb; break; @@ -1462,7 +1842,9 @@ static void asm_fxstore(ASMState *as, IRIns *ir) xo = XO_MOVto; break; } + emit_mrm(as, xo, src, RID_MRM); + if (!LJ_64 && src != osrc) { ra_noweak(as, osrc); emit_rr(as, XO_MOV, src, osrc); @@ -2004,23 +2386,21 @@ static void asm_pow(ASMState *as, IRIns *ir) asm_fppowi(as, ir); } -static int asm_swapops(ASMState *as, IRIns *ir) +static int asm_swaprefs(ASMState *as, IRIns *ir, IRRef lref, IRRef rref) { - IRIns *irl = IR(ir->op1); - IRIns *irr = IR(ir->op2); - lua_assert(ra_noreg(irr->r)); - if (!irm_iscomm(lj_ir_mode[ir->o])) - return 0; /* Can't swap non-commutative operations. */ - if (irref_isk(ir->op2)) + IRIns *irl = IR(lref); + IRIns *irr = IR(rref); + lua_assert(ra_noreg(irr->r)); + if (irref_isk(rref)) return 0; /* Don't swap constants to the left. */ if (ra_hasreg(irl->r)) return 1; /* Swap if left already has a register. 
*/ if (ra_samehint(ir->r, irr->r)) return 1; /* Swap if dest and right have matching hints. */ if (as->curins > as->loopref) { /* In variant part? */ - if (ir->op2 < as->loopref && !irt_isphi(irr->t)) + if (rref < as->loopref && !irt_isphi(irr->t)) return 0; /* Keep invariants on the right. */ - if (ir->op1 < as->loopref && !irt_isphi(irl->t)) + if (lref < as->loopref && !irt_isphi(irl->t)) return 1; /* Swap invariants to the right. */ } if (opisfusableload(irl->o)) @@ -2028,6 +2408,13 @@ static int asm_swapops(ASMState *as, IRIns *ir) return 0; /* Otherwise don't swap. */ } +static int asm_swapops(ASMState *as, IRIns *ir) +{ + if (!irm_iscomm(lj_ir_mode[ir->o])) + return 0; /* Can't swap non-commutative operations. */ + return asm_swaprefs(as, ir, ir->op1, ir->op2); +} + static void asm_fparith(ASMState *as, IRIns *ir, x86Op xo) { IRRef lref = ir->op1; diff --git a/src/lj_crecord.c b/src/lj_crecord.c index 49a67f57..75fafa13 100644 --- a/src/lj_crecord.c +++ b/src/lj_crecord.c @@ -33,6 +33,7 @@ #include "lj_dispatch.h" #include "lj_strfmt.h" #include "lj_intrinsic.h" +#include "lj_target.h" /* Some local macros to save typing. Undef'd at the end. */ #define IR(ref) (&J->cur.ir[(ref)]) @@ -1201,6 +1202,8 @@ static void crec_snap_caller(jit_State *J) J->base[-1-LJ_FR2] = ftr; J->pc = pc; } +void crec_call_intrins(jit_State *J, RecordFFData *rd, CType *cts); + /* Record function call. 
*/ static int crec_call(jit_State *J, RecordFFData *rd, GCcdata *cd) { @@ -1212,7 +1215,8 @@ static int crec_call(jit_State *J, RecordFFData *rd, GCcdata *cd) ct = ctype_rawchild(cts, ct); } if (ctype_isintrinsic(ct->info)) { - lj_trace_err(J, LJ_TRERR_NYICALL); + crec_call_intrins(J, rd, ct); + return 1; }else if (ctype_isfunc(ct->info)) { TRef func = emitir(IRT(IR_FLOAD, tp), J->base[0], IRFL_CDATA_PTR); CType *ctr = ctype_rawchild(cts, ct); @@ -1273,6 +1277,138 @@ static int crec_call(jit_State *J, RecordFFData *rd, GCcdata *cd) return 0; } +#if LJ_HASINTRINSICS + +static IRType intrins_retit(jit_State *J, CTState *cts, CType *arg) +{ + uint32_t reg = arg->size; + + if (reg_isgpr(reg)) { + IRType irt = crec_ct2irt(cts, ctype_rawchild(cts, arg)); + lua_assert(irt != IRT_CDATA); + return irt; + } else { + if (reg_isvec(reg)) { + /* NYI: support for vectors */ + lj_trace_err(J, LJ_TRERR_NYIVEC); + } + return reg_irt(reg); + } +} + +void crec_call_intrins(jit_State *J, RecordFFData *rd, CType *func) +{ + CTState *cts = ctype_ctsG(J2G(J)); + TRef arg = TREF_NIL; + CIntrinsic *intrins = lj_intrinsic_get(cts, func->size); + void* target = *(void**)cdataptr(cdataV(&rd->argv[0])); + MSize i; + IRType it; + int argofs = 1; + CTypeID sib = func->sib, retid = 0; + + /* Fetch the parameter list chain */ + retid = ctype_cid(func->info); + + if (intrins->wrapped == 0) { + TRef tr = emitir(IRT(IR_FLOAD, IRT_INTP), J->base[0], IRFL_CDATA_PTR); + emitir(IRTG(IR_EQ, IRT_INTP), tr, lj_ir_kintp(J, target)); + } + + /* Convert parameters and load them into the input registers */ + for (i = 0; i < intrins->insz; i++) { + CType *ct = ctype_get(cts, sib); + TRef tra = J->base[i+argofs]; + CType *d = ctype_rawchild(cts, ct); + sib = ct->sib; + + if (reg_isvec(ct->size)) { + /* NYI: support for vectors */ + lj_trace_err(J, LJ_TRERR_NYIVEC); + } + + tra = crec_ct_tv(J, d, 0, tra, &rd->argv[i+argofs]); + arg = emitir(IRT(IR_CARG, IRT_NIL), arg, tra); + } + + /* Append the wrapper pointer if 
were created from a template */ + if (intrins->wrapped == NULL) { + arg = emitir(IRT(IR_CARG, IRT_NIL), arg, lj_ir_kintp(J, target)); + } + + it = IRT_NIL; + if (intrins->outsz > 0) { + it = intrins_retit(J, cts, ctype_get(cts, retid)); + } + + J->base[0] = emitir(IRT(IR_INTRN, it), arg, (func->size & LJ_INTRINS_MAXID)); + + if (intrins->flags & INTRINSFLAG_MEMORYSIDE) { + emitir(IRT(IR_XBAR, IRT_NIL), 0, 0); + } + + arg = J->base[0]; + sib = retid; + for (i = 1; i < intrins->outsz; i++) { + CType *ct = ctype_get(cts, sib); + uint32_t reg = ct->size; + IRType irt = 0; + sib = ct->sib; + + if (reg_isgpr(reg)) { + irt = intrins_retit(J, cts, ct); + lua_assert(irt != IRT_CDATA); + } else { + irt = reg_irt(reg); + } + + J->base[i] = arg = emitir(IRT(IR_ASMRET, irt), arg, reg_rid(reg)); + } + + if (intrins->outsz > 1) { + emitir(IRT(IR_ASMEND, IRT_NIL), arg, J->base[0]); + } + + sib = retid; + /* Second pass to box values after all ASMRET have run to shuffle/spill the + * output registers. + */ + for (i = 0; i < intrins->outsz; i++) { + CType *ct = ctype_get(cts, sib); + CTypeID id = ctype_cid(ct->info); + uint32_t reg = ct->size; + uint32_t kind = reg_kind(reg); + sib = ct->sib; + + if (reg_isgpr(reg)) { + CTypeID cid = ctype_typeid(cts, ctype_raw(cts, id)); + if (cid != CTID_INT32) { + /* Box the u32/64 bit value in the register */ + J->base[i] = emitir(IRT(IR_CNEWI, IRT_CDATA), lj_ir_kint(J, id), J->base[i]); + } + } else { + if (kind == REGKIND_FPR32) { + J->base[i] = emitconv(J->base[i], IRT_NUM, IRT_FLOAT, 0); + } else if(rk_isvec(kind)) { + /* NYI: support for vectors */ + lj_trace_err(J, LJ_TRERR_NYIVEC); + } else { + lua_assert(kind == REGKIND_FPR64); + } + } + } + + /* Intrinsics are assumed to always have side effects */ + J->needsnap = 1; + rd->nres = intrins->outsz; +} +#else +void crec_call_intrins(jit_State *J, RecordFFData *rd, CType *func) +{ + UNUSED(J);UNUSED(rd);UNUSED(func); +} +#endif + void LJ_FASTCALL recff_cdata_call(jit_State *J, RecordFFData 
*rd) { CTState *cts = ctype_ctsG(J2G(J)); @@ -1568,9 +1704,16 @@ void LJ_FASTCALL recff_clib_index(jit_State *J, RecordFFData *rd) CLibrary *cl = (CLibrary *)uddata(udataV(&rd->argv[0])); GCstr *name = strV(&rd->argv[1]); CType *ct; - CTypeID id = lj_ctype_getname(cts, &ct, name, CLNS_INDEX); + CTypeID id; cTValue *tv = lj_tab_getstr(cl->cache, name); - rd->nres = rd->data; + rd->nres = rd->data > 0 ? 1 : 0; + if (rd->data < 2) { + id = lj_ctype_getname(cts, &ct, name, CLNS_INDEX); + } else { + /* set some dummy values for the intrinsic namespace */ + id = CTID_VOID; + ct = ctype_get(cts, id); + } if (id && tv && !tvisnil(tv)) { /* Specialize to the symbol name and make the result a constant. */ emitir(IRTG(IR_EQ, IRT_STR), J->base[1], lj_ir_kstr(J, name)); diff --git a/src/lj_intrinsic.c b/src/lj_intrinsic.c index b92f994f..7a39abb7 100644 --- a/src/lj_intrinsic.c +++ b/src/lj_intrinsic.c @@ -313,6 +313,9 @@ static int parse_opmode(const char *op, MSize len) case 'C': flags |= INTRINSFLAG_CALLED; break; + case 'c': + flags |= INTRINSFLAG_ISCOMM; + break; case 'X': flags |= INTRINSFLAG_REXW; break; @@ -681,6 +684,13 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm) if (opcode) { setopcode(L, intrins, opcode); } + + if (intrin_iscomm(intrins) && + (intrins->insz < 2 || intrins->in[0] != intrins->in[1])) { + lj_err_callerv(L, LJ_ERR_FFI_BADOPSTR, strdata(opstr), + "bad registers for commutative mode"); + } + if (intrin_regmode(intrins) == DYNREG_FIXED) { /* dyninsz is overlapped by input registers 6/7/8 */ if ((intrins->insz < 6 && intrins->dyninsz > 0) || dynout) { @@ -837,7 +847,7 @@ int lj_intrinsic_call(CTState *cts, CType *ct) } /* Swap input values around to match the platform ordering the wrapper expects */ - if (intrin_regmode(intrins) >= DYNREG_SWAPREGS && + if (intrin_regmode(intrins) >= DYNREG_SWAPREGS && !intrin_iscomm(intrins) && reg_isgpr(intrins->in[0]) == reg_isgpr(intrins->in[1])) { if (reg_isgpr(intrins->in[0])) { 
intptr_t temp = context.gpr[0]; diff --git a/src/lj_intrinsic.h b/src/lj_intrinsic.h index d8ab7670..bcf4618e 100644 --- a/src/lj_intrinsic.h +++ b/src/lj_intrinsic.h @@ -59,6 +59,8 @@ typedef enum INTRINSFLAGS { INTRINSFLAG_IMMB = 0x400, /* Opcode is larger than the emit system normally handles x86/x64(4 bytes) */ INTRINSFLAG_LARGEOP = 0x800, + /* Opcode is commutative allowing the input registers to be swapped to allow better fusing */ + INTRINSFLAG_ISCOMM = 0x1000, /* Opcode uses ymm registers */ INTRINSFLAG_VEX256 = 0x4000, @@ -86,6 +88,7 @@ typedef struct AsmHeader { #define intrin_regmode(intrins) ((intrins)->flags & INTRINSFLAG_REGMODEMASK) #define intrin_setregmode(intrins, mode) \ (intrins)->flags = ((intrins)->flags & ~INTRINSFLAG_REGMODEMASK)|(mode) +#define intrin_iscomm(intrins) ((intrins)->flags & INTRINSFLAG_ISCOMM) #define intrin_getopextb(intrins) ((intrins)->out[3]) #define intrin_setopextb(intrins, opext) \ @@ -140,6 +143,7 @@ CTypeID1 regkind_ct[16]; #define reg_isfp(reg) (reg_rid(reg) >= RID_MIN_FPR) #define reg_isvec(reg) (reg_rid(reg) >= RID_MIN_FPR && reg_kind(reg) >= REGKIND_VEC_START) #define reg_isdyn(reg) (reg_rid(reg) == RID_DYN_GPR || reg_rid(reg) == RID_DYN_FPR) +#define reg_torset(reg) (reg_isgpr(reg) ? RSET_GPR : RSET_FPR) #define reg_irt(reg) (reg_isgpr(reg) ? rk_irtgpr(reg_kind(reg)) : rk_irtfpr(reg_kind(reg))) #define rk_irtgpr(kind) ((IRType)regkind_it[(kind)]) diff --git a/src/lj_ir.h b/src/lj_ir.h index ae889850..39ca0025 100644 --- a/src/lj_ir.h +++ b/src/lj_ir.h @@ -145,6 +145,9 @@ _(CALLS, S , ref, lit) \ _(CALLXS, S , ref, ref) \ _(CARG, N , ref, ref) \ + _(ASMRET, L, ref, lit) \ + _(INTRN, S, ref, lit) \ + _(ASMEND, S, ref, ref) \ \ /* End of list. 
*/ diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c index 276dc040..ffe32788 100644 --- a/src/lj_opt_fold.c +++ b/src/lj_opt_fold.c @@ -2405,6 +2405,8 @@ LJFOLD(TDUP any) LJFOLD(CNEW any any) LJFOLD(XSNEW any any) LJFOLD(BUFHDR any any) +LJFOLD(INTRN any any) +LJFOLD(ASMRET any any) LJFOLDX(lj_ir_emit) /* ------------------------------------------------------------------------ */ diff --git a/src/lj_traceerr.h b/src/lj_traceerr.h index 1363c4f3..462e11b4 100644 --- a/src/lj_traceerr.h +++ b/src/lj_traceerr.h @@ -38,6 +38,7 @@ TREDEF(NYITMIX, "NYI: mixed sparse/dense table") TREDEF(NOCACHE, "symbol not in cache") TREDEF(NYICONV, "NYI: unsupported C type conversion") TREDEF(NYICALL, "NYI: unsupported C function type") +TREDEF(NYIVEC, "NYI: unsupported vector operation or type") /* Optimizations. */ TREDEF(GFAIL, "guard would always fail") diff --git a/tests/intrinsic_spec.lua b/tests/intrinsic_spec.lua index cb36bf0a..e95257ff 100644 --- a/tests/intrinsic_spec.lua +++ b/tests/intrinsic_spec.lua @@ -334,6 +334,15 @@ context("__mcode", function() --xmm register number too large assert_cdeferr([[void badreg_fpr1(float xmm20) __mcode("90_E");]], "invalid") end) + + it("invalid commutative mode registers", function() + assert_cdef([[int4 valid_comm(int4 v1, int4 v2) __mcode("90rMc");]], "valid_comm") + --must have 1+ input argument + assert_cdeferr([[int4 invalid_comm1(int4 v1) __mcode("90rMc");]]) + -- input register types must match + assert_cdeferr([[void invalid_comm2(int32_t i, int4 v1) __mcode("90rMc");]]) + assert_cdeferr([[void invalid_comm3(int4 v1, int32_t i) __mcode("90rMc");]]) + end) it("multidef rollback", function() diff --git a/tests/runtests.lua b/tests/runtests.lua index 6b663a34..c50342fd 100644 --- a/tests/runtests.lua +++ b/tests/runtests.lua @@ -4,22 +4,10 @@ local telescope = require("telescope") local ffi = require("ffi") local C = ffi.C -local function check(expect, func, ...) - local result = func(...) 
- assert(result == expect, tostring(result)) - return true -end - -telescope.make_assertion("jit", "", check) -telescope.make_assertion("exit", "", check) -telescope.make_assertion("noexit", "", check) - -telescope.make_assertion("jitchecker", "", function(checker, func, ...) - - local expected, value = checker(1, func(1, ...)) - assert(expected == value) - return true -end) +telescope.make_assertion("jit", "", tester.testsingle) +telescope.make_assertion("jitchecker", "", tester.testwithchecker) +telescope.make_assertion("noexit", "", tester.testnoexit) +telescope.make_assertion("exit", "", tester.testexit) telescope.make_assertion("cdef", "", function(cdef, name) assert(not name or type(name) == "string")