From de4c0b6ea519e536da09823a18a92db39a4d96e7 Mon Sep 17 00:00:00 2001 From: fsfod Date: Tue, 29 Mar 2016 11:10:15 +0100 Subject: [PATCH] Implement support for opcodes with dynamic registers --- src/lj_asm.c | 149 ++++++++++++- src/lj_cparse.c | 23 ++ src/lj_ctype.h | 10 +- src/lj_emit_x86.h | 24 ++- src/lj_intrinsic.c | 315 +++++++++++++++++++++++++-- src/lj_intrinsic.h | 47 ++++ src/lj_target_x86.h | 3 + tests/intrinsic_spec.lua | 449 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 996 insertions(+), 24 deletions(-) diff --git a/src/lj_asm.c b/src/lj_asm.c index c4f97a5c..e9d4ed8f 100644 --- a/src/lj_asm.c +++ b/src/lj_asm.c @@ -2452,41 +2452,53 @@ typedef struct IntrinBuildState { RegSet inset, outset, modregs; uint32_t spadj, contexspill, contexofs; uint8_t outcontext; - char vzeroupper; + char vzeroupper, fuse; } IntrinBuildState; static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info) { MSize offset = 0, i; + int dynreg = intrin_regmode(intrins); + int dynout = intrin_dynrout(intrins); memcpy(info->in, intrins->in, LJ_INTRINS_MAXREG); memcpy(info->out, intrins->out, LJ_INTRINS_MAXREG); info->contexofs = -1; + info->fuse = dynreg && + !(intrins->flags & (INTRINSFLAG_NOFUSE|INTRINSFLAG_INDIRECT)); for (i = 0; i < intrins->insz; i++) { Reg r = reg_rid(info->in[i]); + int isdyn = dynreg && i < intrins->dyninsz; + lua_assert(!isdyn || reg_isdyn(info->in[i])); if (reg_kind(info->in[i]) == REGKIND_V256) info->vzeroupper = 1; if (reg_isgpr(info->in[i])) { - if (r == RID_CONTEXT) { + if (!isdyn && r == RID_CONTEXT) { /* Save the offset in the input context so we can load it last */ info->contexofs = offset; } offset += sizeof(intptr_t); } - rset_set(info->inset, r); + if (!isdyn) + rset_set(info->inset, r); } for (i = 0; i < intrins->outsz; i++) { if (reg_kind(info->out[i]) == REGKIND_V256) info->vzeroupper = 1; + if (i == 0 && dynout) continue; rset_set(info->outset, reg_rid(info->out[i])); } + /* Don't try to fuse if a fixed register is the 
same as the input context */ + if (info->contexofs != -1) + info->fuse = 0; + /* TODO: dynamic output context register selection */ info->outcontext = RID_OUTCONTEXT; info->modregs |= info->outset|info->inset; @@ -2506,6 +2518,7 @@ static void intrins_loadregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState /* Finally load the input register conflicting with the input context */ if (rset_test(info->inset, RID_CONTEXT) && info->contexofs != -1) { + lua_assert(!info->fuse); emit_loadofsirt(as, IRT_INTP, RID_CONTEXT, RID_CONTEXT, info->contexofs); } @@ -2513,6 +2526,16 @@ static void intrins_loadregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState for (i = 0; i < intrins->insz; i++) { uint32_t reg = info->in[i]; Reg r = reg_rid(reg); + if (i == 0 && info->fuse) { + lua_assert(info->contexofs == -1); + /* The load is fused into the modrm of the opcode emitted in emit_intrins */ + if (reg_isgpr(reg)) { + gpr++; + } else { + fpr++; + } + continue; + } if (reg_isgpr(reg)) { if (r != RID_CONTEXT) @@ -2559,6 +2582,32 @@ static void intrins_saveregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState } } +/* Replace placeholder register ids with platform specific registers */ +static RegSet pickdynlist(uint8_t *list, MSize sz, RegSet freeset) +{ + MSize i; + RegSet free = freeset; + + for (i = 0; i < sz; i++) { + RegSet rset = free & (reg_isgpr(list[i]) ? 
RSET_GPR : RSET_FPR); + Reg r; + + /* Try to use scratch register first */ + if ((rset & RSET_SCRATCH) != 0) { + rset = rset & RSET_SCRATCH; + } + + r = rset_pickbot(rset); + lua_assert(rset_test(free, r)); + + list[i] = reg_setrid(list[i], r); + rset_clear(free, r); + } + + /* Return register set of extra used registers */ + return freeset & ~free; +} + /* ** Stack spill slots and gpr slots in the context are always the size of a native pointer ** The output context register is always spilled to a fixed stack offset @@ -2577,7 +2626,10 @@ static void wrap_intrins(jit_State *J, CIntrinsic *intrins, IntrinWrapState *sta AsmHeader *hdr; MCode *asmofs = NULL, *origtop; void* target = state->target; + uint8_t *in = info.in, *out = info.out; int spadj = 0; + int dynreg = intrin_regmode(intrins); + Reg rout = RID_NONE, rin = RID_NONE; lj_asm_setup_intrins(J, as); origtop = as->mctop; @@ -2586,6 +2638,60 @@ static void wrap_intrins(jit_State *J, CIntrinsic *intrins, IntrinWrapState *sta info.modregs = state->mod; intrins_setup(intrins, &info); + /* Pick some ABI specific scratch registers for the opcode's input/output registers */ + if (dynreg) { + RegSet scatch = RSET_ALL & ~info.inset; + int inofs = 0; + lua_assert(intrins->dyninsz <= intrins->insz); + /* Avoid unnecessary spill of the output context */ + if (intrins->outsz != 0) + rset_clear(scatch, info.outcontext); + + if (dynreg == DYNREG_OPEXT || dynreg == DYNREG_TWOSTORE || reg_isvec(in[0])) { + info.fuse = 0; + } + + if (info.fuse) { + inofs++; + rin = RID_CONTEXT; + rset_clear(scatch, RID_CONTEXT); + } + + if ((intrins->dyninsz-inofs) > 0) { + rset_clear(scatch, RID_CONTEXT); + /* Merge in registers used for dynamic input registers */ + info.inset |= pickdynlist(in+inofs, intrins->dyninsz-inofs, scatch); + } + + if (rin == RID_NONE) + rin = reg_rid(in[0]); + + /* Allocate the dynamic output register */ + if (intrins->outsz > 0 && intrin_dynrout(intrins)) { + if (dynreg == DYNREG_INOUT) { + rout = reg_rid(in[1]); 
+ out[0] = reg_setrid(out[0], rout); + } else if (dynreg == DYNREG_OPEXT) { + /* Destructive single register opcode */ + rout = out[0] = reg_setrid(out[0], rin); + } else { + scatch = RSET_INIT & ~info.outset; + rset_clear(scatch, info.outcontext); + scatch = pickdynlist(out, 1, scatch); + rout = reg_rid(out[0]); + } + + rset_set(info.outset, rout); + } + + if (rout == RID_NONE && intrins->dyninsz > 1) { + lua_assert(reg_isdyn(intrins->in[1])); + rout = reg_rid(in[1]); + } + + info.modregs |= info.inset|info.outset; + } + /* Used for picking scratch register when loading or saving boxed values */ as->modset = info.modregs|RID_CONTEXT; @@ -2651,16 +2757,49 @@ restart: emit_storeofsirt(as, IRT_INTP, info.outcontext, RID_SP, TEMPSPILL); } - if (intrins->flags & INTRINSFLAG_CALLED) { - Reg rin = 0; +#if LJ_TARGET_X86ORX64 +/* Setup modrm to tobe a load from the input context pointer we assume offset + * will be to the first value in either the gpr or fpr part of the context + * because the first input register should always be the dynamic one for opcodes + */ + as->mrm.idx = RID_NONE; + as->mrm.scale = XM_SCALE1; + as->mrm.ofs = 0; + + if (dynreg) { + if (info.fuse || (intrins->flags & INTRINSFLAG_INDIRECT)) { + lua_assert(!reg_isvec(in[0])); + as->mrm.base = rin; + rin = RID_MRM; + + if (info.fuse) { + /* Set the fused offset into the input context */ + if (reg_isfp(in[0])) { + as->mrm.ofs = offsetof(RegContext, fpr); + } else { + as->mrm.ofs = offsetof(RegContext, gpr); + } + } + } else { + as->mrm.base = RID_NONE; + lua_assert(rin != RID_NONE); + } + } else if(intrins->flags & INTRINSFLAG_CALLED) { #if LJ_64 /* Pick a scratch register in case the relative distance for the call is ** larger than a signed 32bit value */ rin = intrinsic_scratch(as, RSET_GPR); #endif + } +#endif + + if (intrins->flags & INTRINSFLAG_CALLED) { /* emit a call to the target which may be collocated after us */ emit_intrins(as, intrins, rin, (uintptr_t)target); + } else if (dynreg) { + /* 
Write an opcode to the wrapper */ + asmofs = emit_intrins(as, intrins, rin, rout); } else { /* Append the user supplied machine code */ asmofs = asm_mcode(as, state->target, state->targetsz); diff --git a/src/lj_cparse.c b/src/lj_cparse.c index 44f808d9..f26b4582 100644 --- a/src/lj_cparse.c +++ b/src/lj_cparse.c @@ -1204,6 +1204,18 @@ static void cp_decl_msvcattribute(CPState *cp, CPDecl *decl) #if LJ_HASINTRINSICS +static uint8_t getsignedbyte(CPState *cp) +{ + int32_t val = cp->val.i32; + + if (cp->tok != CTOK_INTEGER) + cp_err_token(cp, CTOK_INTEGER); + + /* Flatten negative values to a signed 8 bit number */ + /* NYI: immediate values larger than 8 bits */ + return (val < 0 ? (uint8_t)(int8_t)val : val); +} + static void cp_decl_mcode(CPState *cp, CPDecl *decl) { /* Check were declared after a function definition */ @@ -1224,6 +1236,17 @@ static void cp_decl_mcode(CPState *cp, CPDecl *decl) decl->redir = cp->str; cp_next(cp); + /* Check if we have immediate and prefix byte values */ + if (cp_opt(cp, ',')) { + /* NYI: immediate values larger than 8 bits */ + decl->bits = (CTSize)getsignedbyte(cp); + cp_next(cp); + + if (cp_opt(cp, ',')) { + decl->bits |= getsignedbyte(cp) << 8; + cp_next(cp); + } + } cp_check(cp, ')'); /* Mark the function as an intrinsic */ decl->stack[decl->top-1].info |= CTF_INTRINS; diff --git a/src/lj_ctype.h b/src/lj_ctype.h index c3b081dc..1661cb9d 100644 --- a/src/lj_ctype.h +++ b/src/lj_ctype.h @@ -175,7 +175,15 @@ typedef int (LJ_FASTCALL *IntrinsicWrapper)(void *incontext, void* outcontext); typedef struct CIntrinsic { IntrinsicWrapper wrapped; - uint8_t in[8]; + union { + uint8_t in[8]; + struct { + uint8_t opregs[5]; /* cmpxchg8b */ + uint8_t immb; + uint8_t prefix; /* prefix byte see INTRINSFLAG_PREFIX */ + uint8_t dyninsz; /* dynamic input register count */ + }; + }; union { uint8_t out[8]; struct { diff --git a/src/lj_emit_x86.h b/src/lj_emit_x86.h index 8a1a0975..25993695 100644 --- a/src/lj_emit_x86.h +++ b/src/lj_emit_x86.h 
@@ -650,7 +650,29 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs) static MCode* emit_intrins(ASMState *as, CIntrinsic *intrins, Reg r1, uintptr_t r2) { - if (intrins->flags & INTRINSFLAG_CALLED) { + uint32_t regmode = intrin_regmode(intrins); + if (regmode) { + if (regmode == DYNREG_OPEXT) { + r2 = intrin_getopextb(intrins); + } + + /* force 64 bit operands */ + if (intrins->flags & INTRINSFLAG_REXW) { + r2 |= REX_64; + } + + if (intrins->flags & INTRINSFLAG_IMMB) { + *--as->mcp = intrins->immb; + } + + emit_mrm(as, intrins->opcode, (Reg)r2, r1); + + if (intrins->flags & INTRINSFLAG_PREFIX) { + *--as->mcp = intrins->prefix; + } + + checkmclim(as); + } else if (intrins->flags & INTRINSFLAG_CALLED) { lua_assert(r2); emit_call_(as, (MCode*)r2, r1); return NULL; diff --git a/src/lj_intrinsic.c b/src/lj_intrinsic.c index 8cea8bdf..a8419c84 100644 --- a/src/lj_intrinsic.c +++ b/src/lj_intrinsic.c @@ -24,6 +24,7 @@ typedef enum RegFlags { REGFLAG_64BIT = REGKIND_GPR64 << 6, /* 64 bit override */ REGFLAG_BLACKLIST = 1 << 17, + REGFLAG_DYN = 1 << 18, }RegFlags; typedef struct RegEntry { @@ -55,6 +56,8 @@ RegEntry reglut[] = { #if LJ_64 GPRDEF_R64(MKREG_GPR64) #endif + {"gpr32", REGFLAG_DYN|RID_DYN_GPR}, + {"gpr64", REGFLAG_64BIT|REGFLAG_DYN|RID_DYN_GPR} }; static CTypeID register_intrinsic(lua_State *L, CIntrinsic* src, CType *func) @@ -118,7 +121,9 @@ static int parse_fprreg(const char *name, uint32_t len) } rid += RID_MIN_FPR; } else { - return -1; + /* Unnumbered reg is considered a placeholder for a dynamic reg */ + flags = REGFLAG_DYN; + rid = RID_DYN_FPR; } if (name[0] == 'y') { @@ -192,7 +197,7 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid, CTypeID liststart) { CTState *cts = ctype_cts(L); - uint32_t i, count = 0; + uint32_t i, count = 0, dyncount = 0; RegSet rset = 0; const char *listname; uint8_t *regout = NULL; @@ -231,12 +236,21 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid, 
setarg_casttype(cts, ctarg, ctype_rawchild(cts, ctarg)); r = reg_rid(reg); + + if (reg & REGFLAG_DYN) { + if (regsetid == REGSET_MOD) + lj_err_callerv(L, LJ_ERR_FFI_BADREG, "cannot use dynamic register", strdata(str), listname); - /* Check for duplicate registers in the list */ - if (rset_test(rset, r)) { - lj_err_callerv(L, LJ_ERR_FFI_BADREG, "duplicate", strdata(str), listname); + if (++dyncount > LJ_INTRINS_MAXDYNREG) { + lj_err_callerv(L, LJ_ERR_FFI_BADREG, "too many dynamic", strdata(str), listname); + } + } else { + /* Check for duplicate fixed registers in the list */ + if (rset_test(rset, r)) { + lj_err_callerv(L, LJ_ERR_FFI_BADREG, "duplicate", strdata(str), listname); + } + rset_set(rset, r); } - rset_set(rset, r); if (regsetid == REGSET_OUT && reg_isgpr(reg)) { CType *ct = ctype_rawchild(cts, ctarg); @@ -261,6 +275,9 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid, if (regsetid == REGSET_IN) { intrins->insz = (uint8_t)count; + if (dyncount != 0) { + intrins->dyninsz = dyncount; + } } else if (regsetid == REGSET_OUT) { intrins->outsz = (uint8_t)count; } @@ -268,15 +285,91 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid, return rset; } +static int parse_opmode(const char *op, MSize len) +{ + MSize i = 0; + int m = 0; + int r = 0; + int flags = 0; + + for (; i < len; i++) { + switch (op[i]) { + case 'm': + m = 1; + break; + case 'M': + m = 2; + break; + /* modrm register */ + case 'r': + r = 1; + break; + case 'R': + r = r == 0 ? 
2 : 3; + break; + case 'U': + flags |= INTRINSFLAG_IMMB; + break; + case 'C': + flags |= INTRINSFLAG_CALLED; + break; + case 'X': + flags |= INTRINSFLAG_REXW; + break; + case 'P': + flags |= INTRINSFLAG_PREFIX; + break; + case 'I': + flags |= INTRINSFLAG_INDIRECT; + break; + case 'E': + flags |= INTRINSFLAG_EXPLICTREGS; + break; + + default: + /* return index of invalid flag */ + return -(int)(i+1); + } + } + + if ((r || m) & !(flags & INTRINSFLAG_REGMODEMASK)) { + + /* 'Rm' mem/r is left reg is right */ + if (r == 2 && m == 1) { + flags |= DYNREG_TWOSTORE; /* MR */ + } else if(r == 0 && m == 1) { + flags |= DYNREG_OPEXT; + } else if ((r == 1 && m == 2) || r == 3) { + flags |= DYNREG_ONE; /* RM */ + } else { + return -1; + } + + /* if neither of the operands is listed as memory disable trying to fuse a load in */ + if (r == 3) { + flags |= INTRINSFLAG_NOFUSE; /* rR */ + } + } + + return flags; +} + static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode) { int len; + uint32_t opext = 0; if (opcode == 0) { lj_err_callermsg(L, "bad opcode literal"); } #if LJ_TARGET_X86ORX64 + /* the LSB of the opcode should be the register number */ + if (intrin_regmode(intrins) == DYNREG_OPEXT) { + opext = (opcode & 7); + opcode = opcode >> 4; + } + if (opcode <= 0xff) { len = 1; } else if (opcode <= 0xffff) { @@ -288,11 +381,16 @@ static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode) } opcode = lj_bswap(opcode); + if (len < 4) { opcode |= (uint8_t)(int8_t)-(len+1); } else { lj_err_callermsg(L, "bad opcode literal"); } + + if (intrin_regmode(intrins) == DYNREG_OPEXT) { + intrin_setopextb(intrins, opext); + } #endif intrins->opcode = opcode; @@ -303,6 +401,7 @@ static int parse_opstr(lua_State *L, GCstr *opstr, CIntrinsic *intrins, int* bui const char *op = strdata(opstr); uint32_t opcode = 0; uint32_t i; + int flags; /* Parse the opcode number if this is not a template */ if (op[0] != '?') { @@ -320,10 +419,21 @@ static int parse_opstr(lua_State 
*L, GCstr *opstr, CIntrinsic *intrins, int* bui opcode = (opcode << 4) + (d & 15); } + if (*op == '_') op++; } else { *buildflags |= INTRINSFLAG_TEMPLATE; op++; } + + flags = parse_opmode(op, opstr->len - (MSize)(op-strdata(opstr))); + + if (flags < 0) { + lj_err_callerv(L, LJ_ERR_FFI_BADOPSTR, strdata(opstr), "bad mode flags"); + } else { + intrins->flags |= flags; + } + /* Flags only used during construction of the intrinsic in the upper bits*/ + *buildflags |= flags & 0xffff0000; return opcode; } @@ -378,7 +488,8 @@ CTypeID lj_intrinsic_template(lua_State *L, int narg) intrins = lj_intrinsic_get(cts, ct->size); /* Can't be a template if it an opcode */ - if ((intrins->opcode && intrins->outsz <= 4) || intrins->wrapped) + if (intrin_regmode(intrins) != DYNREG_FIXED || (intrins->opcode && intrins->outsz <= 4) || + intrins->wrapped) lj_err_arg(L, narg, LJ_ERR_FFI_INVTYPE); return id; @@ -407,21 +518,80 @@ int lj_intrinsic_create(lua_State *L) return 1; } +static int inferreg(CTState *cts, CType *ct) { + CTSize sz = ct->size; + int rid = -1, kind = -1; + + if (ctype_isnum(ct->info)) { + if (ctype_isfp(ct->info)) { + rid = RID_DYN_FPR; + if (sz > 8) + return -1; + kind = sz == 4 ? REGKIND_FPR32 : REGKIND_FPR64; + } else { + rid = RID_DYN_GPR; + if (sz == 8) { + if (LJ_32) + return -1; /* NYI: 64 bit pair registers */ + kind = REGKIND_GPR64; + rid |= INTRINSFLAG_REXW; + } else { + kind = ct->info & CTF_UNSIGNED ? REGKIND_GPR32CD : REGKIND_GPRI32; + } + } + } else if (ctype_isptr(ct->info)) { + ct = ctype_raw(cts, ctype_cid(ct->info)); + if (ctype_isvector(ct->info)) { + goto vec; + } else { + rid = RID_DYN_GPR; + kind = LJ_32 ? 
REGKIND_GPR32CD : REGKIND_GPR64; + } + } else if (ctype_isvector(ct->info)) { + CType *vtype; + vec: + vtype = ctype_raw(cts, ctype_cid(ct->info)); + if (ctype_typeid(cts, vtype) < CTID_BOOL || ctype_typeid(cts, vtype) > CTID_DOUBLE || + (ct->size != 16 && ct->size != 32)) { + return -1; + } + + if (ct->size == 32) { + kind = REGKIND_V256; + rid = RID_DYN_FPR | INTRINSFLAG_VEX256; + } else { + kind = REGKIND_V128; + rid = RID_DYN_FPR; + } + + } else { + lua_assert(ctype_iscomplex(ct->info)); + return -1; + } + + return reg_make(rid, kind); +} + GCcdata *lj_intrinsic_createffi(CTState *cts, CType *func) { GCcdata *cd; CIntrinsic *intrins = lj_intrinsic_get(cts, func->size); CTypeID id = ctype_typeid(cts, func); RegSet mod = intrin_getmodrset(cts, intrins); - uint32_t op = intrins->opcode; - void* mcode = ((char*)&op) + (4-intrin_oplen(intrins)); - + if (intrins->opcode == 0) { lj_err_callermsg(cts->L, "expected non template intrinsic"); } - intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, mcode, - intrin_oplen(intrins), mod); + /* Build the interpreter wrapper */ + if (intrin_regmode(intrins) == DYNREG_FIXED) { + uint32_t op = intrins->opcode; + void* mcode = ((char*)&op) + (4-intrin_oplen(intrins)); + intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, mcode, + intrin_oplen(intrins), mod); + } else { + intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, NULL, 0, mod); + } cd = lj_cdata_new(cts, id, CTSIZE_PTR); *(void **)cdataptr(cd) = intrins->wrapped; @@ -433,8 +603,9 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm) CTState *cts = ctype_cts(L); CType *func = ctype_get(cts, fid); CTypeID sib = func->sib, retid = ctype_cid(func->info); + RegSet routset = 0; uint32_t opcode; - int buildflags = 0; + int buildflags = 0, dynout = 0; CIntrinsic _intrins; CIntrinsic* intrins = &_intrins; memset(intrins, 0, sizeof(CIntrinsic)); @@ -445,18 +616,62 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr 
*opstr, uint32_t imm) return 0; } - if (sib) { + if (buildflags & INTRINSFLAG_EXPLICTREGS) { process_reglist(L, intrins, REGSET_IN, sib); - } - + } else { + /* Infer the types of input register based on parameter types */ + while (sib != 0) { + CType *arg = ctype_get(cts, sib); + CType *ct = ctype_rawchild(cts, arg); + int reg = inferreg(cts, ct); + sib = arg->sib; + + if (reg == -1) { + return 0; + } + + /* Save the register info in place of the argument index */ + arg->size = reg & 0xff; + setarg_casttype(cts, arg, ct); + + /* Merge shared register flags */ + intrins->flags |= reg & 0xff00; + + intrins->in[intrins->insz++] = reg & 0xff; + intrins->dyninsz++; + if (intrins->dyninsz > LJ_INTRINS_MAXDYNREG) + return 0; + + if (sib != 0 && intrins->insz == LJ_INTRINS_MAXREG) { + return 0; + } + } + } if (retid != CTID_VOID) { CType *ct = ctype_get(cts, retid); /* Check if the intrinsic had __reglist declared on it */ if (ctype_isfield(ct->info)) { - process_reglist(L, intrins, REGSET_OUT, retid); + routset = process_reglist(L, intrins, REGSET_OUT, retid); sib = retid; + } else { + int reg = inferreg(cts, ct); + + if (reg == -1) { + return 0; + } + /* Merge shared register flags */ + intrins->flags |= reg & 0xff00; + + /* Create a field entry for the return value that we make the ctype child + ** of the function. 
+ */ + sib = lj_ctype_new(cts, &ct); + ct->info = CTINFO(CT_FIELD, retid); + ct->size = reg; + intrins->out[intrins->outsz++] = reg & 0xff; + dynout = 1; } } else { sib = retid; @@ -466,6 +681,60 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm) if (opcode) { setopcode(L, intrins, opcode); } + if (intrin_regmode(intrins) == DYNREG_FIXED) { + /* dyninsz is overlapped by input registers 6/7/8 */ + if ((intrins->insz < 6 && intrins->dyninsz > 0) || dynout) { + lj_err_callerv(L, LJ_ERR_FFI_BADOPSTR, strdata(opstr), + "no register mode specified for dynamic registers"); + } + } + +#if LJ_TARGET_X86ORX64 + /* Validate dynamic register count for the specified register mode*/ + if (intrin_regmode(intrins) == DYNREG_ONE){ + if (intrins->dyninsz == 2 && intrins->outsz == 1 && routset == 0) { + /* Infer destructive opcode if the single out */ + intrin_setregmode(intrins, DYNREG_INOUT); + } else if(intrins->dyninsz == 2){ + intrin_setregmode(intrins, DYNREG_TWOIN); + } else if (intrins->dyninsz == 0 || intrins->outsz == 0 || + !reg_isdyn(intrins->out[0])) { + return 0; + } + }else if (intrin_regmode(intrins) == DYNREG_TWOSTORE) { + if (intrins->dyninsz == 1 && intrins->outsz != 0) { + intrin_setregmode(intrins, DYNREG_ONESTORE); + } else if (intrins->insz == 0 || intrins->dyninsz == 0) { + /* Store opcodes need at least an address the value could be an immediate */ + return 0; + } + } else if (intrin_regmode(intrins) == DYNREG_OPEXT) { + if (intrins->dyninsz != 1) + return 0; + } + + /* Swap the registers from there declared order to match how there + ** processed + */ + if (intrin_regmode(intrins) >= DYNREG_SWAPREGS) { + uint8_t temp = intrins->in[0]; + intrins->in[0] = intrins->in[1]; intrins->in[1] = temp; + } +#endif + + if (intrins->flags & INTRINSFLAG_PREFIX) { + intrins->prefix = (uint8_t)imm; + /* Prefix value should be declared before an immediate value in the + ** __mcode definition the second number declared is shifted right when + 
** packed in the ctype. + */ + imm >>= 8; + } + + if (intrins->flags & INTRINSFLAG_IMMB) { + intrins->immb = (uint8_t)(imm & 0xff); + } + register_intrinsic(L, intrins, ctype_get(cts, fid)); lua_assert(sib > 0 && sib < cts->top); @@ -567,6 +836,18 @@ int lj_intrinsic_call(CTState *cts, CType *ct) lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg+1) | CCF_INTRINS_ARG); } + /* Swap input values around to match the platform ordering the wrapper expects */ + if (intrin_regmode(intrins) >= DYNREG_SWAPREGS && + reg_isgpr(intrins->in[0]) == reg_isgpr(intrins->in[1])) { + if (reg_isgpr(intrins->in[0])) { + intptr_t temp = context.gpr[0]; + context.gpr[0] = context.gpr[1]; context.gpr[1] = temp; + } else { + double temp = context.fpr[0]; + context.fpr[0] = context.fpr[1]; context.fpr[1] = temp; + } + } + /* Pass in the return type chain so the results are typed */ outcontent = setup_results(L, intrins, ctype_cid(ctype_get(cts, funcid)->info)); diff --git a/src/lj_intrinsic.h b/src/lj_intrinsic.h index bae8741e..e275c891 100644 --- a/src/lj_intrinsic.h +++ b/src/lj_intrinsic.h @@ -14,20 +14,54 @@ #define LJ_INTRINS_MAXREG 8 #endif +/* The max number of dynamic registers in each reglist(in/out)*/ +#define LJ_INTRINS_MAXDYNREG 2 + typedef struct LJ_ALIGN(16) RegContext { intptr_t gpr[LJ_INTRINS_MAXREG]; double fpr[LJ_INTRINS_MAXREG]; } RegContext; +typedef enum REGMODE { + DYNREG_FIXED = 0, + /* one input register and optionally one output */ + DYNREG_ONE, + /* 1(R) register in, 1 out(M) which can be a memory address to store the value */ + DYNREG_ONESTORE, + /* 2 in 0 out first must always be treated as indirect */ + DYNREG_TWOSTORE, + /* one input(M) register and the second is part of part of the opcode */ + DYNREG_OPEXT, + /* Two input register and one output same register that's same RID the second input */ + DYNREG_INOUT, + /* Two input registers with M dynamic output register */ + DYNREG_TWOIN, + + DYNREG_SWAPREGS = DYNREG_INOUT, +} REGMODE; + typedef enum 
INTRINSFLAGS { + INTRINSFLAG_REGMODEMASK = 7, + INTRINSFLAG_MEMORYSIDE = 0x08, /* has memory side effects so needs an IR memory barrier */ /* Intrinsic should be emitted as a naked function that is called */ INTRINSFLAG_CALLED = 0x20, /* MODRM should always be set as indirect mode */ INTRINSFLAG_INDIRECT = 0x40, + /* Don't fuse load into op */ + INTRINSFLAG_NOFUSE = 0x80, + /* Force REX.w 64 bit size override bit to be set for x64 */ + INTRINSFLAG_REXW = 0x100, + /* Append a user supplied prefixed before the opcode and its REX byte */ + INTRINSFLAG_PREFIX = 0x200, + /* Opcode has an immediate byte that needs to be set at construction time */ + INTRINSFLAG_IMMB = 0x400, + /* Opcode uses ymm registers */ INTRINSFLAG_VEX256 = 0x4000, + /* Input parameters names explicitly declare input registers */ + INTRINSFLAG_EXPLICTREGS = 0x10000, /* Intrinsic is a template with no machine code set until instantiate at runtime with ** user supplied code. */ @@ -47,7 +81,19 @@ typedef struct AsmHeader { uint32_t totalzs; } AsmHeader; +#define intrin_regmode(intrins) ((intrins)->flags & INTRINSFLAG_REGMODEMASK) +#define intrin_setregmode(intrins, mode) \ + (intrins)->flags = ((intrins)->flags & ~INTRINSFLAG_REGMODEMASK)|(mode) + +#define intrin_getopextb(intrins) ((intrins)->out[3]) +#define intrin_setopextb(intrins, opext) \ + lua_assert((intrins)->outsz < 4); \ + ((intrins)->out[3] = (opext)) #define intrin_oplen(intrins) ((-(int8_t)(intrins)->opcode)-1) + +/* odd numbered have an dynamic output */ +#define intrin_dynrout(intrins) (intrin_regmode(intrins) && reg_isdyn(intrins->out[0])) +/* Get the optional RegSet of registers modified by the intrinsic */ #define intrin_getmodrset(cts, intrins) \ ((ctype_get(cts, (intrins)->id)->size >> 16) ? 
\ ctype_get(cts, ctype_get(cts, (intrins)->id)->size >> 16)->size : 0) @@ -91,6 +137,7 @@ CTypeID1 regkind_ct[16]; #define reg_isgpr(reg) (reg_rid(reg) < RID_MAX_GPR) #define reg_isfp(reg) (reg_rid(reg) >= RID_MIN_FPR) #define reg_isvec(reg) (reg_rid(reg) >= RID_MIN_FPR && reg_kind(reg) >= REGKIND_VEC_START) +#define reg_isdyn(reg) (reg_rid(reg) == RID_DYN_GPR || reg_rid(reg) == RID_DYN_FPR) #define reg_irt(reg) (reg_isgpr(reg) ? rk_irtgpr(reg_kind(reg)) : rk_irtfpr(reg_kind(reg))) #define rk_irtgpr(kind) ((IRType)regkind_it[(kind)]) diff --git a/src/lj_target_x86.h b/src/lj_target_x86.h index c5f23082..ffcd7411 100644 --- a/src/lj_target_x86.h +++ b/src/lj_target_x86.h @@ -74,6 +74,9 @@ enum { RID_CONTEXT = RID_ECX, RID_OUTCONTEXT = RID_EDX, #endif + /* Placeholder register ids for dynamic register entries in intrinsics */ + RID_DYN_FPR = RID_MAX_FPR-1, + RID_DYN_GPR = RID_SP, }; /* -- Register sets ------------------------------------------------------- */ diff --git a/tests/intrinsic_spec.lua b/tests/intrinsic_spec.lua index ea33c1ee..2728c95f 100644 --- a/tests/intrinsic_spec.lua +++ b/tests/intrinsic_spec.lua @@ -353,6 +353,21 @@ context("__mcode", function() assert_equal(ffi.C.multi1(1.1), 1) end) + it("bad dynamic registers", function() + --No modrm specifed for the implicit output register decleared having a non void return type + assert_cdeferr([[int32_t dynerr1() __mcode("90");]]) + assert_cdeferr([[void dynerr2(int32_t a) __mcode("90");]]) + assert_cdeferr([[int32_t dynerr3(int32_t a) __mcode("90");]]) + -- no dynamic registers listed + assert_cdeferr([[void dynerr4() __mcode("90m");]]) + assert_cdeferr([[void dynerr5() __mcode("90rM");]]) + assert_cdeferr([[void dynerr6() __mcode("90Mr");]]) + --need 2 in or 1 in and a return type + assert_cdeferr([[void dynerr7(int32_t a) __mcode("90rM");]]) + --too many dynamic registers + assert_cdeferr([[void dynerr8(int a, int b, int c) __mcode("90rR");]]) + end) + it("bad ffi types mcode", function() 
assert_cdeferr([[void testffi1(float a2, ...) __mcode("90");]]) assert_cdeferr([[void testffi2(complex a2) __mcode("90");]]) @@ -382,6 +397,131 @@ context("__mcode", function() assert_error(function() idiv(1, 2, 3, 4) end) end) + it("output pointers", function() + assert_cdef([[const char* addptr(const char* nptr, int32_t n) __mcode("03rM");]], "addptr") + local s = "0123456789abcdefghijklmnopqrstvwxyz" + + local ptr = ffi.C.addptr(s, 0) + assert_equal(ptr, ffi.cast("const char*", s)) + assert_equal(ptr[0], string.byte(s)) + + local function checker(i, sptr) + assert(tostring(sptr), tostring(ptr+i)) + assert(sptr == ptr+i) + end + + assert_jitchecker(checker, function(i) + return (ffi.C.addptr(s, i)) + end) + end) + + it("signed/unsigned numbers", function() + assert_cdef([[int32_t sub_signed(int32_t n, int32_t i) __mcode("2brM");]], "sub_signed") + assert_cdef([[uint32_t sub_unsigned(uint32_t n, uint32_t i) __mcode("2brM");]], "sub_unsigned") + assert_cdef([[uint32_t sub_signedun(int32_t n, int32_t i) __mcode("2brM");]], "sub_signedun") + + assert_equal(tonumber(ffi.C.sub_unsigned(3, 1)), 2) + + local function unsignedtest(n1, n2) + return (tonumber(ffi.C.sub_unsigned(n1, n2))) + end + + assert_jit(2, unsignedtest, 3, 1) + assert_jit(2999999999, unsignedtest, 3000000000, 1) + --wrap around + assert_jit(4294967295, unsignedtest, 300, 301) + + local function unsignedtest_boxed(n1, n2) + return (ffi.C.sub_unsigned(n1, n2)) + end + + assert_jit(ffi.new("uint32_t", 2), unsignedtest_boxed, 3, 1) + assert_jit(ffi.new("uint32_t", 2999999999), unsignedtest_boxed, 3000000000, 1) + --wrap around + assert_jit(ffi.new("uint32_t", 4294967295), unsignedtest_boxed, 300, 301) + + local function signedtest(n1, n2) + return (ffi.C.sub_signed(n1, n2)) + end + + assert_jit(-2, signedtest, -1, 1) + assert_noexit(3, signedtest, -1, -4) + end) + + it("op encode", function() + assert_cdef([[int32_t not32(int32_t n) __mcode("F72m");]], "not32") + + local function test_not(i) + return 
(ffi.C.not32(i)) + end + + assert_jit(-1, test_not, 0) + assert_noexit(0, test_not, -1) + + assert_cdef([[int32_t add_imm3(int32_t n) __mcode("830mU", 3);]], "add_imm3") + + local function checker(i, n) + return i+3, n + end + assert_jitchecker(checker, function(i) + return (ffi.C.add_imm3(i)) + end) + end) + + it("prefix byte", function() + assert_cdef([[void atomicadd(int32_t* nptr, int32_t n) __mcode("01mRIP", 0xF0);]], "atomicadd") + + local sum = 0 + local function checker(i, jsum) + sum = sum+i + if(jsum ~= sum) then + return jsum, sum + end + end + + local numptr = ffi.new("int32_t[1]", 0) + + assert_jitchecker(checker, function(i) + ffi.C.atomicadd(numptr, i) + return numptr[0] + end) + end) + + if ffi.arch == "x64" then + it("prefix64", function() + assert_cdef([[void atomicadd64(int64_t* nptr, int64_t n) __mcode("01mRIP", 0xF0);]], "atomicadd64") + + local sum = 0 + local function checker(i, jsum) + sum = sum+i + assert(jsum == sum) + end + + local numptr = ffi.new("int64_t[1]", 0) + + assert_jitchecker(checker, function(i) + ffi.C.atomicadd64(numptr, i) + return numptr[0] + end) + end) + end + + it("prefix and imm byte", function() + assert_cdef([[void atomicadd1(int32_t* nptr) __mcode("830mIUP", 0xF0, 0x01);]], "atomicadd1") + + local function checker(i, jsum) + if(jsum ~= i) then + return i, jsum + end + end + + local numptr = ffi.new("int32_t[1]", 0) + + assert_jitchecker(checker, function(i) + ffi.C.atomicadd1(numptr) + return numptr[0] + end) + end) it("idiv(template)", function() assert_cdef([[void idivT(int32_t eax, int32_t ecx) __mcode("?E") __reglist(out, int32_t eax, int32_t edx)]]) --trying to create template intrinsic through C library should always fail @@ -416,6 +556,117 @@ context("__mcode", function() assert_exit(10, test_idiv, 10, 5) end) + it("prefetch", function() + assert_cdef([[void prefetch0(void* mem) __mcode("0F181mI")]], "prefetch0") + assert_cdef([[void prefetch1(void* mem) __mcode("0F182mI")]], "prefetch1") + assert_cdef([[void 
prefetch2(void* mem) __mcode("0F183mI")]], "prefetch2") + assert_cdef([[void prefetchnta(void* mem) __mcode("0F180mI")]], "prefetchnta") + + local asm = ffi.C + local kmem = ffi.new("int[4]") + local mem = 1 + mem = mem and ffi.new("int[8]", 1, 2, 3, 4, 5, 6, 7, 8) + + local function testprefetch(a, b, c) + local n = a+b + local ptr = mem+c + + asm.prefetch2(ptr) + asm.prefetch1(kmem) + asm.prefetch0(mem+a) + asm.prefetchnta(mem) + + asm.prefetch0(kmem+a) + asm.prefetch1(kmem+b) + return (ptr) ~= 0 and ptr[0] + ptr[3] + end + + assert_jit(11, testprefetch, 1, 2, 3) + end) + + it("cmpxchg", function() + assert_cdef([[void cmpxchg(int32_t* gpr32, int32_t gpr32, int32_t eax) __mcode("0FB1mRPEI", 0xF0) __reglist(out, int32_t eax);]], "cmpxchg") + + local kptr32 = ffi.new("int32_t[1]", 0) + int4[0] = 0 + + local function checker(i, n, eax) + assert(n == i) + assert(kptr32[0] == i) + assert(eax == i-1) + end + + local function test_cmpxchg(i) + local eax = ffi.C.cmpxchg(kptr32, i, i-1) + return kptr32[0], eax + end + + assert_jitchecker(checker, test_cmpxchg) + --test not equal non swapping + local num, eax = test_cmpxchg(0) + assert_equal(eax, kptr32[0]) + + num, eax = test_cmpxchg(kptr32[0]+1) + assert_equal(eax, kptr32[0]-1) + end) + +if ffi.arch == "x64" then + it("cmpxchg64", function() + assert_cdef([[void cmpxchg64(int64_t* gpr64, int64_t gpr64, int64_t rax) __mcode("0FB1mRPEIX", 0xF0) __reglist(out, int64_t rax);]], "cmpxchg64") + + local kptr64 = ffi.new("int64_t[1]", 0) + + local function test_cmpxchg64(i) + local rax = ffi.C.cmpxchg64(kptr64, -i, -(i-1)) + return kptr64[0], rax + end + + local function checker(i, newval, rax) + assert(newval == -i) + assert(kptr64[0] == -i) + assert(rax == -(i-1)) + end + + assert_jitchecker(checker, test_cmpxchg64, 2) + + --test not equal non swapping + local num, rax = test_cmpxchg64(0, 1) + assert_equal(rax, kptr64[0]) + end) +end + + it("cmpxchg8b", function() + + ffi.cdef([[typedef struct int32pair { + int32_t i1; + 
int32_t i2; + } __attribute__((aligned(8))) int32pair;]]) + + assert_cdef([[void cmpxchg8b(void* gpr32, int32_t eax, int32_t edx, int32_t ebx, int32_t ecx) __mcode("0FC71mPEI", 0xf0) + __reglist(out, int32_t eax, int32_t edx);]], "cmpxchg8b") + + local int32pair = ffi.new("int32pair") + int32pair.i1 = 1 + int32pair.i2 = -1 + + local function test_cmpxchg8b(i) + local eax,edx = ffi.C.cmpxchg8b(int32pair, i, -i, i+1, -(i+1)) + return int32pair.i1, int32pair.i2, eax, edx + end + + local function checker(i, n1, n2, eax, edx) + assert(n1 == i+1) + assert(n2 == -(i+1)) + assert(int32pair.i1 == i+1) + assert(int32pair.i2 == -(i+1)) + + assert(eax == i) + assert(edx == -i) + end + + assert_jitchecker(checker, test_cmpxchg8b) + end) + it("cpuid_brand", function() assert_cdef([[void cpuid(int32_t eax, int32_t ecx) __mcode("0FA2_E") __reglist(out, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx);]], "cpuid") @@ -526,6 +777,204 @@ context("__reglist", function() end) end) +it("popcnt", function() + assert_cdef([[int32_t popcnt(int32_t n) __mcode("f30fb8rM");]], "popcnt") + + local popcnt = ffi.C.popcnt + + assert_equal(popcnt(7), 3) + assert_equal(popcnt(1024), 1) + assert_equal(popcnt(1023), 10) + + local function testpopcnt(num) + return (popcnt(num)) + end + + assert_jit(10, testpopcnt, 1023) + assert_noexit(32, testpopcnt, -1) + assert_noexit(0, testpopcnt, 0) + assert_noexit(1, testpopcnt, 1) + + ffi.cdef([[int32_t popcntuf(int32_t n) __mcode("f30fb8rR");]]) + --check unfused + popcnt = ffi.C.popcntuf + + assert_equal(popcnt(7), 3) + assert_equal(popcnt(1024), 1) +end) + +it("addsd", function() + assert_cdef([[double addsd(double n1, double n2) __mcode("F20F58rM");]], "addsd") + local addsd = ffi.C.addsd + + function test_addsd(n1, n2) + return (addsd(n1, n2)) + end + + assert_equal(3, addsd(1, 2)) + assert_equal(0, addsd(0, 0)) + + assert_jit(-3, test_addsd, -4.5, 1.5) + assert_noexit(3, test_addsd, 4.5, -1.5) + + --check dual num exit + assert_equal(5, test_addsd(3 , 
2)) + + --test same ref input + function test_addsd2(n) + return (addsd(n, n)) + end + + assert_jit(3, test_addsd2, 1.5) + assert_noexit(-3, test_addsd2, -1.5) + + --check dual num exit + assert_equal(6, test_addsd2(3)) + + --check unfused + ffi.cdef([[double addsduf(double n1, double n2) __mcode("F20F58rR");]]) + addsd = ffi.C.addsduf + + assert_equal(3, addsd(1, 2)) + assert_equal(0, addsd(0, 0)) +end) + +it("addss", function() + assert_cdef([[float addss(float n1, float n2) __mcode("F30F58rM");]], "addss") + local addsd = ffi.C.addss + + function test_addsd(n1, n2) + return (addsd(n1, n2)) + end + + assert_equal(3, addsd(1, 2)) + assert_equal(0, addsd(0, 0)) + + assert_jit(-3, test_addsd, -4.5, 1.5) + assert_noexit(3, test_addsd, 4.5, -1.5) + --check dual num exit + assert_equal(5, test_addsd(3, 2)) + + --test same ref input + function test_addss2(n) + return (addsd(n, n)) + end + + assert_jit(-9, test_addss2, -4.5) + assert_noexit(3, test_addss2, 1.5) + + --check unfused + ffi.cdef[[float addssuf(float n1, float n2) __mcode("F30F58rR");]] + addsd = ffi.C.addssuf + + assert_equal(3, addsd(1, 2)) + assert_equal(0, addsd(0, 0)) +end) + +it("shufps", function() + assert_cdef([[float4 shufps(float4 v1, float4 v2) __mcode("0FC6rMU", 0);]], "shufps") + + local shufps = ffi.C.shufps + + local v = ffi.new("float4", 1.5, 2.25, 3.125, 4.0625) + local vzero = ffi.new("float4", 1) + + function test_shufps(v1, v2) + return (shufps(v1, v2)) + end + + local vout = shufps(v, v) + assert_equal(vout[0], 1.5) + assert_equal(vout[1], 1.5) + assert_equal(vout[2], 1.5) + assert_equal(vout[3], 1.5) + + assert_cdef([[float4 shufpsrev(float4 v1, float4 v2) __mcode("0FC6rMU", 0x1b);]], "shufpsrev") + + local vout = ffi.C.shufpsrev(v, v) + + assert_equal(vout[0], 4.0625) + assert_equal(vout[1], 3.125) + assert_equal(vout[2], 2.25) + assert_equal(vout[3], 1.5) +end) + +context("mixed register type opcodes", function() + + it("cvttsd2s", function() + assert_cdef([[int cvttsd2s(double n) 
__mcode("F20F2CrM");]], "cvttsd2s") + local cvttsd2s = ffi.C.cvttsd2s + + function test_cvttsd2s(n) + return (cvttsd2s(n)) + end + + assert_equal(0, cvttsd2s(-0)) + assert_equal(1, cvttsd2s(1)) + assert_equal(1, cvttsd2s(1.2)) + + assert_jit(3, test_cvttsd2s, 3.3) + assert_noexit(-1, test_cvttsd2s, -1.5) + --check dual num exit + assert_equal(5, test_cvttsd2s(5)) + + --check unfused + ffi.cdef([[int cvttsd2suf(double n) __mcode("F20F2CrR");]]) + cvttsd2s = ffi.C.cvttsd2suf + + assert_equal(0, cvttsd2s(-0)) + assert_equal(1, cvttsd2s(1)) + assert_equal(1, cvttsd2s(1.2)) + end) + + it("cvtsi2sd", function() + assert_cdef([[double cvtsi2sd(int32_t n) __mcode("F20F2ArM");]], "cvtsi2sd") + local cvtsi2sd = ffi.C.cvtsi2sd + + function test_cvtsi2sd(n1, n2) + return (cvtsi2sd(n1)+n2) + end + + assert_equal(0.5, test_cvtsi2sd(0, 0.5)) + assert_equal(1.25, test_cvtsi2sd(1.0, 0.25)) + assert_equal(-1.5, test_cvtsi2sd(-2, 0.5)) + + assert_jit(3.25, test_cvtsi2sd, 3, 0.25) + assert_noexit(-1.5, test_cvtsi2sd, -2, 0.5) + + --check dual num exit + assert_equal(11, test_cvtsi2sd(5, 6)) + + --check unfused + ffi.cdef([[double cvtsi2sduf(int32_t n) __mcode("F20F2ArR");]]) + cvtsi2sd = ffi.C.cvtsi2sduf + assert_equal(0.5, test_cvtsi2sd(0, 0.5)) + assert_equal(1.25, test_cvtsi2sd(1.0, 0.25)) + assert_equal(-1.5, test_cvtsi2sd(-2, 0.5)) + end) + + it("pextrw", function() + local v = ffi.new("byte16", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + + assert_cdef([[int32_t pextrw_0(byte16 v) __mcode("660FC5mRU", 0);]], "pextrw_0") + assert_equal(0x0201, ffi.C.pextrw_0(v)) + + assert_cdef([[int32_t pextrw_7(byte16 v) __mcode("660FC5mRU", 7);]], "pextrw_7") + assert_equal(0x100f, ffi.C.pextrw_7(v)) + end) + + it("pinsrw", function() + assert_cdef([[int4 pinsrw_0(byte16 v, int32_t word) __mcode("660FC4rMU", 0);]], "pinsrw_0") + + local v = ffi.new("byte16", 0) + local vout = ffi.C.pinsrw_0(v, 0xf0f1) + assert_equal(0xf0f1, vout[0]) + + assert_cdef([[int4 pinsrw_7(byte16 v, int32_t 
word) __mcode("660FC4rMU", 7);]], "pinsrw_7")
+    vout = ffi.C.pinsrw_7(v, 0xf0f1)
+    assert_equal(0xf0f1, (vout[3] % 2^32) / 2^16)
+  end)
+end)
 end)