Implement support for opcodes with dynamic registers

This commit is contained in:
fsfod 2016-03-29 11:10:15 +01:00
parent e6fecee925
commit de4c0b6ea5
8 changed files with 996 additions and 24 deletions

View File

@ -2452,41 +2452,53 @@ typedef struct IntrinBuildState {
RegSet inset, outset, modregs; RegSet inset, outset, modregs;
uint32_t spadj, contexspill, contexofs; uint32_t spadj, contexspill, contexofs;
uint8_t outcontext; uint8_t outcontext;
char vzeroupper; char vzeroupper, fuse;
} IntrinBuildState; } IntrinBuildState;
static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info) static void intrins_setup(CIntrinsic *intrins, IntrinBuildState *info)
{ {
MSize offset = 0, i; MSize offset = 0, i;
int dynreg = intrin_regmode(intrins);
int dynout = intrin_dynrout(intrins);
memcpy(info->in, intrins->in, LJ_INTRINS_MAXREG); memcpy(info->in, intrins->in, LJ_INTRINS_MAXREG);
memcpy(info->out, intrins->out, LJ_INTRINS_MAXREG); memcpy(info->out, intrins->out, LJ_INTRINS_MAXREG);
info->contexofs = -1; info->contexofs = -1;
info->fuse = dynreg &&
!(intrins->flags & (INTRINSFLAG_NOFUSE|INTRINSFLAG_INDIRECT));
for (i = 0; i < intrins->insz; i++) { for (i = 0; i < intrins->insz; i++) {
Reg r = reg_rid(info->in[i]); Reg r = reg_rid(info->in[i]);
int isdyn = dynreg && i < intrins->dyninsz;
lua_assert(!isdyn || reg_isdyn(info->in[i]));
if (reg_kind(info->in[i]) == REGKIND_V256) if (reg_kind(info->in[i]) == REGKIND_V256)
info->vzeroupper = 1; info->vzeroupper = 1;
if (reg_isgpr(info->in[i])) { if (reg_isgpr(info->in[i])) {
if (r == RID_CONTEXT) { if (!isdyn && r == RID_CONTEXT) {
/* Save the offset in the input context so we can load it last */ /* Save the offset in the input context so we can load it last */
info->contexofs = offset; info->contexofs = offset;
} }
offset += sizeof(intptr_t); offset += sizeof(intptr_t);
} }
if (!isdyn)
rset_set(info->inset, r); rset_set(info->inset, r);
} }
for (i = 0; i < intrins->outsz; i++) { for (i = 0; i < intrins->outsz; i++) {
if (reg_kind(info->out[i]) == REGKIND_V256) if (reg_kind(info->out[i]) == REGKIND_V256)
info->vzeroupper = 1; info->vzeroupper = 1;
if (i == 0 && dynout) continue;
rset_set(info->outset, reg_rid(info->out[i])); rset_set(info->outset, reg_rid(info->out[i]));
} }
/* Don't try to fuse if a fixed register is the same as the input context */
if (info->contexofs != -1)
info->fuse = 0;
/* TODO: dynamic output context register selection */ /* TODO: dynamic output context register selection */
info->outcontext = RID_OUTCONTEXT; info->outcontext = RID_OUTCONTEXT;
info->modregs |= info->outset|info->inset; info->modregs |= info->outset|info->inset;
@ -2506,6 +2518,7 @@ static void intrins_loadregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState
/* Finally load the input register conflicting with the input context */ /* Finally load the input register conflicting with the input context */
if (rset_test(info->inset, RID_CONTEXT) && info->contexofs != -1) { if (rset_test(info->inset, RID_CONTEXT) && info->contexofs != -1) {
lua_assert(!info->fuse);
emit_loadofsirt(as, IRT_INTP, RID_CONTEXT, RID_CONTEXT, info->contexofs); emit_loadofsirt(as, IRT_INTP, RID_CONTEXT, RID_CONTEXT, info->contexofs);
} }
@ -2513,6 +2526,16 @@ static void intrins_loadregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState
for (i = 0; i < intrins->insz; i++) { for (i = 0; i < intrins->insz; i++) {
uint32_t reg = info->in[i]; uint32_t reg = info->in[i];
Reg r = reg_rid(reg); Reg r = reg_rid(reg);
if (i == 0 && info->fuse) {
lua_assert(info->contexofs == -1);
/* The load is fused into the modrm of the opcode emitted in emit_intrins */
if (reg_isgpr(reg)) {
gpr++;
} else {
fpr++;
}
continue;
}
if (reg_isgpr(reg)) { if (reg_isgpr(reg)) {
if (r != RID_CONTEXT) if (r != RID_CONTEXT)
@ -2559,6 +2582,32 @@ static void intrins_saveregs(ASMState *as, CIntrinsic *intrins, IntrinBuildState
} }
} }
/*
** Assign concrete registers to a list of dynamic placeholder entries.
**
** Each entry of list[0..sz-1] keeps its other bits and has its register id
** replaced (via reg_setrid) by a register taken from freeset. GPR entries are
** served from RSET_GPR and all other entries from RSET_FPR. Registers in
** RSET_SCRATCH are preferred whenever at least one is still available in the
** matching class.
**
** Returns the set of registers that were handed out.
*/
static RegSet pickdynlist(uint8_t *list, MSize sz, RegSet freeset)
{
  RegSet avail = freeset;
  MSize n;

  for (n = 0; n < sz; n++) {
    /* Restrict the candidates to the register class of this entry */
    RegSet cand = avail & (reg_isgpr(list[n]) ? RSET_GPR : RSET_FPR);
    Reg picked;

    /* Narrow to scratch registers only when that leaves something to pick */
    if (cand & RSET_SCRATCH)
      cand &= RSET_SCRATCH;

    picked = rset_pickbot(cand);
    lua_assert(rset_test(avail, picked));

    list[n] = reg_setrid(list[n], picked);
    /* A register is never handed out twice within one list */
    rset_clear(avail, picked);
  }

  /* Whatever is missing from the working set is what we consumed */
  return freeset & ~avail;
}
/* /*
** Stack spill slots and gpr slots in the context are always the size of a native pointer ** Stack spill slots and gpr slots in the context are always the size of a native pointer
** The output context register is always spilled to a fixed stack offset ** The output context register is always spilled to a fixed stack offset
@ -2577,7 +2626,10 @@ static void wrap_intrins(jit_State *J, CIntrinsic *intrins, IntrinWrapState *sta
AsmHeader *hdr; AsmHeader *hdr;
MCode *asmofs = NULL, *origtop; MCode *asmofs = NULL, *origtop;
void* target = state->target; void* target = state->target;
uint8_t *in = info.in, *out = info.out;
int spadj = 0; int spadj = 0;
int dynreg = intrin_regmode(intrins);
Reg rout = RID_NONE, rin = RID_NONE;
lj_asm_setup_intrins(J, as); lj_asm_setup_intrins(J, as);
origtop = as->mctop; origtop = as->mctop;
@ -2586,6 +2638,60 @@ static void wrap_intrins(jit_State *J, CIntrinsic *intrins, IntrinWrapState *sta
info.modregs = state->mod; info.modregs = state->mod;
intrins_setup(intrins, &info); intrins_setup(intrins, &info);
/* Pick some ABI specific scratch registers for the opcode's input/output registers */
if (dynreg) {
RegSet scatch = RSET_ALL & ~info.inset;
int inofs = 0;
lua_assert(intrins->dyninsz <= intrins->insz);
/* Avoid unnecessary spill of the output context */
if (intrins->outsz != 0)
rset_clear(scatch, info.outcontext);
if (dynreg == DYNREG_OPEXT || dynreg == DYNREG_TWOSTORE || reg_isvec(in[0])) {
info.fuse = 0;
}
if (info.fuse) {
inofs++;
rin = RID_CONTEXT;
rset_clear(scatch, RID_CONTEXT);
}
if ((intrins->dyninsz-inofs) > 0) {
rset_clear(scatch, RID_CONTEXT);
/* Merge in registers used for dynamic input registers */
info.inset |= pickdynlist(in+inofs, intrins->dyninsz-inofs, scatch);
}
if (rin == RID_NONE)
rin = reg_rid(in[0]);
/* Allocate the dynamic output register */
if (intrins->outsz > 0 && intrin_dynrout(intrins)) {
if (dynreg == DYNREG_INOUT) {
rout = reg_rid(in[1]);
out[0] = reg_setrid(out[0], rout);
} else if (dynreg == DYNREG_OPEXT) {
/* Destructive single register opcode */
rout = out[0] = reg_setrid(out[0], rin);
} else {
scatch = RSET_INIT & ~info.outset;
rset_clear(scatch, info.outcontext);
scatch = pickdynlist(out, 1, scatch);
rout = reg_rid(out[0]);
}
rset_set(info.outset, rout);
}
if (rout == RID_NONE && intrins->dyninsz > 1) {
lua_assert(reg_isdyn(intrins->in[1]));
rout = reg_rid(in[1]);
}
info.modregs |= info.inset|info.outset;
}
/* Used for picking scratch register when loading or saving boxed values */ /* Used for picking scratch register when loading or saving boxed values */
as->modset = info.modregs|RID_CONTEXT; as->modset = info.modregs|RID_CONTEXT;
@ -2651,16 +2757,49 @@ restart:
emit_storeofsirt(as, IRT_INTP, info.outcontext, RID_SP, TEMPSPILL); emit_storeofsirt(as, IRT_INTP, info.outcontext, RID_SP, TEMPSPILL);
} }
if (intrins->flags & INTRINSFLAG_CALLED) { #if LJ_TARGET_X86ORX64
Reg rin = 0; /* Setup modrm to tobe a load from the input context pointer we assume offset
* will be to the first value in either the gpr or fpr part of the context
* because the first input register should always be the dynamic one for opcodes
*/
as->mrm.idx = RID_NONE;
as->mrm.scale = XM_SCALE1;
as->mrm.ofs = 0;
if (dynreg) {
if (info.fuse || (intrins->flags & INTRINSFLAG_INDIRECT)) {
lua_assert(!reg_isvec(in[0]));
as->mrm.base = rin;
rin = RID_MRM;
if (info.fuse) {
/* Set the fused offset into the input context */
if (reg_isfp(in[0])) {
as->mrm.ofs = offsetof(RegContext, fpr);
} else {
as->mrm.ofs = offsetof(RegContext, gpr);
}
}
} else {
as->mrm.base = RID_NONE;
lua_assert(rin != RID_NONE);
}
} else if(intrins->flags & INTRINSFLAG_CALLED) {
#if LJ_64 #if LJ_64
/* Pick a scratch register in case the relative distance for the call is /* Pick a scratch register in case the relative distance for the call is
** larger than a signed 32bit value ** larger than a signed 32bit value
*/ */
rin = intrinsic_scratch(as, RSET_GPR); rin = intrinsic_scratch(as, RSET_GPR);
#endif #endif
}
#endif
if (intrins->flags & INTRINSFLAG_CALLED) {
/* emit a call to the target which may be collocated after us */ /* emit a call to the target which may be collocated after us */
emit_intrins(as, intrins, rin, (uintptr_t)target); emit_intrins(as, intrins, rin, (uintptr_t)target);
} else if (dynreg) {
/* Write an opcode to the wrapper */
asmofs = emit_intrins(as, intrins, rin, rout);
} else { } else {
/* Append the user supplied machine code */ /* Append the user supplied machine code */
asmofs = asm_mcode(as, state->target, state->targetsz); asmofs = asm_mcode(as, state->target, state->targetsz);

View File

@ -1204,6 +1204,18 @@ static void cp_decl_msvcattribute(CPState *cp, CPDecl *decl)
#if LJ_HASINTRINSICS #if LJ_HASINTRINSICS
/*
** Return the current integer token as a single byte.
**
** Reports a token error through cp_err_token() when the current token is not
** CTOK_INTEGER. Negative values are flattened to their signed 8 bit encoding;
** the returned byte only ever holds the low 8 bits of the literal.
** NYI: immediate values larger than 8 bits.
*/
static uint8_t getsignedbyte(CPState *cp)
{
  int32_t num;

  if (cp->tok != CTOK_INTEGER)
    cp_err_token(cp, CTOK_INTEGER);

  num = cp->val.i32;
  if (num < 0)
    return (uint8_t)(int8_t)num;
  return (uint8_t)num;
}
static void cp_decl_mcode(CPState *cp, CPDecl *decl) static void cp_decl_mcode(CPState *cp, CPDecl *decl)
{ {
/* Check were declared after a function definition */ /* Check were declared after a function definition */
@ -1224,6 +1236,17 @@ static void cp_decl_mcode(CPState *cp, CPDecl *decl)
decl->redir = cp->str; decl->redir = cp->str;
cp_next(cp); cp_next(cp);
/* Check if we have immediate and prefix byte values */
if (cp_opt(cp, ',')) {
/* NYI: immediate values larger than 8 bits */
decl->bits = (CTSize)getsignedbyte(cp);
cp_next(cp);
if (cp_opt(cp, ',')) {
decl->bits |= getsignedbyte(cp) << 8;
cp_next(cp);
}
}
cp_check(cp, ')'); cp_check(cp, ')');
/* Mark the function as an intrinsic */ /* Mark the function as an intrinsic */
decl->stack[decl->top-1].info |= CTF_INTRINS; decl->stack[decl->top-1].info |= CTF_INTRINS;

View File

@ -175,7 +175,15 @@ typedef int (LJ_FASTCALL *IntrinsicWrapper)(void *incontext, void* outcontext);
typedef struct CIntrinsic { typedef struct CIntrinsic {
IntrinsicWrapper wrapped; IntrinsicWrapper wrapped;
union {
uint8_t in[8]; uint8_t in[8];
struct {
uint8_t opregs[5]; /* cmpxchg8b */
uint8_t immb;
uint8_t prefix; /* prefix byte see INTRINSFLAG_PREFIX */
uint8_t dyninsz; /* dynamic input register count */
};
};
union { union {
uint8_t out[8]; uint8_t out[8];
struct { struct {

View File

@ -650,7 +650,29 @@ static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
static MCode* emit_intrins(ASMState *as, CIntrinsic *intrins, Reg r1, static MCode* emit_intrins(ASMState *as, CIntrinsic *intrins, Reg r1,
uintptr_t r2) uintptr_t r2)
{ {
if (intrins->flags & INTRINSFLAG_CALLED) { uint32_t regmode = intrin_regmode(intrins);
if (regmode) {
if (regmode == DYNREG_OPEXT) {
r2 = intrin_getopextb(intrins);
}
/* force 64 bit operands */
if (intrins->flags & INTRINSFLAG_REXW) {
r2 |= REX_64;
}
if (intrins->flags & INTRINSFLAG_IMMB) {
*--as->mcp = intrins->immb;
}
emit_mrm(as, intrins->opcode, (Reg)r2, r1);
if (intrins->flags & INTRINSFLAG_PREFIX) {
*--as->mcp = intrins->prefix;
}
checkmclim(as);
} else if (intrins->flags & INTRINSFLAG_CALLED) {
lua_assert(r2); lua_assert(r2);
emit_call_(as, (MCode*)r2, r1); emit_call_(as, (MCode*)r2, r1);
return NULL; return NULL;

View File

@ -24,6 +24,7 @@
typedef enum RegFlags { typedef enum RegFlags {
REGFLAG_64BIT = REGKIND_GPR64 << 6, /* 64 bit override */ REGFLAG_64BIT = REGKIND_GPR64 << 6, /* 64 bit override */
REGFLAG_BLACKLIST = 1 << 17, REGFLAG_BLACKLIST = 1 << 17,
REGFLAG_DYN = 1 << 18,
}RegFlags; }RegFlags;
typedef struct RegEntry { typedef struct RegEntry {
@ -55,6 +56,8 @@ RegEntry reglut[] = {
#if LJ_64 #if LJ_64
GPRDEF_R64(MKREG_GPR64) GPRDEF_R64(MKREG_GPR64)
#endif #endif
{"gpr32", REGFLAG_DYN|RID_DYN_GPR},
{"gpr64", REGFLAG_64BIT|REGFLAG_DYN|RID_DYN_GPR}
}; };
static CTypeID register_intrinsic(lua_State *L, CIntrinsic* src, CType *func) static CTypeID register_intrinsic(lua_State *L, CIntrinsic* src, CType *func)
@ -118,7 +121,9 @@ static int parse_fprreg(const char *name, uint32_t len)
} }
rid += RID_MIN_FPR; rid += RID_MIN_FPR;
} else { } else {
return -1; /* Unnumbered reg is considered a placeholder for a dynamic reg */
flags = REGFLAG_DYN;
rid = RID_DYN_FPR;
} }
if (name[0] == 'y') { if (name[0] == 'y') {
@ -192,7 +197,7 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
CTypeID liststart) CTypeID liststart)
{ {
CTState *cts = ctype_cts(L); CTState *cts = ctype_cts(L);
uint32_t i, count = 0; uint32_t i, count = 0, dyncount = 0;
RegSet rset = 0; RegSet rset = 0;
const char *listname; const char *listname;
uint8_t *regout = NULL; uint8_t *regout = NULL;
@ -232,11 +237,20 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
r = reg_rid(reg); r = reg_rid(reg);
/* Check for duplicate registers in the list */ if (reg & REGFLAG_DYN) {
if (regsetid == REGSET_MOD)
lj_err_callerv(L, LJ_ERR_FFI_BADREG, "cannot use dynamic register", strdata(str), listname);
if (++dyncount > LJ_INTRINS_MAXDYNREG) {
lj_err_callerv(L, LJ_ERR_FFI_BADREG, "too many dynamic", strdata(str), listname);
}
} else {
/* Check for duplicate fixed registers in the list */
if (rset_test(rset, r)) { if (rset_test(rset, r)) {
lj_err_callerv(L, LJ_ERR_FFI_BADREG, "duplicate", strdata(str), listname); lj_err_callerv(L, LJ_ERR_FFI_BADREG, "duplicate", strdata(str), listname);
} }
rset_set(rset, r); rset_set(rset, r);
}
if (regsetid == REGSET_OUT && reg_isgpr(reg)) { if (regsetid == REGSET_OUT && reg_isgpr(reg)) {
CType *ct = ctype_rawchild(cts, ctarg); CType *ct = ctype_rawchild(cts, ctarg);
@ -261,6 +275,9 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
if (regsetid == REGSET_IN) { if (regsetid == REGSET_IN) {
intrins->insz = (uint8_t)count; intrins->insz = (uint8_t)count;
if (dyncount != 0) {
intrins->dyninsz = dyncount;
}
} else if (regsetid == REGSET_OUT) { } else if (regsetid == REGSET_OUT) {
intrins->outsz = (uint8_t)count; intrins->outsz = (uint8_t)count;
} }
@ -268,15 +285,91 @@ static RegSet process_reglist(lua_State *L, CIntrinsic *intrins, int regsetid,
return rset; return rset;
} }
/*
** Parse the mode characters that follow the opcode digits of an __mcode
** string and translate them into intrinsic flags.
**
** Recognised characters:
**   m, M  memory/modrm operand markers (m records 1, M records 2)
**   r, R  register operand markers (r records 1; R records 2, or 3 when a
**         register marker was already seen)
**   U     INTRINSFLAG_IMMB
**   C     INTRINSFLAG_CALLED
**   X     INTRINSFLAG_REXW
**   P     INTRINSFLAG_PREFIX
**   I     INTRINSFLAG_INDIRECT
**   E     INTRINSFLAG_EXPLICTREGS
**
** When any r/m marker is present a DYNREG_* register mode is merged into the
** low bits of the result.
**
** Returns the combined flags (>= 0) on success. On failure returns a negative
** value: -(index+1) for the first unrecognised character, or -1 when the
** register/memory markers do not form a supported combination. Note that -1
** is therefore also returned for an unrecognised character at index 0.
*/
static int parse_opmode(const char *op, MSize len)
{
  MSize i = 0;
  int m = 0;
  int r = 0;
  int flags = 0;

  for (; i < len; i++) {
    switch (op[i]) {
    case 'm':
      m = 1;
      break;
    case 'M':
      m = 2;
      break;
    /* modrm register */
    case 'r':
      r = 1;
      break;
    case 'R':
      r = r == 0 ? 2 : 3;
      break;
    case 'U':
      flags |= INTRINSFLAG_IMMB;
      break;
    case 'C':
      flags |= INTRINSFLAG_CALLED;
      break;
    case 'X':
      flags |= INTRINSFLAG_REXW;
      break;
    case 'P':
      flags |= INTRINSFLAG_PREFIX;
      break;
    case 'I':
      flags |= INTRINSFLAG_INDIRECT;
      break;
    case 'E':
      flags |= INTRINSFLAG_EXPLICTREGS;
      break;
    default:
      /* return index of invalid flag */
      return -(int)(i+1);
    }
  }

  /* Both sides are boolean conditions, so combine them with a logical AND */
  if ((r || m) && !(flags & INTRINSFLAG_REGMODEMASK)) {
    /* 'Rm' mem/r is left reg is right */
    if (r == 2 && m == 1) {
      flags |= DYNREG_TWOSTORE; /* MR */
    } else if (r == 0 && m == 1) {
      flags |= DYNREG_OPEXT;
    } else if ((r == 1 && m == 2) || r == 3) {
      flags |= DYNREG_ONE; /* RM */
    } else {
      return -1;
    }

    /* If neither operand is listed as memory, don't try to fuse a load in */
    if (r == 3) {
      flags |= INTRINSFLAG_NOFUSE; /* rR */
    }
  }

  return flags;
}
static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode) static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode)
{ {
int len; int len;
uint32_t opext = 0;
if (opcode == 0) { if (opcode == 0) {
lj_err_callermsg(L, "bad opcode literal"); lj_err_callermsg(L, "bad opcode literal");
} }
#if LJ_TARGET_X86ORX64 #if LJ_TARGET_X86ORX64
/* the LSB of the opcode should be the register number */
if (intrin_regmode(intrins) == DYNREG_OPEXT) {
opext = (opcode & 7);
opcode = opcode >> 4;
}
if (opcode <= 0xff) { if (opcode <= 0xff) {
len = 1; len = 1;
} else if (opcode <= 0xffff) { } else if (opcode <= 0xffff) {
@ -288,11 +381,16 @@ static void setopcode(lua_State *L, CIntrinsic *intrins, uint32_t opcode)
} }
opcode = lj_bswap(opcode); opcode = lj_bswap(opcode);
if (len < 4) { if (len < 4) {
opcode |= (uint8_t)(int8_t)-(len+1); opcode |= (uint8_t)(int8_t)-(len+1);
} else { } else {
lj_err_callermsg(L, "bad opcode literal"); lj_err_callermsg(L, "bad opcode literal");
} }
if (intrin_regmode(intrins) == DYNREG_OPEXT) {
intrin_setopextb(intrins, opext);
}
#endif #endif
intrins->opcode = opcode; intrins->opcode = opcode;
@ -303,6 +401,7 @@ static int parse_opstr(lua_State *L, GCstr *opstr, CIntrinsic *intrins, int* bui
const char *op = strdata(opstr); const char *op = strdata(opstr);
uint32_t opcode = 0; uint32_t opcode = 0;
uint32_t i; uint32_t i;
int flags;
/* Parse the opcode number if this is not a template */ /* Parse the opcode number if this is not a template */
if (op[0] != '?') { if (op[0] != '?') {
@ -320,11 +419,22 @@ static int parse_opstr(lua_State *L, GCstr *opstr, CIntrinsic *intrins, int* bui
opcode = (opcode << 4) + (d & 15); opcode = (opcode << 4) + (d & 15);
} }
if (*op == '_') op++;
} else { } else {
*buildflags |= INTRINSFLAG_TEMPLATE; *buildflags |= INTRINSFLAG_TEMPLATE;
op++; op++;
} }
flags = parse_opmode(op, opstr->len - (MSize)(op-strdata(opstr)));
if (flags < 0) {
lj_err_callerv(L, LJ_ERR_FFI_BADOPSTR, strdata(opstr), "bad mode flags");
} else {
intrins->flags |= flags;
}
/* Flags only used during construction of the intrinsic in the upper bits*/
*buildflags |= flags & 0xffff0000;
return opcode; return opcode;
} }
@ -378,7 +488,8 @@ CTypeID lj_intrinsic_template(lua_State *L, int narg)
intrins = lj_intrinsic_get(cts, ct->size); intrins = lj_intrinsic_get(cts, ct->size);
/* Can't be a template if it an opcode */ /* Can't be a template if it an opcode */
if ((intrins->opcode && intrins->outsz <= 4) || intrins->wrapped) if (intrin_regmode(intrins) != DYNREG_FIXED || (intrins->opcode && intrins->outsz <= 4) ||
intrins->wrapped)
lj_err_arg(L, narg, LJ_ERR_FFI_INVTYPE); lj_err_arg(L, narg, LJ_ERR_FFI_INVTYPE);
return id; return id;
@ -407,21 +518,80 @@ int lj_intrinsic_create(lua_State *L)
return 1; return 1;
} }
/*
** Infer a dynamic register descriptor from a C parameter or return type.
**
** Numbers map to a dynamic FPR (float/double) or a dynamic GPR (integers),
** pointers map to a dynamic GPR unless they point at a vector type, and
** vectors (direct or behind a pointer) map to a dynamic FPR with a 128 or
** 256 bit kind. Shared intrinsic flags (INTRINSFLAG_REXW, INTRINSFLAG_VEX256)
** are OR'ed into the register id before it is packed with reg_make().
**
** Returns the packed register value, or -1 if the type is not supported.
*/
static int inferreg(CTState *cts, CType *ct)
{
  int rid, kind;

  if (ctype_isnum(ct->info)) {
    CTSize sz = ct->size;
    if (ctype_isfp(ct->info)) {
      if (sz > 8)
        return -1;
      rid = RID_DYN_FPR;
      kind = (sz == 4) ? REGKIND_FPR32 : REGKIND_FPR64;
    } else if (sz == 8) {
      if (LJ_32)
        return -1; /* NYI: 64 bit pair registers */
      rid = RID_DYN_GPR | INTRINSFLAG_REXW;
      kind = REGKIND_GPR64;
    } else {
      rid = RID_DYN_GPR;
      kind = (ct->info & CTF_UNSIGNED) ? REGKIND_GPR32CD : REGKIND_GPRI32;
    }
    return reg_make(rid, kind);
  }

  if (ctype_isptr(ct->info)) {
    /* Look through the pointer; only pointers to vectors need more checks */
    ct = ctype_raw(cts, ctype_cid(ct->info));
    if (!ctype_isvector(ct->info)) {
      rid = RID_DYN_GPR;
      kind = LJ_32 ? REGKIND_GPR32CD : REGKIND_GPR64;
      return reg_make(rid, kind);
    }
  } else if (!ctype_isvector(ct->info)) {
    lua_assert(ctype_iscomplex(ct->info));
    return -1;
  }

  /* From here on ct is a vector type, reached directly or through a pointer */
  {
    CType *vtype = ctype_raw(cts, ctype_cid(ct->info));
    if (ctype_typeid(cts, vtype) < CTID_BOOL ||
        ctype_typeid(cts, vtype) > CTID_DOUBLE ||
        (ct->size != 16 && ct->size != 32)) {
      return -1;
    }
  }

  if (ct->size == 32) {
    rid = RID_DYN_FPR | INTRINSFLAG_VEX256;
    kind = REGKIND_V256;
  } else {
    rid = RID_DYN_FPR;
    kind = REGKIND_V128;
  }
  return reg_make(rid, kind);
}
GCcdata *lj_intrinsic_createffi(CTState *cts, CType *func) GCcdata *lj_intrinsic_createffi(CTState *cts, CType *func)
{ {
GCcdata *cd; GCcdata *cd;
CIntrinsic *intrins = lj_intrinsic_get(cts, func->size); CIntrinsic *intrins = lj_intrinsic_get(cts, func->size);
CTypeID id = ctype_typeid(cts, func); CTypeID id = ctype_typeid(cts, func);
RegSet mod = intrin_getmodrset(cts, intrins); RegSet mod = intrin_getmodrset(cts, intrins);
uint32_t op = intrins->opcode;
void* mcode = ((char*)&op) + (4-intrin_oplen(intrins));
if (intrins->opcode == 0) { if (intrins->opcode == 0) {
lj_err_callermsg(cts->L, "expected non template intrinsic"); lj_err_callermsg(cts->L, "expected non template intrinsic");
} }
/* Build the interpreter wrapper */
if (intrin_regmode(intrins) == DYNREG_FIXED) {
uint32_t op = intrins->opcode;
void* mcode = ((char*)&op) + (4-intrin_oplen(intrins));
intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, mcode, intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, mcode,
intrin_oplen(intrins), mod); intrin_oplen(intrins), mod);
} else {
intrins->wrapped = lj_intrinsic_buildwrap(cts->L, intrins, NULL, 0, mod);
}
cd = lj_cdata_new(cts, id, CTSIZE_PTR); cd = lj_cdata_new(cts, id, CTSIZE_PTR);
*(void **)cdataptr(cd) = intrins->wrapped; *(void **)cdataptr(cd) = intrins->wrapped;
@ -433,8 +603,9 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm)
CTState *cts = ctype_cts(L); CTState *cts = ctype_cts(L);
CType *func = ctype_get(cts, fid); CType *func = ctype_get(cts, fid);
CTypeID sib = func->sib, retid = ctype_cid(func->info); CTypeID sib = func->sib, retid = ctype_cid(func->info);
RegSet routset = 0;
uint32_t opcode; uint32_t opcode;
int buildflags = 0; int buildflags = 0, dynout = 0;
CIntrinsic _intrins; CIntrinsic _intrins;
CIntrinsic* intrins = &_intrins; CIntrinsic* intrins = &_intrins;
memset(intrins, 0, sizeof(CIntrinsic)); memset(intrins, 0, sizeof(CIntrinsic));
@ -445,18 +616,62 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm)
return 0; return 0;
} }
if (sib) { if (buildflags & INTRINSFLAG_EXPLICTREGS) {
process_reglist(L, intrins, REGSET_IN, sib); process_reglist(L, intrins, REGSET_IN, sib);
} else {
/* Infer the types of input register based on parameter types */
while (sib != 0) {
CType *arg = ctype_get(cts, sib);
CType *ct = ctype_rawchild(cts, arg);
int reg = inferreg(cts, ct);
sib = arg->sib;
if (reg == -1) {
return 0;
} }
/* Save the register info in place of the argument index */
arg->size = reg & 0xff;
setarg_casttype(cts, arg, ct);
/* Merge shared register flags */
intrins->flags |= reg & 0xff00;
intrins->in[intrins->insz++] = reg & 0xff;
intrins->dyninsz++;
if (intrins->dyninsz > LJ_INTRINS_MAXDYNREG)
return 0;
if (sib != 0 && intrins->insz == LJ_INTRINS_MAXREG) {
return 0;
}
}
}
if (retid != CTID_VOID) { if (retid != CTID_VOID) {
CType *ct = ctype_get(cts, retid); CType *ct = ctype_get(cts, retid);
/* Check if the intrinsic had __reglist declared on it */ /* Check if the intrinsic had __reglist declared on it */
if (ctype_isfield(ct->info)) { if (ctype_isfield(ct->info)) {
process_reglist(L, intrins, REGSET_OUT, retid); routset = process_reglist(L, intrins, REGSET_OUT, retid);
sib = retid; sib = retid;
} else {
int reg = inferreg(cts, ct);
if (reg == -1) {
return 0;
}
/* Merge shared register flags */
intrins->flags |= reg & 0xff00;
/* Create a field entry for the return value that we make the ctype child
** of the function.
*/
sib = lj_ctype_new(cts, &ct);
ct->info = CTINFO(CT_FIELD, retid);
ct->size = reg;
intrins->out[intrins->outsz++] = reg & 0xff;
dynout = 1;
} }
} else { } else {
sib = retid; sib = retid;
@ -466,6 +681,60 @@ int lj_intrinsic_fromcdef(lua_State *L, CTypeID fid, GCstr *opstr, uint32_t imm)
if (opcode) { if (opcode) {
setopcode(L, intrins, opcode); setopcode(L, intrins, opcode);
} }
if (intrin_regmode(intrins) == DYNREG_FIXED) {
/* dyninsz is overlapped by input registers 6/7/8 */
if ((intrins->insz < 6 && intrins->dyninsz > 0) || dynout) {
lj_err_callerv(L, LJ_ERR_FFI_BADOPSTR, strdata(opstr),
"no register mode specified for dynamic registers");
}
}
#if LJ_TARGET_X86ORX64
/* Validate dynamic register count for the specified register mode*/
if (intrin_regmode(intrins) == DYNREG_ONE){
if (intrins->dyninsz == 2 && intrins->outsz == 1 && routset == 0) {
/* Infer destructive opcode if the single out */
intrin_setregmode(intrins, DYNREG_INOUT);
} else if(intrins->dyninsz == 2){
intrin_setregmode(intrins, DYNREG_TWOIN);
} else if (intrins->dyninsz == 0 || intrins->outsz == 0 ||
!reg_isdyn(intrins->out[0])) {
return 0;
}
}else if (intrin_regmode(intrins) == DYNREG_TWOSTORE) {
if (intrins->dyninsz == 1 && intrins->outsz != 0) {
intrin_setregmode(intrins, DYNREG_ONESTORE);
} else if (intrins->insz == 0 || intrins->dyninsz == 0) {
/* Store opcodes need at least an address the value could be an immediate */
return 0;
}
} else if (intrin_regmode(intrins) == DYNREG_OPEXT) {
if (intrins->dyninsz != 1)
return 0;
}
/* Swap the registers from their declared order to match the order in which
** they are processed
*/
if (intrin_regmode(intrins) >= DYNREG_SWAPREGS) {
uint8_t temp = intrins->in[0];
intrins->in[0] = intrins->in[1]; intrins->in[1] = temp;
}
#endif
if (intrins->flags & INTRINSFLAG_PREFIX) {
intrins->prefix = (uint8_t)imm;
/* Prefix value should be declared before an immediate value in the
** __mcode definition the second number declared is shifted right when
** packed in the ctype.
*/
imm >>= 8;
}
if (intrins->flags & INTRINSFLAG_IMMB) {
intrins->immb = (uint8_t)(imm & 0xff);
}
register_intrinsic(L, intrins, ctype_get(cts, fid)); register_intrinsic(L, intrins, ctype_get(cts, fid));
lua_assert(sib > 0 && sib < cts->top); lua_assert(sib > 0 && sib < cts->top);
@ -567,6 +836,18 @@ int lj_intrinsic_call(CTState *cts, CType *ct)
lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg+1) | CCF_INTRINS_ARG); lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg+1) | CCF_INTRINS_ARG);
} }
/* Swap input values around to match the platform ordering the wrapper expects */
if (intrin_regmode(intrins) >= DYNREG_SWAPREGS &&
reg_isgpr(intrins->in[0]) == reg_isgpr(intrins->in[1])) {
if (reg_isgpr(intrins->in[0])) {
intptr_t temp = context.gpr[0];
context.gpr[0] = context.gpr[1]; context.gpr[1] = temp;
} else {
double temp = context.fpr[0];
context.fpr[0] = context.fpr[1]; context.fpr[1] = temp;
}
}
/* Pass in the return type chain so the results are typed */ /* Pass in the return type chain so the results are typed */
outcontent = setup_results(L, intrins, ctype_cid(ctype_get(cts, funcid)->info)); outcontent = setup_results(L, intrins, ctype_cid(ctype_get(cts, funcid)->info));

View File

@ -14,20 +14,54 @@
#define LJ_INTRINS_MAXREG 8 #define LJ_INTRINS_MAXREG 8
#endif #endif
/* The max number of dynamic registers in each reglist(in/out)*/
#define LJ_INTRINS_MAXDYNREG 2
typedef struct LJ_ALIGN(16) RegContext { typedef struct LJ_ALIGN(16) RegContext {
intptr_t gpr[LJ_INTRINS_MAXREG]; intptr_t gpr[LJ_INTRINS_MAXREG];
double fpr[LJ_INTRINS_MAXREG]; double fpr[LJ_INTRINS_MAXREG];
} RegContext; } RegContext;
/*
** Register mode of an intrinsic, stored in the low bits of its flags (see
** INTRINSFLAG_REGMODEMASK). Any mode other than DYNREG_FIXED means the
** intrinsic is a single opcode whose operand registers are chosen when the
** wrapper is built.
*/
typedef enum REGMODE {
/* No dynamic registers; the intrinsic only uses fixed registers */
DYNREG_FIXED = 0,
/* One input register and optionally one output */
DYNREG_ONE,
/* 1(R) register in, 1 out(M) which can be a memory address to store the value */
DYNREG_ONESTORE,
/* 2 in, 0 out; the first must always be treated as indirect */
DYNREG_TWOSTORE,
/* One input(M) register; the second operand field is part of the opcode */
DYNREG_OPEXT,
/* Two input registers and one output that shares its register with the second input */
DYNREG_INOUT,
/* Two input registers with M dynamic output register */
DYNREG_TWOIN,
/* Modes >= this value have their first two input registers swapped */
DYNREG_SWAPREGS = DYNREG_INOUT,
} REGMODE;
typedef enum INTRINSFLAGS { typedef enum INTRINSFLAGS {
INTRINSFLAG_REGMODEMASK = 7,
INTRINSFLAG_MEMORYSIDE = 0x08, /* has memory side effects so needs an IR memory barrier */ INTRINSFLAG_MEMORYSIDE = 0x08, /* has memory side effects so needs an IR memory barrier */
/* Intrinsic should be emitted as a naked function that is called */ /* Intrinsic should be emitted as a naked function that is called */
INTRINSFLAG_CALLED = 0x20, INTRINSFLAG_CALLED = 0x20,
/* MODRM should always be set as indirect mode */ /* MODRM should always be set as indirect mode */
INTRINSFLAG_INDIRECT = 0x40, INTRINSFLAG_INDIRECT = 0x40,
/* Don't fuse load into op */
INTRINSFLAG_NOFUSE = 0x80,
/* Force REX.w 64 bit size override bit to be set for x64 */
INTRINSFLAG_REXW = 0x100,
/* Append a user supplied prefixed before the opcode and its REX byte */
INTRINSFLAG_PREFIX = 0x200,
/* Opcode has an immediate byte that needs to be set at construction time */
INTRINSFLAG_IMMB = 0x400,
/* Opcode uses ymm registers */ /* Opcode uses ymm registers */
INTRINSFLAG_VEX256 = 0x4000, INTRINSFLAG_VEX256 = 0x4000,
/* Input parameters names explicitly declare input registers */
INTRINSFLAG_EXPLICTREGS = 0x10000,
/* Intrinsic is a template with no machine code set until instantiate at runtime with /* Intrinsic is a template with no machine code set until instantiate at runtime with
** user supplied code. ** user supplied code.
*/ */
@ -47,7 +81,19 @@ typedef struct AsmHeader {
uint32_t totalzs; uint32_t totalzs;
} AsmHeader; } AsmHeader;
/* Get the DYNREG_* register mode stored in the low bits of the flags */
#define intrin_regmode(intrins) ((intrins)->flags & INTRINSFLAG_REGMODEMASK)
/* Replace the register mode, leaving all other flag bits untouched */
#define intrin_setregmode(intrins, mode) \
  ((intrins)->flags = ((intrins)->flags & ~INTRINSFLAG_REGMODEMASK)|(mode))
/* The opcode extension value is kept in the otherwise unused out[3] slot */
#define intrin_getopextb(intrins) ((intrins)->out[3])
/*
** Wrapped in do/while(0) so that the assert and the store stay together when
** the macro is used as the body of an unbraced if/else.
*/
#define intrin_setopextb(intrins, opext) \
  do { \
    lua_assert((intrins)->outsz < 4); \
    (intrins)->out[3] = (opext); \
  } while (0)
#define intrin_oplen(intrins) ((-(int8_t)(intrins)->opcode)-1) #define intrin_oplen(intrins) ((-(int8_t)(intrins)->opcode)-1)
/* True if the intrinsic has a register mode and its first output register is dynamic */
#define intrin_dynrout(intrins) (intrin_regmode(intrins) && reg_isdyn((intrins)->out[0]))
/* Get the optional RegSet of registers modified by the intrinsic */
#define intrin_getmodrset(cts, intrins) \ #define intrin_getmodrset(cts, intrins) \
((ctype_get(cts, (intrins)->id)->size >> 16) ? \ ((ctype_get(cts, (intrins)->id)->size >> 16) ? \
ctype_get(cts, ctype_get(cts, (intrins)->id)->size >> 16)->size : 0) ctype_get(cts, ctype_get(cts, (intrins)->id)->size >> 16)->size : 0)
@ -91,6 +137,7 @@ CTypeID1 regkind_ct[16];
#define reg_isgpr(reg) (reg_rid(reg) < RID_MAX_GPR) #define reg_isgpr(reg) (reg_rid(reg) < RID_MAX_GPR)
#define reg_isfp(reg) (reg_rid(reg) >= RID_MIN_FPR) #define reg_isfp(reg) (reg_rid(reg) >= RID_MIN_FPR)
#define reg_isvec(reg) (reg_rid(reg) >= RID_MIN_FPR && reg_kind(reg) >= REGKIND_VEC_START) #define reg_isvec(reg) (reg_rid(reg) >= RID_MIN_FPR && reg_kind(reg) >= REGKIND_VEC_START)
#define reg_isdyn(reg) (reg_rid(reg) == RID_DYN_GPR || reg_rid(reg) == RID_DYN_FPR)
#define reg_irt(reg) (reg_isgpr(reg) ? rk_irtgpr(reg_kind(reg)) : rk_irtfpr(reg_kind(reg))) #define reg_irt(reg) (reg_isgpr(reg) ? rk_irtgpr(reg_kind(reg)) : rk_irtfpr(reg_kind(reg)))
#define rk_irtgpr(kind) ((IRType)regkind_it[(kind)]) #define rk_irtgpr(kind) ((IRType)regkind_it[(kind)])

View File

@ -74,6 +74,9 @@ enum {
RID_CONTEXT = RID_ECX, RID_CONTEXT = RID_ECX,
RID_OUTCONTEXT = RID_EDX, RID_OUTCONTEXT = RID_EDX,
#endif #endif
/* Placeholder register ids for dynamic register entries in intrinsics */
RID_DYN_FPR = RID_MAX_FPR-1,
RID_DYN_GPR = RID_SP,
}; };
/* -- Register sets ------------------------------------------------------- */ /* -- Register sets ------------------------------------------------------- */

View File

@ -353,6 +353,21 @@ context("__mcode", function()
assert_equal(ffi.C.multi1(1.1), 1) assert_equal(ffi.C.multi1(1.1), 1)
end) end)
it("bad dynamic registers", function()
  -- Every declaration below must be rejected by the cdef parser.
  local rejected = {
    -- No modrm mode specified although the parameters/return type imply
    -- dynamic registers
    [[int32_t dynerr1() __mcode("90");]],
    [[void dynerr2(int32_t a) __mcode("90");]],
    [[int32_t dynerr3(int32_t a) __mcode("90");]],
    -- A register mode is given but no dynamic registers are listed
    [[void dynerr4() __mcode("90m");]],
    [[void dynerr5() __mcode("90rM");]],
    [[void dynerr6() __mcode("90Mr");]],
    -- Needs 2 inputs, or 1 input and a return type
    [[void dynerr7(int32_t a) __mcode("90rM");]],
    -- Too many dynamic registers
    [[void dynerr8(int a, int b, int c) __mcode("90rR");]],
  }

  for idx = 1, #rejected do
    assert_cdeferr(rejected[idx])
  end
end)
it("bad ffi types mcode", function() it("bad ffi types mcode", function()
assert_cdeferr([[void testffi1(float a2, ...) __mcode("90");]]) assert_cdeferr([[void testffi1(float a2, ...) __mcode("90");]])
assert_cdeferr([[void testffi2(complex a2) __mcode("90");]]) assert_cdeferr([[void testffi2(complex a2) __mcode("90");]])
@ -382,6 +397,131 @@ context("__mcode", function()
assert_error(function() idiv(1, 2, 3, 4) end) assert_error(function() idiv(1, 2, 3, 4) end)
end) end)
it("output pointers", function()
  assert_cdef([[const char* addptr(const char* nptr, int32_t n) __mcode("03rM");]], "addptr")
  local s = "0123456789abcdefghijklmnopqrstvwxyz"

  -- Adding zero must hand back a pointer to the start of the string
  local ptr = ffi.C.addptr(s, 0)
  assert_equal(ptr, ffi.cast("const char*", s))
  assert_equal(ptr[0], string.byte(s))

  -- i: value passed to the intrinsic, sptr: pointer the intrinsic returned
  local function checker(i, sptr)
    -- Compare the printed forms explicitly. The two-argument form
    -- assert(a, b) only checks that `a` is truthy and uses `b` as the
    -- failure message, so it could never fail here.
    assert(tostring(sptr) == tostring(ptr+i))
    assert(sptr == ptr+i)
  end

  assert_jitchecker(checker, function(i)
    -- parentheses truncate the call to exactly one result
    return (ffi.C.addptr(s, i))
  end)
end)
it("signed/unsigned numbers", function()
  -- Same mcode string, three different signedness combinations in the C signature
  assert_cdef([[int32_t sub_signed(int32_t n, int32_t i) __mcode("2brM");]], "sub_signed")
  assert_cdef([[uint32_t sub_unsigned(uint32_t n, uint32_t i) __mcode("2brM");]], "sub_unsigned")
  assert_cdef([[uint32_t sub_signedun(int32_t n, int32_t i) __mcode("2brM");]], "sub_signedun")

  assert_equal(tonumber(ffi.C.sub_unsigned(3, 1)), 2)

  -- Unsigned result converted to a plain Lua number
  local function sub_u32_as_number(lhs, rhs)
    return (tonumber(ffi.C.sub_unsigned(lhs, rhs)))
  end

  assert_jit(2, sub_u32_as_number, 3, 1)
  assert_jit(2999999999, sub_u32_as_number, 3000000000, 1)
  -- wrap around
  assert_jit(4294967295, sub_u32_as_number, 300, 301)

  -- Unsigned result returned as-is, without tonumber()
  local function sub_u32_raw(lhs, rhs)
    return (ffi.C.sub_unsigned(lhs, rhs))
  end

  assert_jit(ffi.new("uint32_t", 2), sub_u32_raw, 3, 1)
  assert_jit(ffi.new("uint32_t", 2999999999), sub_u32_raw, 3000000000, 1)
  -- wrap around
  assert_jit(ffi.new("uint32_t", 4294967295), sub_u32_raw, 300, 301)

  -- Signed variant
  local function sub_i32(lhs, rhs)
    return (ffi.C.sub_signed(lhs, rhs))
  end

  assert_jit(-2, sub_i32, -1, 1)
  assert_noexit(3, sub_i32, -1, -4)
end)
it("op encode", function()
  -- Single-operand form: the input value is complemented
  assert_cdef([[int32_t not32(int32_t n) __mcode("F72m");]], "not32")

  local function complement(value)
    return (ffi.C.not32(value))
  end

  assert_jit(-1, complement, 0)
  assert_noexit(0, complement, -1)

  -- Same style of declaration, plus an immediate argument of 3
  assert_cdef([[int32_t add_imm3(int32_t n) __mcode("830mU", 3);]], "add_imm3")

  -- Hands back the pair (i+3, value produced by the intrinsic)
  local function expect_plus3(i, result)
    return i+3, result
  end

  assert_jitchecker(expect_plus3, function(i)
    return (ffi.C.add_imm3(i))
  end)
end)
it("prefix byte", function()
  assert_cdef([[void atomicadd(int32_t* nptr, int32_t n) __mcode("01mRIP", 0xF0);]], "atomicadd")

  local counter = ffi.new("int32_t[1]", 0)
  -- Running total maintained on the Lua side for comparison
  local expected = 0

  -- Returns nothing while the totals agree, otherwise the mismatching pair
  local function verify(i, jsum)
    expected = expected+i
    if jsum == expected then
      return
    end
    return jsum, expected
  end

  assert_jitchecker(verify, function(i)
    ffi.C.atomicadd(counter, i)
    return counter[0]
  end)
end)
-- 64 bit operand variant, only registered when running on x64
if ffi.arch == "x64" then
  it("prefix64", function()
    assert_cdef([[void atomicadd64(int64_t* nptr, int64_t n) __mcode("01mRIP", 0xF0);]], "atomicadd64")

    local counter = ffi.new("int64_t[1]", 0)
    -- Running total maintained on the Lua side for comparison
    local expected = 0

    local function verify(i, jsum)
      expected = expected+i
      assert(jsum == expected)
    end

    assert_jitchecker(verify, function(i)
      ffi.C.atomicadd64(counter, i)
      return counter[0]
    end)
  end)
end
it("prefix and imm byte", function()
  -- Declaration carries two extra values after the mcode string: 0xF0 and 0x01
  assert_cdef([[void atomicadd1(int32_t* nptr) __mcode("830mIUP", 0xF0, 0x01);]], "atomicadd1")

  local counter = ffi.new("int32_t[1]", 0)

  -- Returns nothing while the counter equals i, otherwise the mismatching pair
  local function verify(i, jsum)
    if jsum == i then
      return
    end
    return i, jsum
  end

  assert_jitchecker(verify, function(i)
    ffi.C.atomicadd1(counter)
    return counter[0]
  end)
end)
it("idiv(template)", function() it("idiv(template)", function()
assert_cdef([[void idivT(int32_t eax, int32_t ecx) __mcode("?E") __reglist(out, int32_t eax, int32_t edx)]]) assert_cdef([[void idivT(int32_t eax, int32_t ecx) __mcode("?E") __reglist(out, int32_t eax, int32_t edx)]])
--trying to create template intrinsic through C library should always fail --trying to create template intrinsic through C library should always fail
@ -416,6 +556,117 @@ context("__mcode", function()
assert_exit(10, test_idiv, 10, 5) assert_exit(10, test_idiv, 10, 5)
end) end)
-- Declares four void intrinsics taking a single pointer and calls them from one
-- function with several different pointer expressions.
it("prefetch", function()
assert_cdef([[void prefetch0(void* mem) __mcode("0F181mI")]], "prefetch0")
assert_cdef([[void prefetch1(void* mem) __mcode("0F182mI")]], "prefetch1")
assert_cdef([[void prefetch2(void* mem) __mcode("0F183mI")]], "prefetch2")
assert_cdef([[void prefetchnta(void* mem) __mcode("0F180mI")]], "prefetchnta")
local asm = ffi.C
local kmem = ffi.new("int[4]")
-- NOTE(review): `mem` is assigned twice (1, then `mem and <array>`); presumably
-- so it is not a single-assignment upvalue -- confirm intent.
local mem = 1
mem = mem and ffi.new("int[8]", 1, 2, 3, 4, 5, 6, 7, 8)
local function testprefetch(a, b, c)
-- `n` is computed but never read
local n = a+b
local ptr = mem+c
-- Arguments mix a computed pointer, bare arrays and array+offset expressions
asm.prefetch2(ptr)
asm.prefetch1(kmem)
asm.prefetch0(mem+a)
asm.prefetchnta(mem)
asm.prefetch0(kmem+a)
asm.prefetch1(kmem+b)
-- With c = 3: ptr[0] is 4 and ptr[3] is 7, giving the 11 expected below
return (ptr) ~= 0 and ptr[0] + ptr[3]
end
assert_jit(11, testprefetch, 1, 2, 3)
end)
-- Declares a three-input intrinsic with eax listed as its output register and
-- checks both the memory cell and the returned eax value.
it("cmpxchg", function()
assert_cdef([[void cmpxchg(int32_t* gpr32, int32_t gpr32, int32_t eax) __mcode("0FB1mRPEI", 0xF0) __reglist(out, int32_t eax);]], "cmpxchg")
local kptr32 = ffi.new("int32_t[1]", 0)
-- NOTE(review): `int4` is not defined inside this test; presumably a value
-- declared elsewhere in the file. Confirm this was not meant to be kptr32[0] = 0.
int4[0] = 0
-- i: loop value, n: first value returned by test_cmpxchg, eax: second value
local function checker(i, n, eax)
assert(n == i)
assert(kptr32[0] == i)
assert(eax == i-1)
end
-- Calls the intrinsic with (kptr32, i, i-1) and returns the cell content plus eax
local function test_cmpxchg(i)
local eax = ffi.C.cmpxchg(kptr32, i, i-1)
return kptr32[0], eax
end
assert_jitchecker(checker, test_cmpxchg)
--test not equal non swapping
local num, eax = test_cmpxchg(0)
assert_equal(eax, kptr32[0])
num, eax = test_cmpxchg(kptr32[0]+1)
assert_equal(eax, kptr32[0]-1)
end)
-- 64 bit variant, only registered when running on x64
if ffi.arch == "x64" then
  it("cmpxchg64", function()
    assert_cdef([[void cmpxchg64(int64_t* gpr64, int64_t gpr64, int64_t rax) __mcode("0FB1mRPEIX", 0xF0) __reglist(out, int64_t rax);]], "cmpxchg64")

    local cell = ffi.new("int64_t[1]", 0)

    -- i: loop value, stored: first value from exchange(), prev: second value
    local function verify(i, stored, prev)
      assert(stored == -i)
      assert(cell[0] == -i)
      assert(prev == -(i-1))
    end

    -- Calls the intrinsic with (cell, -i, -(i-1)); returns the cell content and rax
    local function exchange(i)
      local prev = ffi.C.cmpxchg64(cell, -i, -(i-1))
      return cell[0], prev
    end

    assert_jitchecker(verify, exchange, 2)

    -- not-equal case: no swap takes place
    local _, prev = exchange(0, 1)
    assert_equal(prev, cell[0])
  end)
end
-- Uses a five-input intrinsic on an 8-byte aligned pair of int32 values, with
-- eax and edx listed as output registers.
it("cmpxchg8b", function()
-- Struct holding the two 32 bit halves; declared with aligned(8)
ffi.cdef([[typedef struct int32pair {
int32_t i1;
int32_t i2;
} __attribute__((aligned(8))) int32pair;]])
assert_cdef([[void cmpxchg8b(void* gpr32, int32_t eax, int32_t edx, int32_t ebx, int32_t ecx) __mcode("0FC71mPEI", 0xf0)
__reglist(out, int32_t eax, int32_t edx);]], "cmpxchg8b")
-- Starting value of the pair is (1, -1)
local int32pair = ffi.new("int32pair")
int32pair.i1 = 1
int32pair.i2 = -1
-- Passes (i, -i) for eax/edx and (i+1, -(i+1)) for ebx/ecx; returns both
-- struct fields followed by the two output registers
local function test_cmpxchg8b(i)
local eax,edx = ffi.C.cmpxchg8b(int32pair, i, -i, i+1, -(i+1))
return int32pair.i1, int32pair.i2, eax, edx
end
-- After each call the pair must hold (i+1, -(i+1)) and the outputs must be (i, -i)
local function checker(i, n1, n2, eax, edx)
assert(n1 == i+1)
assert(n2 == -(i+1))
assert(int32pair.i1 == i+1)
assert(int32pair.i2 == -(i+1))
assert(eax == i)
assert(edx == -i)
end
assert_jitchecker(checker, test_cmpxchg8b)
end)
it("cpuid_brand", function() it("cpuid_brand", function()
assert_cdef([[void cpuid(int32_t eax, int32_t ecx) __mcode("0FA2_E") __reglist(out, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx);]], "cpuid") assert_cdef([[void cpuid(int32_t eax, int32_t ecx) __mcode("0FA2_E") __reglist(out, int32_t eax, int32_t ebx, int32_t ecx, int32_t edx);]], "cpuid")
@ -526,6 +777,204 @@ context("__reglist", function()
end) end)
end) end)
it("popcnt", function()
  assert_cdef([[int32_t popcnt(int32_t n) __mcode("f30fb8rM");]], "popcnt")

  -- Reassigned further down, so this stays a mutable local
  local bitcount = ffi.C.popcnt
  assert_equal(bitcount(7), 3)
  assert_equal(bitcount(1024), 1)
  assert_equal(bitcount(1023), 10)

  local function count_in_trace(value)
    return (bitcount(value))
  end

  assert_jit(10, count_in_trace, 1023)
  assert_noexit(32, count_in_trace, -1)
  assert_noexit(0, count_in_trace, 0)
  assert_noexit(1, count_in_trace, 1)

  -- Second declaration of the same opcode using the "rR" operand spec
  ffi.cdef([[int32_t popcntuf(int32_t n) __mcode("f30fb8rR");]])
  -- check unfused
  bitcount = ffi.C.popcntuf
  assert_equal(bitcount(7), 3)
  assert_equal(bitcount(1024), 1)
end)
it("addsd", function()
  assert_cdef([[double addsd(double n1, double n2) __mcode("F20F58rM");]], "addsd")
  local addsd = ffi.C.addsd

  -- Helpers are declared local so they do not leak into the global
  -- environment and collide with same-named helpers in other tests.
  local function test_addsd(n1, n2)
    return (addsd(n1, n2))
  end

  assert_equal(3, addsd(1, 2))
  assert_equal(0, addsd(0, 0))

  assert_jit(-3, test_addsd, -4.5, 1.5)
  assert_noexit(3, test_addsd, 4.5, -1.5)
  --check dual num exit
  assert_equal(5, test_addsd(3 , 2))

  --test same ref input
  local function test_addsd2(n)
    return (addsd(n, n))
  end

  assert_jit(3, test_addsd2, 1.5)
  assert_noexit(-3, test_addsd2, -1.5)
  --check dual num exit
  assert_equal(6, test_addsd2(3))

  --check unfused
  ffi.cdef([[double addsduf(double n1, double n2) __mcode("F20F58rR");]])
  addsd = ffi.C.addsduf

  assert_equal(3, addsd(1, 2))
  assert_equal(0, addsd(0, 0))
end)
it("addss", function()
  assert_cdef([[float addss(float n1, float n2) __mcode("F30F58rM");]], "addss")
  local addss = ffi.C.addss

  -- Helpers are declared local and named after this test's intrinsic, so they
  -- no longer overwrite the helpers of the "addsd" test through globals.
  local function test_addss(n1, n2)
    return (addss(n1, n2))
  end

  assert_equal(3, addss(1, 2))
  assert_equal(0, addss(0, 0))

  assert_jit(-3, test_addss, -4.5, 1.5)
  assert_noexit(3, test_addss, 4.5, -1.5)
  --check dual num exit
  assert_equal(5, test_addss(3, 2))

  --test same ref input
  local function test_addss2(n)
    return (addss(n, n))
  end

  assert_jit(-9, test_addss2, -4.5)
  assert_noexit(3, test_addss2, 1.5)

  --check unfused
  ffi.cdef[[float addssuf(float n1, float n2) __mcode("F30F58rR");]]
  addss = ffi.C.addssuf

  assert_equal(3, addss(1, 2))
  assert_equal(0, addss(0, 0))
end)
it("shufps", function()
  assert_cdef([[float4 shufps(float4 v1, float4 v2) __mcode("0FC6rMU", 0);]], "shufps")
  local shufps = ffi.C.shufps
  local v = ffi.new("float4", 1.5, 2.25, 3.125, 4.0625)

  -- Declared local so it does not leak into the global environment.
  -- It is not exercised by any assertion in this test.
  local function test_shufps(v1, v2)
    return (shufps(v1, v2))
  end

  -- Immediate 0: every output lane holds element 0 of the input
  local vout = shufps(v, v)
  assert_equal(vout[0], 1.5)
  assert_equal(vout[1], 1.5)
  assert_equal(vout[2], 1.5)
  assert_equal(vout[3], 1.5)

  -- Immediate 0x1b: the lanes come back in reverse order
  assert_cdef([[float4 shufpsrev(float4 v1, float4 v2) __mcode("0FC6rMU", 0x1b);]], "shufpsrev")

  local vrev = ffi.C.shufpsrev(v, v)
  assert_equal(vrev[0], 4.0625)
  assert_equal(vrev[1], 3.125)
  assert_equal(vrev[2], 2.25)
  assert_equal(vrev[3], 1.5)
end)
context("mixed register type opcodes", function()
it("cvttsd2s", function()
  assert_cdef([[int cvttsd2s(double n) __mcode("F20F2CrM");]], "cvttsd2s")
  local cvttsd2s = ffi.C.cvttsd2s

  -- Declared local so the helper does not leak into the global environment
  local function test_cvttsd2s(n)
    return (cvttsd2s(n))
  end

  assert_equal(0, cvttsd2s(-0))
  assert_equal(1, cvttsd2s(1))
  assert_equal(1, cvttsd2s(1.2))

  assert_jit(3, test_cvttsd2s, 3.3)
  assert_noexit(-1, test_cvttsd2s, -1.5)
  --check dual num exit
  assert_equal(5, test_cvttsd2s(5))

  --check unfused
  ffi.cdef([[int cvttsd2suf(double n) __mcode("F20F2CrR");]])
  cvttsd2s = ffi.C.cvttsd2suf

  assert_equal(0, cvttsd2s(-0))
  assert_equal(1, cvttsd2s(1))
  assert_equal(1, cvttsd2s(1.2))
end)
it("cvtsi2sd", function()
  assert_cdef([[double cvtsi2sd(int32_t n) __mcode("F20F2ArM");]], "cvtsi2sd")
  local cvtsi2sd = ffi.C.cvtsi2sd

  -- Declared local so the helper does not leak into the global environment.
  -- It reads `cvtsi2sd` as an upvalue, so the reassignment further down
  -- switches which intrinsic this helper calls.
  local function test_cvtsi2sd(n1, n2)
    return (cvtsi2sd(n1)+n2)
  end

  assert_equal(0.5, test_cvtsi2sd(0, 0.5))
  assert_equal(1.25, test_cvtsi2sd(1.0, 0.25))
  assert_equal(-1.5, test_cvtsi2sd(-2, 0.5))

  assert_jit(3.25, test_cvtsi2sd, 3, 0.25)
  assert_noexit(-1.5, test_cvtsi2sd, -2, 0.5)
  --check dual num exit
  assert_equal(11, test_cvtsi2sd(5, 6))

  --check unfused
  ffi.cdef([[double cvtsi2sduf(int32_t n) __mcode("F20F2ArR");]])
  cvtsi2sd = ffi.C.cvtsi2sduf

  assert_equal(0.5, test_cvtsi2sd(0, 0.5))
  assert_equal(1.25, test_cvtsi2sd(1.0, 0.25))
  assert_equal(-1.5, test_cvtsi2sd(-2, 0.5))
end)
it("pextrw", function()
  -- Sixteen bytes holding 1..16
  local bytes = ffi.new("byte16", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)

  -- Immediate 0: bytes 1 and 2 come back as 0x0201
  assert_cdef([[int32_t pextrw_0(byte16 v) __mcode("660FC5mRU", 0);]], "pextrw_0")
  assert_equal(0x0201, ffi.C.pextrw_0(bytes))

  -- Immediate 7: bytes 15 and 16 come back as 0x100f
  assert_cdef([[int32_t pextrw_7(byte16 v) __mcode("660FC5mRU", 7);]], "pextrw_7")
  assert_equal(0x100f, ffi.C.pextrw_7(bytes))
end)
it("pinsrw", function()
  local bit = require("bit")

  -- Immediate 0: the word is inserted into the lowest 16 bits
  assert_cdef([[int4 pinsrw_0(byte16 v, int32_t word) __mcode("660FC4rMU", 0);]], "pinsrw_0")
  local v = ffi.new("byte16", 0)
  local vout = ffi.C.pinsrw_0(v, 0xf0f1)
  assert_equal(0xf0f1, vout[0])

  -- Immediate 7: the word is inserted into the highest 16 bits, i.e. the upper
  -- half of the last 32 bit element. This previously called pinsrw_0 a second
  -- time, so pinsrw_7 was never exercised.
  assert_cdef([[int4 pinsrw_7(byte16 v, int32_t word) __mcode("660FC4rMU", 7);]], "pinsrw_7")
  vout = ffi.C.pinsrw_7(v, 0xf0f1)
  -- bit.lshift yields the signed 32 bit value of 0xf0f10000.
  -- NOTE(review): assumes int4 elements read back as signed int32 -- confirm.
  assert_equal(bit.lshift(0xf0f1, 16), vout[3])
end)
end)
end) end)